In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the labeled dataset from Excel
labeled_dataset_file = "../Persuation_labeling/labeled_dataset.xlsx"
labeled_dataset = pd.read_excel(labeled_dataset_file)

# Shuffle the dataset thoroughly
labeled_dataset = labeled_dataset.sample(frac=1).reset_index(drop=True)

# Extract features and target label
X_text_features = labeled_dataset[["subject", "sender", "sender_email", "body", "persuation"]]
y = labeled_dataset["phishing"].values

# Combine text features into a single column
X = X_text_features.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Tokenize the text data
max_words = 10000  # Maximum number of words to keep in the vocabulary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

# Convert text data to sequences of integers
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to make them of the same length
max_sequence_length = 100  # Maximum length of sequences
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length, padding="post")


# Split the dataset into training, validation, and testing sets
test_size = 0.2
validation_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=test_size, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_size, random_state=42)

# Create and train the Logistic Regression model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = logistic_model.predict(X_val)

# Evaluate the model on the validation set
accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_val, y_val_pred))

# Predict on the test set
y_test_pred = logistic_model.predict(X_test)

# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy_test)
print("Classification Report for Test Set:")
print(classification_report(y_test, y_test_pred))

Validation Accuracy: 0.8490530658166136
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.84      0.87      3123
           1       0.80      0.86      0.82      2210

    accuracy                           0.85      5333
   macro avg       0.84      0.85      0.85      5333
weighted avg       0.85      0.85      0.85      5333

Test Accuracy: 0.8498349834983498
Classification Report for Test Set:
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      3928
           1       0.80      0.85      0.82      2738

    accuracy                           0.85      6666
   macro avg       0.84      0.85      0.85      6666
weighted avg       0.85      0.85      0.85      6666



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [3]:
# Load the labeled dataset from Excel
labeled_dataset_file = "../Persuation_labeling/labeled_dataset.xlsx"
labeled_dataset = pd.read_excel(labeled_dataset_file)

# Shuffle the dataset thoroughly
labeled_dataset = labeled_dataset.sample(frac=1).reset_index(drop=True)

# Extract features and target label
X_text_features = labeled_dataset[["subject", "sender", "sender_email", "body", "persuation"]]
y = labeled_dataset["phishing"].values

# Combine text features into a single column
X = X_text_features.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Tokenize the text data
max_words = 10000  # Maximum number of words to keep in the vocabulary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

# Convert text data to sequences of integers
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to make them of the same length
max_sequence_length = 100  # Maximum length of sequences
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length, padding="post")

# Train the Word2Vec model
sentences = [text.split() for text in X]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Function to obtain the average word embeddings for each text
def get_average_word_embeddings(text):
    word_embeddings = []
    for word in text.split():
        if word in w2v_model.wv:
            word_embeddings.append(w2v_model.wv[word])
    if len(word_embeddings) == 0:
        return [0] * w2v_model.vector_size
    return sum(word_embeddings) / len(word_embeddings)

# Obtain lexical features using Word2Vec model
lexical_features = np.array([get_average_word_embeddings(text) for text in X])

# Split the dataset into training, validation, and testing sets
test_size = 0.2
validation_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(lexical_features, y, test_size=test_size, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_size, random_state=42)

# Create and train the Logistic Regression model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = logistic_model.predict(X_val)

# Evaluate the model on the validation set
accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_val, y_val_pred))

# Predict on the test set
y_test_pred = logistic_model.predict(X_test)

# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy_test)
print("Classification Report for Test Set:")
print(classification_report(y_test, y_test_pred))


Validation Accuracy: 0.9904369023063941
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3159
           1       0.99      0.99      0.99      2174

    accuracy                           0.99      5333
   macro avg       0.99      0.99      0.99      5333
weighted avg       0.99      0.99      0.99      5333

Test Accuracy: 0.9887488748874887
Classification Report for Test Set:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3922
           1       0.99      0.99      0.99      2744

    accuracy                           0.99      6666
   macro avg       0.99      0.99      0.99      6666
weighted avg       0.99      0.99      0.99      6666

