In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from url_preprocessing.url_preprocessor import URLPreprocessor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import joblib
import warnings
# Silence every warning for the rest of the session (no category or module
# filter is given, so the 'ignore' action applies to all of them).
# NOTE(review): this presumably also hides convergence / deprecation warnings
# raised while the grid searches run — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Load the labelled dataset and shuffle its rows before splitting.
# The shuffle is seeded: an unseeded `sample(frac=1)` yields a different row
# order on every run, which makes the later (seeded) train/test split differ
# between runs as well. With a fixed seed, "Restart & Run All" is repeatable.
df = pd.read_csv('./data/text_model_data.csv')
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
X = df['url']     # feature column fed to the pipelines
y = df['status']  # target column

In [4]:
# Hold out 20% of the rows for testing. The fixed seed makes the split
# repeatable for a given row order of X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Hyperparameter search spaces, keyed by the display name of each classifier.
# The same names are used as keys when the pipelines are built.
param_grids = {
    "Logistic Regression": dict(
        solver=['liblinear'],
        penalty=['l1', 'l2'],
        C=[0.01, 0.1, 1.0, 10, 20],
    ),
    "Multinomial Naive Bayes": dict(
        alpha=[0.1, 1.0, 10],
    ),
}

In [6]:
def train_evaluate_model(name, pipeline, X_train, X_test, y_train, y_test):
    """Fit a pipeline, report its tuned estimator, and score it on both splits.

    Args:
        name: Display name used in the printed header.
        pipeline: Pipeline whose 'classifier' step exposes ``best_estimator_``
            after fitting (e.g. a grid search object).
        X_train, y_train: Training inputs and labels; the pipeline is fitted on these.
        X_test, y_test: Held-out inputs and labels.

    Returns:
        An 8-tuple ordered as
        (accuracy_train, accuracy_test, f1_train, f1_test,
         recall_train, recall_test, precision_train, precision_test).
        Every metric is computed with its default arguments.
    """
    pipeline.fit(X_train, y_train)

    # Pull out the refitted winner of the search and dump its full parameter set.
    tuned_estimator = pipeline.named_steps['classifier'].best_estimator_
    print(f"GridSearchCV results for {name}:")
    print("Best parameters found:")
    print(tuned_estimator.get_params())

    train_pred = pipeline.predict(X_train)
    test_pred = pipeline.predict(X_test)

    # The order of this tuple defines the order of the returned scores.
    scorers = (
        metrics.accuracy_score,
        metrics.f1_score,
        metrics.recall_score,
        metrics.precision_score,
    )
    scores = []
    for scorer in scorers:
        scores.append(scorer(y_train, train_pred))
        scores.append(scorer(y_test, test_pred))
    return tuple(scores)

In [7]:
def store_results(ML_Model, accuracy_train, accuracy_test, f1_score_train, f1_score_test, recall_train, recall_test, precision_train, precision_test):
    """Bundle one model's scores into a single record.

    Args:
        ML_Model: Name of the model the scores belong to.
        accuracy_train, accuracy_test: Accuracy on the train / test split.
        f1_score_train, f1_score_test: F1 score on the train / test split.
        recall_train, recall_test: Recall on the train / test split.
        precision_train, precision_test: Precision on the train / test split.

    Returns:
        A dict mapping column labels to the given values, in the order listed
        below (model name first, then train/test pairs per metric).
    """
    columns = (
        "ML Model",
        "Accuracy (Train)",
        "Accuracy (Test)",
        "F1 Score (Train)",
        "F1 Score (Test)",
        "Recall (Train)",
        "Recall (Test)",
        "Precision (Train)",
        "Precision (Test)",
    )
    values = (
        ML_Model,
        accuracy_train,
        accuracy_test,
        f1_score_train,
        f1_score_test,
        recall_train,
        recall_test,
        precision_train,
        precision_test,
    )
    return dict(zip(columns, values))

In [10]:
def main():
    """Tune, evaluate and compare the two URL text classifiers.

    For each classifier a pipeline (preprocessor -> bag-of-n-grams vectorizer ->
    grid search) is trained and scored via ``train_evaluate_model`` on the
    notebook-level ``X_train`` / ``X_test`` / ``y_train`` / ``y_test``. A
    classification report is printed per model, the pipeline with the highest
    test F1 is written to 'text_model.joblib', and a table of all scores,
    sorted by test accuracy and then test F1 (descending), is printed.
    """

    def build_pipeline(estimator_cls, grid_key, **search_options):
        # Both pipelines share the same preprocessing and vectorizer settings;
        # only the estimator and the grid-search options differ.
        return Pipeline([
            ('preprocessor', URLPreprocessor()),
            ('vectorizer', CountVectorizer(tokenizer=None, stop_words=None, lowercase=False, ngram_range=(1, 2))),
            ('classifier', GridSearchCV(estimator_cls(), param_grids[grid_key], cv=5, scoring='f1', **search_options)),
        ])

    classifiers = {
        "Logistic Regression": build_pipeline(LogisticRegression, "Logistic Regression"),
        "Multinomial Naive Bayes": build_pipeline(MultinomialNB, "Multinomial Naive Bayes", n_jobs=-1),
    }

    # Running winner; starting from 0 with a strict comparison means a model
    # is only kept if its test F1 is greater than zero.
    best_model = None
    best_f1_score = 0
    best_model_name = ""

    results = []
    for name, model in classifiers.items():
        scores = train_evaluate_model(name, model, X_train, X_test, y_train, y_test)
        f1_test = scores[3]  # position of the test F1 in the returned tuple
        if f1_test > best_f1_score:
            best_model_name = name
            best_model = model
            best_f1_score = f1_test
        results.append(store_results(name, *scores))

        print(f"Classification Report for {name}:")
        report = metrics.classification_report(
            y_test,
            model.predict(X_test),
            target_names=('Phishing', 'Legitimate'),
        )
        print(report)

    if best_model is not None:
        joblib.dump(best_model, 'text_model.joblib')
        print(f"Best performing model saved as 'text_model.joblib'. Classifier: {best_model_name}, with f1-Score: {best_f1_score}")

    results_df = pd.DataFrame(results)
    ranked = results_df.sort_values(by=['Accuracy (Test)', 'F1 Score (Test)'], ascending=False)
    sorted_results = ranked.reset_index(drop=True)
    print(sorted_results)


if __name__ == "__main__":
    main()

GridSearchCV results for Logistic Regression:
Best parameters found:
{'C': 20, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

    Phishing       0.92      0.97      0.94     78899
  Legitimate       0.97      0.92      0.95     85503

    accuracy                           0.94    164402
   macro avg       0.95      0.95      0.94    164402
weighted avg       0.95      0.94      0.94    164402

GridSearchCV results for Multinomial Naive Bayes:
Best parameters found:
{'alpha': 0.1, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}
Classification Report for Multinomial Naive Bayes:
              precision    recall  f1-score   support

    Phishing       0