In [None]:
# Library functions
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
import multiprocessing
from numpy import round
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import os
import joblib
from tqdm import tqdm
# from sklearnex import patch_sklearn

# Our functions
from utils import TextPreprocessor, FeatureGenerator, remove_nan_questions, get_param_grid

# patch_sklearn()  # to speed up scikit-learn

In [None]:
# --- Notebook configuration ---------------------------------------------
# Seed passed to train_test_split and get_param_grid further down.
SEED = 123
# Folder that receives the dumped models/pipeline and the ROC figure.
MODELS_DIR = "model_artifacts"
# Location of the Quora Question Pairs dataset folder.
_path_folder_quora = "~/Datasets/QuoraQuestionPairs"

In [None]:
# Make sure the artifact folder is present before any cell writes into it,
# and report which of the two cases applied.
if os.path.exists(MODELS_DIR):
    print(f"Folder '{MODELS_DIR}' already exists.")
else:
    os.makedirs(MODELS_DIR)
    print(f"Folder '{MODELS_DIR}' created successfully.")

In [None]:
# Load the training data from the dataset folder configured in
# `_path_folder_quora` rather than from a machine-specific absolute path, so
# the notebook runs anywhere once that single setting points at the data.
# NOTE(review): assumes quora_train_data.csv sits directly inside that
# folder — adjust `_path_folder_quora` if the file lives elsewhere.
_train_csv_path = os.path.join(
    os.path.expanduser(_path_folder_quora), "quora_train_data.csv")
_train_df = pd.read_csv(_train_csv_path)

# Model inputs are the two question columns; the target is `is_duplicate`.
x_train = _train_df.loc[:, ["question1", "question2"]]
y_train = _train_df.loc[:, "is_duplicate"]

# Project helper imported from `utils`; presumably drops pairs with a missing
# question while keeping features and labels aligned — verify against utils.
x_train, y_train = remove_nan_questions(x_train, y_train)

In [None]:
# Hold out 20% of the question pairs for evaluation; the remaining 80% is
# what the pipeline is fitted on in the cells that follow.
# NOTE: x_train / y_train are overwritten here, so re-running this cell on
# its own splits the already reduced training set a second time.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=SEED,
)

In [None]:
# Selects which training path the following cells take:
#   False -> fit a single AdaBoost pipeline
#   True  -> run the cross-validated search over every candidate model
GRID_SEARCH: bool = False

In [None]:
# Single-model mode: only a small AdaBoost ensemble is trained.
# The estimator is seeded with the notebook-wide SEED (already used for the
# train/test split) so repeated runs fit the same model.
if not GRID_SEARCH:
    models = {
        "AdaBoostClassifier": AdaBoostClassifier(n_estimators=20, random_state=SEED),
    }

In [None]:
# Single-model mode: chain text preprocessing, feature generation and the
# AdaBoost classifier, then fit the whole pipeline on the training split.
if not GRID_SEARCH:
    _single_model_steps = [
        ('preprocessor', TextPreprocessor()),
        ('generator', FeatureGenerator(exts=('cv_2w', 'tf_idf_2w'),
                                       aggs=('stack', 'absolute'))),
        ('classifier', models["AdaBoostClassifier"]),
    ]
    # verbose=True makes the pipeline print per-step progress while fitting.
    pipe = Pipeline(_single_model_steps, verbose=True)
    pipe.fit(x_train, y_train)

In [None]:
# Grid-search mode: candidate classifiers keyed by the name that is later
# passed to get_param_grid and used in the dumped file names.
if GRID_SEARCH:
    models = {
        "AdaBoostClassifier": AdaBoostClassifier(),
        "RandomForestClassifier": RandomForestClassifier(),
        "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
        "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
        "BernoulliNB": BernoulliNB(),
        "GaussianNB": GaussianNB(),
        "KNeighborsClassifier": KNeighborsClassifier(),
        "SVC": SVC(),
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "GradientBoostingClassifier": GradientBoostingClassifier(),
    }

    # The xgboost / catboost imports at the top of the notebook are commented
    # out, so referencing these classes unconditionally raised NameError.
    # They are optional extras: import them here and add them only when the
    # package is actually installed.
    try:
        from xgboost import XGBClassifier
    except ImportError:
        print("xgboost is not installed; skipping XGBClassifier.")
    else:
        models["XGBClassifier"] = XGBClassifier(
            n_jobs=multiprocessing.cpu_count() - 1)

    try:
        from catboost import CatBoostClassifier
    except ImportError:
        print("catboost is not installed; skipping CatBoostClassifier.")
    else:
        models["CatBoostClassifier"] = CatBoostClassifier(silent=True)

In [None]:
# Grid-search mode: cross-validate every candidate model inside the same
# preprocessing/feature pipeline, keep the best estimator and CV score per
# model, and dump each fitted search object to MODELS_DIR.
if GRID_SEARCH:
    fitted_models = {}
    scores = {}

    # Leave one core free, but never drop to 0 workers: joblib rejects
    # n_jobs=0, which cpu_count() - 1 yields on a single-core machine.
    n_search_jobs = max(1, multiprocessing.cpu_count() - 1)

    for name, model in tqdm(models.items()):
        # define pipeline given a model
        pipe = Pipeline(
            [('preprocessor', TextPreprocessor()),
             ('generator', FeatureGenerator(exts=('cv_2w', 'tf_idf_2w'), aggs=('stack', 'absolute'))),
             ('classifier', model)],
            verbose=True)

        # get grid of parameters to search; the previous `grid = {}` override
        # discarded this grid, so only the default settings were evaluated.
        # NOTE(review): assumes get_param_grid returns a param_grid whose keys
        # address the pipeline steps (e.g. 'classifier__...') — confirm in utils.
        grid = get_param_grid(name, SEED)
        grid_search = GridSearchCV(
            pipe,
            param_grid=grid,
            scoring="roc_auc",
            cv=2,
            verbose=10,
            n_jobs=n_search_jobs,
            error_score="raise",  # surface fitting errors instead of scoring them as NaN
        )

        # fit grid search with pipeline and grid
        grid_search.fit(x_train, y_train)

        # keep the refitted best pipeline and its mean CV ROC-AUC
        fitted_models[name] = grid_search.best_estimator_
        scores[name] = grid_search.best_score_

        # NOTE(review): the ".pk1" extension looks like a typo for ".pkl"; it
        # is kept unchanged in case other code loads these files by name.
        joblib.dump(grid_search, os.path.join(MODELS_DIR, f"fitted_{name}.pk1"))

In [None]:
# Choose the pipeline that is evaluated and persisted below.
if GRID_SEARCH:
    # Pick the model with the highest CV score. A bare max(scores) compares
    # the dictionary keys, i.e. it returns the alphabetically last model name
    # regardless of its score.
    best_model_name = max(scores, key=scores.get)
    best_model = fitted_models[best_model_name]
    print(f"Best model found in the grid search is {best_model_name}, with a CV score of {scores[best_model_name]:.4f}")
    fitted_pipe = best_model
else:
    # Single-model mode: the pipeline fitted earlier is the final one.
    fitted_pipe = pipe

# Persist the selected pipeline alongside the other artifacts.
joblib.dump(fitted_pipe, f'{MODELS_DIR}/fitted_pipeline.joblib')

In [None]:
def _positive_class_scores(model, features):
    """Return a ranking score for the positive class of each row.

    Uses ``predict_proba`` when the fitted model exposes it and falls back to
    ``decision_function`` otherwise (e.g. an SVC without probability
    estimates). Log-probabilities are deliberately avoided: they turn into
    ``-inf`` whenever a probability is exactly 0, which is not valid input
    for ``roc_curve``, while the ranking (and hence the AUC) is the same.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(features)[:, 1]
    return model.decision_function(features)


# Hard class predictions for the classification reports.
y_pred_train = fitted_pipe.predict(x_train)
y_pred_test = fitted_pipe.predict(x_test)

# ROC curve and area under it, on the training and the held-out split.
fpr_train, tpr_train, _ = roc_curve(
    y_train, _positive_class_scores(fitted_pipe, x_train))
auc_roc_train = auc(fpr_train, tpr_train)
fpr_test, tpr_test, _ = roc_curve(
    y_test, _positive_class_scores(fitted_pipe, x_test))
auc_roc_test = auc(fpr_test, tpr_test)

print("TRAINING results:\n", classification_report(y_train, y_pred_train))
print("TESTING results:\n", classification_report(y_test, y_pred_test))

print("Training AUC:", auc_roc_train)
print("Testing AUC:", auc_roc_test)

In [None]:
# Plot the train and test ROC curves on one labelled figure, save it next to
# the model artifacts, then display and release it.
fig, ax = plt.subplots()
# `round` here is numpy's round (imported at the top of the notebook).
ax.plot(fpr_train, tpr_train,
        label=f'Train (AUC = {round(auc_roc_train, 3)})')
ax.plot(fpr_test, tpr_test,
        label=f'Test (AUC = {round(auc_roc_test, 3)})')
# Axis labels and a title so the saved PNG can be read on its own.
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("ROC curve of the fitted pipeline")
ax.legend()
fig.savefig(f'{MODELS_DIR}/fitted_pipe_roc.png', dpi=250)
plt.show()
plt.close(fig)