In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split
# from sklearnex import patch_sklearn

# our custom functions
from utils import remove_nan_questions

# patch_sklearn() 

In [None]:
# Seed handed to train_test_split as random_state so the split is repeatable.
SEED: int = 123
# Directory the fitted pipeline is loaded from and the ROC figure is written to.
MODELS_DIR: str = "model_artifacts"
# Folder that holds the Quora Question Pairs CSV file(s).
_path_folder_quora: str = "~/Datasets/QuoraQuestionPairs"

In [None]:
# Read the labelled question pairs from the dataset folder.
_train_csv = os.path.join(_path_folder_quora, "quora_train_data.csv")
_train_df = pd.read_csv(_train_csv)

# Features are the two question columns; the target is the duplicate flag.
_questions_raw = _train_df[["question1", "question2"]]
_labels_raw = _train_df["is_duplicate"]

# Project helper from utils; presumably drops pairs with a missing question
# and keeps the labels aligned — TODO confirm against utils.remove_nan_questions.
_questions_clean, _labels_clean = remove_nan_questions(_questions_raw, _labels_raw)

# Hold out 20% of the cleaned training CSV as the "test" split; SEED fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    _questions_clean,
    _labels_clean,
    test_size=0.2,
    random_state=SEED,
)

In [None]:
# Load the already-fitted model (named a pipeline by its artifact file) from MODELS_DIR.
# `joblib` is imported in the notebook's top import cell.
# NOTE(security): joblib.load relies on pickle and can execute arbitrary code
# while loading — only load artifacts that come from a trusted source.
fitted_pipe = joblib.load(f'{MODELS_DIR}/fitted_pipeline.joblib')

In [None]:
def _roc_points(model, features, labels):
    """Compute ROC curve coordinates and the area under that curve.

    The ranking score is column 1 of ``model.predict_log_proba(features)``
    (presumably the positive / "duplicate" class — TODO confirm class order).

    Args:
        model: fitted estimator exposing ``predict_log_proba``.
        features: inputs to score.
        labels: true binary labels aligned with ``features``.

    Returns:
        Tuple ``(fpr, tpr, area)`` where ``fpr``/``tpr`` come from ``roc_curve``
        and ``area`` is ``auc(fpr, tpr)``.
    """
    log_scores = model.predict_log_proba(features)[:, 1]
    fpr, tpr, _thresholds = roc_curve(labels, log_scores)
    return fpr, tpr, auc(fpr, tpr)


# Hard class predictions, used by the classification reports below.
y_pred_train = fitted_pipe.predict(x_train)
y_pred_test = fitted_pipe.predict(x_test)

# ROC curve points and AUC for each split (reused by the plotting cell).
fpr_train, tpr_train, auc_roc_train = _roc_points(fitted_pipe, x_train, y_train)
fpr_test, tpr_test, auc_roc_test = _roc_points(fitted_pipe, x_test, y_test)

print("TRAINING results:\n", classification_report(y_train, y_pred_train))
print("TESTING results:\n", classification_report(y_test, y_pred_test))

print("Training AUC:", auc_roc_train)
print("Testing AUC:", auc_roc_test)

In [None]:
# Draw the train and test ROC curves on one explicitly created figure so the
# saved image does not depend on whatever pyplot's "current figure" happens to be.
fig, ax = plt.subplots()
ax.plot(fpr_train, tpr_train,
        label=f'Train (AUC = {round(auc_roc_train, 3)})')
ax.plot(fpr_test, tpr_test,
        label=f'Test (AUC = {round(auc_roc_test, 3)})')
# Unlabelled diagonal: the ROC of a random classifier, as a visual baseline
# (left out of the legend because it has no label).
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
# Axis labels and title so the figure is readable on its own once saved.
ax.set(xlabel='False positive rate',
       ylabel='True positive rate',
       title='ROC curve of the fitted pipeline')
ax.legend()
fig.savefig(f'{MODELS_DIR}/fitted_pipe_roc.png', dpi=250)
plt.show()
# Release the figure explicitly so re-running the cell does not accumulate figures.
plt.close(fig)