# Models
In this part of the project we will proceed to train the training data and validate the predictions obtained in order to select the correct model for the test data.

# Libraries

In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
# import plotly.express as px
import category_encoders as ce
# from sklearn.preprocessing import OneHotEncoder
import functions as function
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.linear_model import LogisticRegressionCV 
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc, \
                            silhouette_score, recall_score, precision_score, make_scorer, \
                            roc_auc_score, f1_score, precision_recall_curve
from sklearn.compose import ColumnTransformer
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from catboost import CatBoostClassifier 


# Funciones

In [3]:
def conf_matrix (model):
    titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
    for title, normalize in titles_options:
        disp = plot_confusion_matrix(model, X_validation, y_validation,
                                 # display_labels=ytest,
                                 cmap=plt.cm.Blues)
                                 #normalize=normalize)
        disp.ax_.set_title(title)

        print(title)
        print(disp.confusion_matrix)

plt.show()

def plot_roc_curve (model):
    prob_predictions = model.predict_proba(X_validation)
    # keep probabilities for the positive outcome only
    yhat = prob_predictions[:, 1]
    # calculate roc curves
    fpr, tpr, thresholds = roc_curve(y_validation, yhat)
    # plot the roc curve for the model
    plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, marker='.', label='Random Forest')
    # axis labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    # show the plot
    plt.show()
    

def evaluate_model(ytest, ypred, ypred_proba = None):
    if ypred_proba is not None:
        print('ROC-AUC score of the model: {}'.format(roc_auc_score(ytest, ypred_proba[:, 1])))
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))

# def upload (name):
#     with open('../models/', +name.pickle', 'rb') as f:
#         name = pickle.load(f)

# def save (name):
#     with open ('../models/name.pickle','wb') as f:
#         pickle.dump(name,f)

# Read data

In [4]:
#Read the traing data
pd_fraud = pd.read_parquet('../data/training_data.parquet')

# Process the data

In [5]:
#Defining the steps in the numerical pipeline 
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

#Defining the steps in the categorical pipeline 
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

#Numerical features to pass down the numerical pipeline 
numeric_features = pd_fraud.select_dtypes(include=['int64', 'float64']).drop(['isFraud'], axis=1).columns
#Categrical features to pass down the categorical pipeline 
categorical_features = pd_fraud.select_dtypes(include=['object']).columns

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [7]:
with open ('../models/preprocessor.pickle','wb') as f:
    pickle.dump(preprocessor,f)

In [8]:
with open('../models/preprocessor.pickle', 'rb') as f:
    preprocessor = pickle.load(f)

In [9]:
#Separate the training data in training and validation
X_train, X_validation, y_train, y_validation = train_test_split(pd_fraud, pd_fraud['isFraud'], test_size=0.15, random_state=1)

In [11]:
X_train = X_train.drop(['isFraud'], axis=1)
X_validation = X_validation.drop(['isFraud'], axis=1)

Unnamed: 0,step,type_1,type_2,type_3,type_4,type_5,amount,device_1,device_2,device_3,...,zone_1,zone_2,zone_3,zone_4,user_number,user_connections,security_alert_1,security_alert_2,oldbalanceDest,newbalanceDest
618346,15,1,0,0,0,0,6439.0,0,0,1,...,0,1,0,0,2305,1,0,1,0.0,0.0
483884,18,1,0,0,0,0,7387.21,1,0,0,...,1,0,0,0,887,7,0,1,0.0,0.0
772752,16,1,0,0,0,0,3793.31,0,1,0,...,1,0,0,0,4716,2,0,1,0.0,0.0
1074909,38,0,0,1,0,0,1115675.0,0,0,0,...,0,0,0,0,881,3,0,1,1083615.0,2199289.0
581555,13,1,0,0,0,0,12324.96,0,0,1,...,0,1,0,0,82,10,0,1,0.0,0.0


# Base model

In [None]:
#stratify=y

In [19]:
model_base = Pipeline(steps=[
    ('preprocesador', preprocessor), 
    ('clasificador', DummyClassifier(strategy='most_frequent',random_state=1))])

In [20]:
model_base.fit(X_train, y_train)

In [21]:
with open('../models/base.pickle', 'wb') as f:
    pickle.dump(model_base, f)

In [22]:
with open('../models/base.pickle', 'rb') as f:
    model_base = pickle.load(f)

### Results

In [24]:
y_pred_base = model_base.predict(X_validation)
y_pred_proba_base = model_base.predict_proba(X_validation)
evaluate_model(y_validation, y_pred_base, y_pred_proba_base)

ROC-AUC score of the model: 0.5
Accuracy of the model: 0.49997215371151243



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report: 
              precision    recall  f1-score   support

           0       0.50      1.00      0.67    125683
           1       0.00      0.00      0.00    125697

    accuracy                           0.50    251380
   macro avg       0.25      0.50      0.33    251380
weighted avg       0.25      0.50      0.33    251380


Confusion matrix: 
[[125683      0]
 [125697      0]]



# Lasso

In [18]:
model_lasso = Pipeline(steps=[
    ('preprocesador', preprocessor), 
    ('clasificador', LogisticRegression(C=1.5,random_state=1, n_jobs=2, penalty='l1', solver='liblinear', tol= 0.0005))])

In [19]:
model_lasso.fit(X_train, y_train)



In [20]:
with open('../models/model_lasso.pickle', 'wb') as f:
    pickle.dump(model_lasso, f)

In [21]:
with open('../models/model_lasso.pickle', 'rb') as f:
    model_lasso = pickle.load(f)

### Results

In [22]:
y_pred_lasso = model_lasso.predict(X_validation)
y_pred_proba_lasso = model_lasso.predict_proba(X_validation)
evaluate_model(y_validation, y_pred_lasso, y_pred_proba_lasso)

ROC-AUC score of the model: 0.9992950277957097
Accuracy of the model: 0.9910454292306469

Classification report: 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    125683
           1       1.00      0.99      0.99    125697

    accuracy                           0.99    251380
   macro avg       0.99      0.99      0.99    251380
weighted avg       0.99      0.99      0.99    251380


Confusion matrix: 
[[125289    394]
 [  1857 123840]]



In [None]:
# from sklearn.linear_model import Lasso, LassoCV: 
#     %%time 
    
# lassocv = LassoCV(alphas = None, cv = 10, max_iter = 500, normalize = True, random_state = 50)                
# # no predefinimos los alphas, queremos que los seleccione el propio modelo cv = 10, 
# # indicamos que el cross validation realice 10K max_iter = 500, # número máximo de iteraciones = 500 normalize = True, # normalizamos las variables random_state = 50) 
                  
# # entrenamos el modelo de cross validation con toda la muestra: 
# lassocv.fit(predictiveVariables, target ,) # valores de alpha seleccionados por el modelo: 
# lassocv.alpha_ 
# # definimos: lasso function, en el que el alpha será el calculado anteriormente 
# model_lasso = Lasso(alpha=lassocv.alpha_) 
# # entrenamos el modelo lasso con toda la muestra 
# model_lasso.fit(predictiveVariables, target) 
# # coefficientes lasso de las variables predictoras: 
# lasso_coefficients = pd.DataFrame(model_lasso.coef_, set(predictiveVariables), columns = ['Coefficients']) 
# # término independiente del modelo Lasso 
# lasso_coefficients.loc['Intercept'] = model_lasso.intercept_ lasso_coefficients 
# # data frame de los coeficientes 
# Lasso df_lasso = pd.DataFrame(lasso_coefficients) 
# # ordenamos las variables en función de sus coeficientes 
# df_lasso_ordered = df_lasso.sort_values(by = "Coefficients")

# Random Forest

In [39]:
model_rf = Pipeline(steps=[
    ('preprocesador', preprocessor), 
    
    ('clasificador', RandomForestClassifier(n_jobs=-1, random_state=0))])

In [40]:
model_rf.fit(X_train, y_train)

In [41]:
with open ('../models/random_forest.pickle','wb') as f:
    pickle.dump(model_rf,f)

In [42]:
with open('../models/random_forest.pickle', 'rb') as f:
    model_rf = pickle.load(f)

In [43]:
y_pred_rf = model_rf.predict(X_validation)
y_pred_proba_rf = model_rf.predict_proba(X_validation)

### Results

In [44]:
evaluate_model(y_validation, y_pred_rf,y_pred_proba_rf)

ROC-AUC score of the model: 0.9999957854094864
Accuracy of the model: 0.9999244172169623

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    125683
           1       1.00      1.00      1.00    125697

    accuracy                           1.00    251380
   macro avg       1.00      1.00      1.00    251380
weighted avg       1.00      1.00      1.00    251380


Confusion matrix: 
[[125682      1]
 [    18 125679]]



# GLM

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import glm
from statsmodels.sm_exceptions import ConvergenceWarning

In [None]:
model_glm = sm.glm(X_train, y_train)

# SVM

In [None]:
from sklearn.svm import SVC


In [None]:
model_svm = Pipeline(steps=[
    ('preprocesador', preprocessor), 
    
    ('clasificador', SVC(n_jobs=-1, random_state=1))])

In [None]:
model_svm.fit(X_train, y_train)

In [None]:
with open('../models/model_svm.pickle', 'wb') as f:
    pickle.dump(model_svm, f)

In [None]:
with open('../models/model_svm.pickle', 'rb') as f:
    model_svm = pickle.load(f)

In [None]:
y_pred_svm = model_svm.predict(X_validation)
y_pred_proba_svm = model_svm.predict_proba(X_validation)

### Results

In [None]:
evaluate_model(y_validation, y_pred_svm,y_pred_proba_svm)

### Results



In [None]:
evaluate_model

# LightGBM

In [32]:
import lightgbm as lgb

In [33]:
model_lgbm = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', lgb.LGBMClassifier(n_jobs=-1, random_state=0))])

In [34]:
model_lgbm.fit(X_train, y_train)

In [35]:
with open('../models/LightGBM.pickle', 'wb') as f:
    pickle.dump(model_lgbm, f)

In [36]:
with open('../models/LightGBM.pickle', 'rb') as f:
    model_lgbm = pickle.load(f)

### Results

In [38]:
y_pred_lgbm = model_lgbm.predict(X_validation)
y_pred_proba_lgbm = model_lgbm.predict_proba(X_validation)
evaluate_model(y_validation, y_pred_lgbm, y_pred_proba_lgbm)

ROC-AUC score of the model: 0.9999975741196292
Accuracy of the model: 0.999733471238762

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    125683
           1       1.00      1.00      1.00    125697

    accuracy                           1.00    251380
   macro avg       1.00      1.00      1.00    251380
weighted avg       1.00      1.00      1.00    251380


Confusion matrix: 
[[125636     47]
 [    20 125677]]



# XGBoost

In [48]:
from xgboost import XGBClassifier
# from sklearn.model_selection import GrideSearchCV

In [49]:
model_xgb = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', XGBClassifier(n_jobs=-1, random_state=0))])

In [50]:
model_xgb.fit(X_train, y_train)





In [52]:
with open('../models/XGBoost.pickle', 'wb') as f:
    pickle.dump(model_xgb, f)

In [53]:
with open('../models/XGBoost.pickle', 'rb') as f:
    model_xgb = pickle.load(f)

### Resuts

In [54]:
y_pred_xgb = model_xgb.predict(X_validation)
y_pred_proba_xgb = model_xgb.predict_proba(X_validation)
evaluate_model(y_validation, y_pred_xgb, y_pred_proba_xgb)

ROC-AUC score of the model: 0.9999992040752588
Accuracy of the model: 0.9998965709284748

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    125683
           1       1.00      1.00      1.00    125697

    accuracy                           1.00    251380
   macro avg       1.00      1.00      1.00    251380
weighted avg       1.00      1.00      1.00    251380


Confusion matrix: 
[[125672     11]
 [    15 125682]]



# CatBoost

In [57]:
model_cat = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', CatBoostClassifier(random_state=0, task_type="GPU"))])

In [59]:
model_cat.fit(X_train, y_train)

CatBoostError: C:/Program Files (x86)/Go Agent/pipelines/BuildMaster/catboost.git/catboost/cuda/cuda_lib/cuda_manager.cpp:201: Condition violated: `State == nullptr'

In [None]:
with open('../models/CatBoost.pickle', 'wb') as f:
    pickle.dump(model_cat, f)

In [None]:
with open('../models/CatBoost.pickle', 'rb') as f:
    model_cat = pickle.load(f)

In [None]:
ypred_cat = model_cat.predict(X_validation)
ypred_proba_cat = model_cat.predict_proba(X_validation)
evaluate_model(y_validation,ypred_cat,ypred_proba_cat)

In [None]:
evaluate_model(y_train_cat, y_pred_cat,ypred_proba_cat)