# Setup
Also included is the Requirements.txt file to install the libraries in the virtual environment.

In [1]:
import logging

# Create a custom logger
logger = logging.getLogger(__name__)

# Set level of logger
logger.setLevel(logging.DEBUG)

# Create handlers
c_handler = logging.StreamHandler()
f_handler = logging.FileHandler('file.log')

c_handler.setLevel(logging.WARNING)
f_handler.setLevel(logging.ERROR)

# Create formatters and add it to handlers
c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)

In [2]:
try:
    import pandas as pd
    logger.info('Successfully imported pandas.')
except ImportError:
    logger.error('Could not import pandas.')

try:
    import numpy as np
    logger.info('Successfully imported numpy.')
except ImportError:
    logger.error('Could not import numpy.')
    
try:
    import matplotlib.pyplot as plt
    logger.info('Successfully imported matplotlib.')
except ImportError:
    logger.error('Could not import matplotlib.')
    
try:
    import warnings
    logger.info('Successfully imported warnings.')
except ImportError:
    logger.error('Could not import warnings.')
    
try:
    import scipy.stats as ss
    from scipy.stats import f_oneway, norm
    logger.info('Successfully imported scipy.stats.')
except ImportError:
    logger.error('Could not import scipy.stats.')

try:
    from collections import Counter
    logger.info('Successfully imported collections.Counter.')
except ImportError:
    logger.error('Could not import collections.Counter.')

try:
    import math
    logger.info('Successfully imported math.')
except ImportError:
    logger.error('Could not import math.')
    
try:
    from itertools import product
    logger.info('Successfully imported itertools.product.')
except ImportError:
    logger.error('Could not import itertools.product.')

try:
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, RobustScaler
    logger.info('Successfully imported sklearn.preprocessing modules.')
except ImportError:
    logger.error('Could not import sklearn.preprocessing modules.')

try:
    from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, GridSearchCV
    logger.info('Successfully imported sklearn.model_selection modules.')
except ImportError:
    logger.error('Could not import sklearn.model_selection modules.')

try:
    from sklearn.metrics import classification_report, f1_score, roc_auc_score, confusion_matrix, accuracy_score
    from sklearn.metrics import roc_curve, roc_auc_score, ConfusionMatrixDisplay, recall_score, precision_score
    logger.info('Successfully imported sklearn.metrics modules.')
except ImportError:
    logger.error('Could not import sklearn.metrics modules.')

try:
    from sklearn.tree import DecisionTreeClassifier
    logger.info('Successfully imported DecisionTreeClassifier.')
except ImportError:
    logger.error('Could not import DecisionTreeClassifier.')

try:
    from sklearn.ensemble import RandomForestClassifier
    logger.info('Successfully imported RandomForestClassifier.')
except ImportError:
    logger.error('Could not import RandomForestClassifier.')

try:
    from sklearn.ensemble import AdaBoostClassifier
    logger.info('Successfully imported AdaBoostClassifier.')
except ImportError:
    logger.error('Could not import AdaBoostClassifier.')
    
try:
    from sklearn.ensemble import GradientBoostingClassifier
    logger.info('Successfully imported GradientBoostingClassifier.')
except ImportError:
    logger.error('Could not import GradientBoostingClassifier.')

try:
    from sklearn.linear_model import LogisticRegression
    logger.info('Successfully imported LogisticRegression.')
except ImportError:
    logger.error('Could not import LogisticRegression.')

try:
    from sklearn.neural_network import MLPClassifier
    logger.info('Successfully imported MLPClassifier.')
except ImportError:
    logger.error('Could not import MLPClassifier.')

try:
    from sklearn.preprocessing import LabelEncoder
    logger.info('Successfully imported LabelEncoder from sklearn.preprocessing.')
except ImportError:
    logger.error('Could not import LabelEncoder from sklearn.preprocessing.')

try:
    warnings.filterwarnings("ignore")
    logger.info('Successfully ignored warnings.')
except Exception as e:
    logger.error('Could not ignore warnings. Error: %s', e)

try:
    pd.set_option('display.max_columns', None)
    logger.info('Successfully set pandas display option to maximum.')
except Exception as e:
    logger.error('Could not set pandas display option. Error: %s', e)

try:
    import os
    logger.info('Successfully imported os.')
except ImportError:
    logger.error('Could not import os.')

# Data load

In [3]:
file_path = r'C:\Users\brianda.nunez\Documents\GitHub\finalproject\Project\finalproject_itesm_mlops\data\fraud_oracle.csv'

try:
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file '{file_path}' does not exist.")

    df = pd.read_csv(file_path)
    logger.info('Successfully read the CSV file.')
except FileNotFoundError as e:
    logger.error(str(e))
except Exception as e:
    logger.error('Could not read the CSV file. Error: %s', e)

# Data exploration

In [4]:
try:
    head_df = df.head()
    logger.info('Successfully retrieved the head of the DataFrame.')
    logger.debug('Head of DataFrame:\n%s', head_df)
except Exception as e:
    logger.error('Could not retrieve the head of the DataFrame. Error: %s', e)

In [5]:
try:
    info_df = df.info()
    logger.info('Successfully retrieved the info of the DataFrame.')
    logger.debug('Info of DataFrame:\n%s', info_df)
except Exception as e:
    logger.error('Could not retrieve the info of the DataFrame. Error: %s', e)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

In [6]:
try:
    null_counts = df.isnull().sum()
    logger.info('Null counts:\n%s', null_counts)
except Exception as e:
    logger.error('An error occurred while calculating null counts. Error: %s', e)

# Data transformation

In [7]:
# Refactored code:
for column in df.columns:
    if column != 'PolicyNumber':
        logger.info(f'Column: {column}')
        unique_values = sorted(df[column].unique())
        logger.info(f'Unique values: {unique_values}\n')

## Issue
- DayOfWeekClaimed, MonthClaimed, and Age contains a 0
- PolicyType is a concenation of VehicleCategory and BasePolicy
- PolicyNumber just a row number

In [8]:
# Check DayOfWeekClaimed == 0
try:
    filtered_df = df[df['DayOfWeekClaimed'] == '0']
    logger.debug('Filtered DataFrame:\n%s', filtered_df)
except Exception as e:
    logger.warning('A warning occurred while filtering the DataFrame. Error: %s', e)

In [9]:
# Check MonthClaimed == 0
try:
    filtered_df = df[df['MonthClaimed'] == '0']
    logger.debug('Filtered DataFrame:\n%s', filtered_df)
except Exception as e:
    logger.warning('A warning occurred while filtering the DataFrame. Error: %s', e)

In [10]:
# Check Age == 0
print(df[df['Age']==0].shape)
df[df['Age']==0].sample(3)

(320, 33)


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
1072,Feb,1,Saturday,Honda,Urban,Monday,Feb,3,Male,Single,0,Policy Holder,Sedan - All Perils,Sedan,more than 69000,1,1073,3,400,1,more than 30,more than 30,more than 4,new,16 to 17,No,No,External,none,no change,1 vehicle,1994,All Perils
10181,Dec,5,Thursday,Honda,Urban,Monday,Jan,1,Male,Single,0,Policy Holder,Utility - All Perils,Utility,more than 69000,0,10182,3,400,3,more than 30,more than 30,1,new,16 to 17,No,No,External,none,no change,1 vehicle,1995,All Perils
2730,Apr,2,Saturday,Honda,Rural,Monday,Apr,2,Male,Single,0,Policy Holder,Sedan - Collision,Sedan,more than 69000,0,2731,6,400,3,more than 30,more than 30,none,new,16 to 17,No,No,External,none,1 year,2 vehicles,1994,Collision


In [11]:
df = df.drop(columns='PolicyNumber')

## Cleansing

In [12]:
# Drop DayOfWeekClaimed & MonthClaimed == 0, since the 0 in the same row and only one row, so we can drop it
df = df[~(df['MonthClaimed']=='0')]

In [13]:
# Analysis row with Age == 0
df[df['Age']==0]['AgeOfPolicyHolder'].unique() #Result = array(['16 to 17'], dtype=object)

# Because the row with Age == 0, only appear in row with AgeOfPolicyHolder == '16 to 17', i will impute with 16.5
df['Age'] =df['Age'].replace({0:16.5})

# Preprocessing

In [14]:
# Drop Policy Type
df = df.drop(columns='PolicyType')

## Encode Categorical Data

In [15]:
class CustomLabelEncoder:
    def __init__(self, col_ordering):
        self.col_ordering = col_ordering
        self.label_encoders = {}

    def fit_transform(self, df):
        for item in self.col_ordering:
            col = item['col']
            mapping = item['mapping']
            le = LabelEncoder()
            le.fit_transform(list(mapping.keys()))
            self.label_encoders[col] = le

            df[col] = df[col].map(mapping)

            logger.info(f'Column: {col} - LabelEncoder created and fit_transformed')

        return df

    def transform(self, df):
        for col, le in self.label_encoders.items():
            df[col] = df[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else None)

            logger.info(f'Column: {col} - Transform applied')

        return df

In [16]:
col_ordering = [
    {'col':'AccidentArea','mapping':{'Urban':1, 'Rural':0}},
    {'col':'Sex','mapping':{'Female':1, 'Male':0}},
    {'col':'Fault','mapping':{'Policy Holder':1, 'Third Party':0}},
    {'col':'PoliceReportFiled','mapping':{'Yes':1, 'No':0}},
    {'col':'WitnessPresent','mapping':{'Yes':1, 'No':0}},
    {'col':'AgentType','mapping':{'External':1, 'Internal':0}},
    {'col':'Month','mapping':{'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}},
    {'col':'DayOfWeek','mapping':{'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}},
    {'col':'DayOfWeekClaimed','mapping':{'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}},
    {'col':'MonthClaimed','mapping':{'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}},
    {'col':'PastNumberOfClaims','mapping':{'none':0 ,'1':1,'2 to 4':2,'more than 4':3 }},
    {'col':'NumberOfSuppliments','mapping':{'none':0,'1 to 2':1,'3 to 5':2,'more than 5':3}}, 
    {'col':'VehiclePrice','mapping':{'less than 20000':0,'20000 to 29000':1,'30000 to 39000':2,
                                     '40000 to 59000':3,'60000 to 69000':4,'more than 69000':5}},
    {'col':'AgeOfVehicle','mapping':{'3 years':3,'6 years':6,'7 years':7,'more than 7':8,'5 years':5,'new':0,'4 years':4,'2 years':2}},
    {'col':'Days_Policy_Accident','mapping':{'more than 30':4,'15 to 30':3,'none':0,'1 to 7':1,'8 to 15':2}},
    {'col':'Days_Policy_Claim','mapping':{'more than 30':4,'15 to 30':3,'none':0,'1 to 7':1,'8 to 15':2}},
    {'col':'AgeOfPolicyHolder','mapping':{'16 to 17':1,'18 to 20':2,'21 to 25':3,'26 to 30':4,'31 to 35':5,'36 to 40':6,
                                          '41 to 50':7,'51 to 65':8,'over 65':9}},
    {'col':'AddressChange_Claim','mapping':{'no change':0,'under 6 months':1,'1 year':2,'2 to 3 years':3,'4 to 8 years':4}},
    {'col':'NumberOfCars','mapping':{'1 vehicle':1,'2 vehicles':2,'3 to 4':3,'5 to 8':4,'more than 8':5}}
]

label_encoder = CustomLabelEncoder(col_ordering)
df_encoded = label_encoder.fit_transform(df)

In [17]:
df3 = pd.get_dummies(df_encoded)

In [18]:
df3.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,Age,Fault,VehiclePrice,FraudFound_P,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,Make_Accura,Make_BMW,Make_Chevrolet,Make_Dodge,Make_Ferrari,Make_Ford,Make_Honda,Make_Jaguar,Make_Lexus,Make_Mazda,Make_Mecedes,Make_Mercury,Make_Nisson,Make_Pontiac,Make_Porche,Make_Saab,Make_Saturn,Make_Toyota,Make_VW,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,VehicleCategory_Sedan,VehicleCategory_Sport,VehicleCategory_Utility,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability
0,12,5,3,1,2,1,1,1,21.0,1,5,0,12,300,1,4,4,0,3,4,0,0,1,0,2,3,1994,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True
1,1,3,3,1,1,1,4,0,34.0,1,5,0,15,400,4,4,4,0,6,5,1,0,1,0,0,1,1994,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False
2,10,5,5,1,4,11,2,0,47.0,1,5,0,7,400,3,4,4,1,7,7,0,0,1,0,0,1,1994,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False
3,6,2,6,0,5,7,1,0,65.0,0,1,0,4,400,2,4,4,1,8,8,1,0,1,3,0,1,1994,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,True
4,1,5,1,1,2,2,2,1,27.0,0,5,0,3,400,1,4,4,0,5,5,0,0,1,0,0,1,1994,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False


In [19]:
# Refactored code:
def chi_square(df3):
    cat_var_prod = [(i,j) for i in df3.columns for j in df3.columns if i!=j]
    result = []
    for i in cat_var_prod:
        result.append((i[0],i[1],list(ss.chi2_contingency(pd.crosstab(
                                    df3[i[0]], df3[i[1]])))[1]))
    chi_test_output = pd.DataFrame(result, columns = ['var1', 'var2', 'coeff'])
    chi_test_output2 = chi_test_output[chi_test_output['var1']=='FraudFound_P'].sort_values('coeff').reset_index(drop=True)
    def rej_acc(x):
        if x > 0.05:
            Ho = 'A_H0'
        else:
            Ho = 'R_H0'
        return Ho

    chi_test_output2['result'] = chi_test_output2['coeff'].apply(rej_acc)
    return chi_test_output2

print(chi_square(df3))


            var1                     var2         coeff result
0   FraudFound_P     BasePolicy_Liability  3.116620e-81   R_H0
1   FraudFound_P    VehicleCategory_Sport  1.208553e-63   R_H0
2   FraudFound_P                    Fault  1.406180e-59   R_H0
3   FraudFound_P    VehicleCategory_Sedan  9.021607e-52   R_H0
4   FraudFound_P    BasePolicy_All Perils  8.400805e-44   R_H0
5   FraudFound_P      AddressChange_Claim  9.704718e-22   R_H0
6   FraudFound_P               Deductible  1.302831e-15   R_H0
7   FraudFound_P             VehiclePrice  2.888324e-13   R_H0
8   FraudFound_P       PastNumberOfClaims  1.405198e-11   R_H0
9   FraudFound_P              Make_Accura  2.516880e-09   R_H0
10  FraudFound_P     BasePolicy_Collision  6.305578e-08   R_H0
11  FraudFound_P  VehicleCategory_Utility  1.431136e-05   R_H0
12  FraudFound_P             MonthClaimed  1.495245e-05   R_H0
13  FraudFound_P             AccidentArea  3.936304e-05   R_H0
14  FraudFound_P        AgeOfPolicyHolder  5.896560e-05

# Split Data

In [20]:
X = df3.drop(columns='FraudFound_P')
y = df3['FraudFound_P']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48, stratify=y)

In [22]:
y_train.value_counts()

FraudFound_P
0    11597
1      738
Name: count, dtype: int64

# Model function for training

In [23]:
# Refactored code:
modelname, acc, recall, precision, f1, roc_auc = [],[],[],[],[],[]
# Define a dictionary with the models we want to test.
model = {'dt': DecisionTreeClassifier(),
         'rf':RandomForestClassifier(),
          'adaboost':AdaBoostClassifier(), 
          'gradientb':GradientBoostingClassifier(),
         'mlp':MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32, 16, 8), max_iter=300,activation = 'relu',solver='adam',random_state=1),
        }

# Define a function to fit the models and save the evaluation metrics.
def fit_model(x_train, x_test, y_train, y_test, sampling):
    # Iterate over the models
    for key,value in zip(model, model.values()):
        print(f"Model {key} {sampling}")
        ml_model = value
        ml_model.fit(x_train, y_train)
        y_pred = ml_model.predict(x_test)
        
        # Save the evaluation metrics
        modelname.append(f'{key} {sampling}')
        acc.append(accuracy_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        roc_auc.append(roc_auc_score(y_test, y_pred))

# Define a function to oversample or undersample the training data
def sampling(x_train, y_train, kind='over', ss=0.5):
    if kind == 'over':
        oversample = SMOTE(sampling_strategy=ss)
        X, y = oversample.fit_resample(x_train, y_train)
    elif kind == 'under':
        join_train = pd.concat([x_train, y_train], axis=1)
        claim = join_train[join_train['FraudFound_P']==1]
        no_claim = join_train[join_train['FraudFound_P']==0]

        undersample_noclaim = no_claim.sample(len(claim)*3)
        join_train2 = pd.concat([claim,undersample_noclaim]).sample(frac=1)

        X = join_train2.drop(columns='FraudFound_P')
        y = join_train2['FraudFound_P']
    return X, y

def metric_result(y_test, y_pred):
    # Prints the F1 Score, Recall Score and Precision Score metrics
    print("F1 Score : ",f1_score(y_test, y_pred, average='binary'))
    print("Recall Score : ",recall_score(y_test, y_pred))
    print("Precision Score : ",precision_score(y_test, y_pred))

    # Shows the confusion matrix
    ig, ax = plt.subplots(1,2, figsize=(10,5))
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(ax=ax[0])
    ax[0].set_title('Confusion Matrix')

    # Shows the ROC AUC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    ax[1].plot(fpr,tpr, label="AUC="+str(auc))
    ax[1].set_title('ROC AUC')
    ax[1].set_ylabel('True Positive Rate')
    ax[1].set_xlabel('False Positive Rate')
    ax[1].legend(loc=4)
    plt.show()

def glm_result(res, X_test, y_test):
    # Adds a constant to the test data X_test
    X_test_sm = sm.add_constant(X_test)
    
    # Predicts the values of y from the model res and the test data X_test_sm
    y_pred = res.predict(X_test_sm)

    # Creates a DataFrame with the columns is_claim_real and is_claim_prob
    df_res = pd.DataFrame({'is_claim_real':y_test, 'is_claim_prob':y_pred})
    
    # Creates a list with the thresholds for binary prediction
    cut_off = [0.0,0.05, 0.1, 0.15 , 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6,
               0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    
    # Creates a list with the F1 Score results for each threshold
    f1_result = []
    
    # Iterates over the thresholds to calculate the corresponding F1 Score for each threshold
    for i in cut_off:
        df_res['final_predicted'] = df_res['is_claim_prob'].map( lambda x: 1 if x > i else 0)
        f1_result.append(roc_auc_score(df_res['is_claim_real'], df_res['final_predicted']))
    
    # Creates a DataFrame with the columns cut_off and f1_score
    df_res2 = pd.DataFrame({'cut_off':cut_off,'f1_score':f1_result})
    
    # Finds the best threshold for binary prediction from ROC AUC analysis
    best_tresh = df_res2.sort_values('f1_score', ascending=False).head(1)['cut_off'].values[0]
    
    # Predicts binary values from the best threshold found.
    y_pred_thresh = (y_pred >= best_tresh).astype('float')
    print('Best Threshold :', best_tresh)
    metric_result(y_test, y_pred_thresh)

def find_best_tresh(pred_proba, y_test):
    # Crea un DataFrame con las probabilidades de predicción
    df_res = pd.DataFrame({'prob':pred_proba})
    
    # Creates a list with the thresholds for binary prediction
    cut_off = [0.0,0.05, 0.1, 0.15 , 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6,
               0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    
    # Creates a list with the F1 Score results for each threshold
    f1_result = []
    
    # Itera sobre los umbrales para calcular el F1 Score correspondiente a cada umbral
    for i in cut_off:
        df_res['final_predicted'] = df_res['prob'].map( lambda x: 1 if x > i else 0)
        f1_result.append(roc_auc_score(y_test, df_res['final_predicted']))
    
    # Creates a DataFrame with the columns cut_off and f1_score
    df_res2 = pd.DataFrame({'cut_off':cut_off,'f1_score':f1_result})
    
    # Finds the best threshold for binary prediction from ROC AUC analysis
    best_tresh = df_res2.sort_values('f1_score', ascending=False).head(1)['cut_off'].values[0]
    
    # Predicts binary values from the best threshold found
    y_pred_thresh = (y_pred >= best_tresh).astype('float')
    
    # Prints the best threshold found
    print('Best Threshold :', best_tresh)
    
    # Prints evaluation metrics for test values and predicted values
    metric_result(y_test, y_pred_thresh)

# Model results

In [24]:
fit_model(X_train, X_test, y_train, y_test, 'Normal')

Model dt Normal
Model rf Normal
Model adaboost Normal
Model gradientb Normal
Model mlp Normal


In [25]:
df_model = pd.DataFrame({'model':modelname, 'accuracy':acc, 'recall':recall, 'precision':precision, 'f1':f1, 'roc_auc':roc_auc})

In [26]:
df_model

Unnamed: 0,model,accuracy,recall,precision,f1,roc_auc
0,dt Normal,0.89332,0.210811,0.175676,0.191646,0.573843
1,rf Normal,0.940013,0.010811,0.5,0.021164,0.50506
2,adaboost Normal,0.937743,0.027027,0.294118,0.049505,0.511444
3,gradientb Normal,0.941958,0.037838,0.875,0.072539,0.518746
4,mlp Normal,0.940013,0.0,0.0,0.0,0.5


# Save the model

In [27]:
from joblib import dump

dump(model['dt'], 'dtmodel.pkl')

['dtmodel.pkl']