In [6]:
# Data handling
import numpy as np
import pandas as pd

# For visualization
import altair as alt

# Feature Selection
from sklearn.feature_selection import RFE

# Models
from sklearn import tree 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Split
from sklearn.model_selection import train_test_split

# Evaluation
from sklearn.metrics import confusion_matrix

import time
import pickle

# Data

In [7]:
# Training data
train = pd.read_csv("../data/processed_data/home_train.csv") 
X = train.drop(columns = ['CLAIM', 'Unnamed: 0'])
y = train['CLAIM']

# Splitting the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=1234)

In [8]:
print(f"X train shape: {X_train.shape}")
print(f"X valid shape: {X_valid.shape}")
print(f"y train shape: {y_train.shape}")
print(f"y valid shape: {y_valid.shape}")

X train shape: (74009, 83)
X valid shape: (18503, 83)
y train shape: (74009,)
y valid shape: (18503,)


# Feature selection

In this section I will select the features

In [9]:
def fit_and_report(model, X, y, Xv, yv, mode = 'regression'):
    """
    Fits a given model and calculates its score in the training and the validation set.
    ----------------------------------------------------
    
    Paramaters:
    ----------------------------------------------------
    model: model to fit
    X: Training X matrix
    y: Training response vector
    Xv: Validation X matrix
    yv: Validation response vector
    mode: Type of estimation classification and
    
    Returns:
    ---------------------------------------------------
    errors: list with the training and validation error
    
    Example:
    --------------------------------------------------
    fit_and_report(LogisticRegression(), X, y, Xv, yv, mode = 'classification')
    """
    model.fit(X, y)
    if mode.lower().startswith('regress'):
        errors = [mean_squared_error(y, model.predict(X)), mean_squared_error(yv, model.predict(Xv))]
    if mode.lower().startswith('classif'):
        errors = [1 - model.score(X,y), 1 - model.score(Xv,yv)]        
    return errors

In [None]:
# This section is using the RFE function to select the most important variables
# I chose 40 variables to test but I am interested in the order and the error
n_features = 40

# This dictionary stores the results
results = {'Stage':[],
           'N_features':[],
           'Score':[]}

for i in range(1, n_features + 1):
    
    start = time.time()
    
    print(f"iteration {i}")
    # I am using a logistic regression as a 
    lr = LogisticRegression(solver = "liblinear", class_weight='balanced')
    rfe = RFE(estimator = lr, n_features_to_select = i)
    rfe.fit(X_train, y_train)
    
    selected_features = rfe.support_
    print(f"Selected Features: {X_train.columns[selected_features]}")
    
    
    scores = fit_and_report(lr, 
                            X_train.iloc[:,selected_features], 
                            y_train, 
                            X_valid.iloc[:,selected_features], 
                            y_valid,
                            mode="classification")
    
    end = time.time() - start
    print(f"Time: {end}")
    print("-------------------")
    results['Stage'].append('Train')
    results['N_features'].append(i)
    results['Score'].append(scores[0])
    results['Stage'].append('Validation')
    results['N_features'].append(i)
    results['Score'].append(scores[1])

iteration 1


In [None]:
variable_selection = pd.DataFrame(results)

In [None]:
# This section shows the results of the variable selection process

alt.Chart(variable_selection).mark_line().encode(
    alt.Y('Score:Q'),
    alt.X('N_features:O', title = 'Number of features'),
    alt.Color('Stage:N')
).properties(
    width = 600,
    height = 400,
    title = 'Score by number of features using RFE'
)

In [None]:
variable_selection.to_csv("../results/data/variable_selection.csv")

## Modeling

Given it is a classification model, I am going to fit a:
- Decision Tree
- KNN classifier
- Logistic regression
- Support vector classifier
- Random forest
- XGboost
- lgbm

In [10]:
# Chosen variables
variables = ['NCD_GRANTED_YEARS_C', 'AD_CONTENTS_Y', 'PAYMENT_METHOD_NonDD', 'PAYMENT_METHOD_PureDD']

In [11]:
# Helper function
def evaluate_model(X_train, y_train, X_valid, y_valid, models):
    """
    Evaluates a group of models

    Parameters:
    X_valid -- (dataframe) validation X
    X_train -- (dataframe) train X
    y_valid -- (series) validation y
    y_train -- (series) train y
    models -- (dictionary) models dictionary
    
    Returns:
    results -- (dictionary) dictionary containing model, train error, validation error
    and elapsed training and validation time
    
    importances -- (dataframe) feature importances
    """
    results = {}
        
    for model_name, model in models.items():
        
        # Timing the model
        print(f"Fitting model: {model_name}")
        t = time.time()

        # Fitting the model as a pipeline

        model.fit(X_train, y_train);
        tr_err, valid_err = model.score(X_train, y_train), model.score(X_valid, y_valid)
        
        elapsed_time = time.time() - t
        
        results[model_name] = [model, round(tr_err,3), round(valid_err,3), round(elapsed_time,4)]
    
    return results


In [12]:
# List of models to use
models = {
          'decision tree': DecisionTreeClassifier(class_weight='balanced'),
          'kNN': KNeighborsClassifier(),
          'logistic regression': LogisticRegression(solver ='liblinear', class_weight='balanced'),
          'RBF SVM' :  SVC(gamma = 'scale', class_weight='balanced'), 
          'random forest' : RandomForestClassifier(class_weight='balanced'), 
          'xgboost' : XGBClassifier(),
          'lgbm': LGBMClassifier(class_weight='balanced'),
          'Dummy': DummyClassifier(strategy='stratified')
         }

In [16]:
results = evaluate_model(X_train.loc[:,variables], y_train, X_valid.loc[:,variables], y_valid, models)

Fitting model: decision tree
Fitting model: kNN
Fitting model: logistic regression
Fitting model: RBF SVM
Fitting model: random forest
Fitting model: xgboost
Fitting model: lgbm
Fitting model: Dummy


In [18]:
for model_name, model in results.items():
    print(f"Model Name: {model_name}")
    cm = confusion_matrix(y_valid, model[0].predict(X_valid[variables]))
    cm_train = confusion_matrix(y_train, model[0].predict(X_train[variables]))
    print(f"Confusion matrix: {cm_train}")

    tp = cm[1,1]
    fn = cm[1,0]
    fp = cm[0,1]
    
    recall = tp/(tp+fn)
    precision = tp/(tp+fp)
    f1 = 2 * precision * recall/(precision + recall)
    
    results[model_name].append(recall)
    results[model_name].append(precision)
    results[model_name].append(f1)
    
    print(f"The recall is {recall:.3f}")
    print(f"The precision is {precision:.3f}")
    print(f"The f1 is {f1:.3f}")
    print("--------")

Model Name: decision tree
Confusion matrix: [[62483  2827]
 [ 1525  7174]]
The recall is 0.835
The precision is 0.724
The f1 is 0.775
--------
Model Name: kNN
Confusion matrix: [[64717   593]
 [ 2311  6388]]
The recall is 0.745
The precision is 0.918
The f1 is 0.822
--------
Model Name: logistic regression
Confusion matrix: [[60116  5194]
 [ 1376  7323]]
The recall is 0.857
The precision is 0.594
The f1 is 0.701
--------
Model Name: RBF SVM
Confusion matrix: [[62483  2827]
 [ 1525  7174]]
The recall is 0.835
The precision is 0.724
The f1 is 0.775
--------
Model Name: random forest
Confusion matrix: [[62483  2827]
 [ 1525  7174]]
The recall is 0.835
The precision is 0.724
The f1 is 0.775
--------
Model Name: xgboost
Confusion matrix: [[64641   669]
 [ 2228  6471]]
The recall is 0.754
The precision is 0.910
The f1 is 0.825
--------
Model Name: lgbm
Confusion matrix: [[62483  2827]
 [ 1525  7174]]
The recall is 0.835
The precision is 0.724
The f1 is 0.775
--------
Model Name: Dummy
Confus

In [15]:
# Final model
fm = results['xgboost'][0]

In [None]:
results_df = pd.DataFrame(results, index=['model', 'Train_score', 'Test_score', 'Train_test_time', 'Recall', 'Precision', 'F1']).T
results_df.to_csv("../results/data/results_df.csv")

In [None]:
# Dumping the model
filename = '../results/final_model.sav'
pickle.dump(fm, open(filename, 'wb'))

# Final model evaluation

Testing the model in the test dataset

In [None]:
test_final = pd.read_csv("../data/processed_data/home_test.csv") 
X_test_final = test_final.loc[:,variables]
y_test_final = test_final['CLAIM']

In [None]:
y_test_predict = fm.predict(X_test_final)

In [None]:
cm = confusion_matrix(y_test_final, y_test_predict)

tp = cm[1,1]
fn = cm[1,0]
fp = cm[0,1]
tn = cm[0,0]
    
recall = tp/(tp+fn)
precision = tp/(tp+fp)
f1 = 2 * precision * recall/(precision + recall)

score = fm.score(X_test_final,y_test_final)

print(f"The recall is {recall:.3f}")
print(f"The precision is {precision:.3f}")
print(f"The f1 is {f1:.3f}")
print(f"The score {score:.3f}")

In [149]:
results_test = {
    'tp':tp,
    'fp':fp,
    'tn':tn,
    'fn':fn,
    'recall':recall,
    'precision':precision,
    'f1': f1,
    'score':score
}

results_test_df = pd.DataFrame(results_test, index=[0])

In [152]:
results_test_df.to_csv("../results/data/results_test.csv")