All EDA and feature engineering can be found in the EDA notebook:                  
https://colab.research.google.com/drive/19JRIwA37hVrOnGjvbksFUxSELrN_GiSe?usp=sharing
                           
                           
The final results of this project can be found at:                   
https://docs.google.com/spreadsheets/d/1n8iYSgsZQE5TFF12d1C_VqRRVXb-NFfnHcusiSEQkdQ/edit?usp=sharing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Load the engineered training set and show it as the cell output
# for a quick visual sanity check.
data = pd.read_csv("Final Train Data.csv")
data

Unnamed: 0,disrict,client_catg,region,target,diff_index,tarif_type_mode,reading_remarque_mode,counter_coefficient_mode,counter_type_mode,consommation_level_1_mean,...,consommation_level_3_mean,consommation_level_3_stddiv,consommation_level_4_mean,consommation_level_4_stddiv,diff_index_mean,diff_index_stddiv,reading_remarque_min,CreationDay,CreationMonth,CreationYear
0,60,11,1,0.0,82,11,6,1,0,352.400000,...,0.000000,0.000000,0.000000,0.000000,362.971429,341.553930,6,31,12,1994
1,69,11,1,0.0,1384,11,6,1,0,557.540541,...,0.000000,0.000000,0.000000,0.000000,557.540541,197.935960,6,29,5,2002
2,62,11,3,0.0,123,11,6,1,0,798.611111,...,0.000000,0.000000,0.000000,0.000000,836.500000,646.808386,6,13,3,1986
3,69,11,1,0.0,102,11,6,1,0,1.200000,...,0.000000,0.000000,0.000000,0.000000,1.200000,3.607011,6,7,11,1996
4,62,11,3,0.0,572,11,9,1,0,663.714286,...,117.357143,289.433294,36.714286,105.421081,922.642857,633.485669,8,14,10,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135488,62,11,3,0.0,819,11,6,1,0,1.957746,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6,26,7,2004
135489,63,11,3,0.0,737,40,9,1,1,185.853659,...,0.000000,0.000000,0.000000,0.000000,186.609756,203.347840,6,25,10,2012
135490,63,11,3,0.0,274,11,9,1,0,273.083333,...,0.000000,0.000000,0.000000,0.000000,273.083333,211.123441,9,22,11,2011
135491,60,11,1,0.0,825,10,6,1,0,300.000000,...,0.000000,0.000000,0.000000,0.000000,370.500000,232.638131,6,22,12,1993


In [None]:
def prepareData(filename='Final Train Data.csv'):
    """Read a CSV file and separate the label column from the features.

    Parameters
    ----------
    filename : path of the CSV file to read; it must contain a 'target' column.

    Returns
    -------
    (x, y) : x is the loaded frame without the 'target' column,
             y is the 'target' column as a Series.
    """
    frame=pd.read_csv(filename)
    labels=frame['target']
    features=frame.drop(columns='target')
    return features,labels

In [None]:
#!pip install imblearn

In [None]:
from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Resampling strategy switches. Set exactly one flag to True; if several are True
# the first one in the order oversample > undersample > smote is applied, and if
# all are False the data is used exactly as loaded.
oversample,undersample,smote=False,False,True # Pick whichever one you need to do

x,y=prepareData()
x_res,y_res=x,y

# NOTE(review): the resampled data is what gets passed to train_test_split later in
# the notebook, so resampled/synthetic rows can end up in the test fold. Consider
# splitting first and resampling only the training fold.
# Every sampler is seeded so that a fresh "Restart & Run All" yields the same data.
if oversample:
    print("Over sampling chosen")
    ros=RandomOverSampler(random_state=42)
    x_res,y_res=ros.fit_resample(x,y)
elif undersample:
    print("Under sampling chosen")
    rus=RandomUnderSampler(random_state=42)
    x_res,y_res=rus.fit_resample(x,y)
elif smote:
    print("SMOTE chosen")
    # Named smote_sampler rather than `os` to avoid shadowing the stdlib module name.
    smote_sampler=SMOTE(random_state=42)
    x_res,y_res=smote_sampler.fit_resample(x,y)

SMOTE chosen


In [None]:
# Stratified 80/20 split of the (possibly resampled) data. The split is seeded so the
# same rows land in train/test on every run and the reported metrics are reproducible.
xtrain,xtest,ytrain,ytest=train_test_split(x_res,y_res,test_size=0.2,stratify=y_res,random_state=42)

In [None]:
# Report the (rows, columns) shape of each split, one per line.
print(f'Xtrain: {xtrain.shape}',f'Xtest: {xtest.shape}',sep='\n')

Xtrain: (204683, 22)
Xtest: (51171, 22)


In [None]:
def printAllResults(ytest,preds):
    """Score predictions, log them to the results CSV and print them.

    Computes recall, precision, F1, accuracy and ROC AUC from the true labels
    `ytest` and the predicted labels `preds`, appends one line to
    "results smote.csv" in the order accuracy,precision,recall,f1,roc, prints
    every metric, and returns them.

    Returns
    -------
    (recall, precision, f1, accuracy, roc_auc)
    """
    from sklearn.metrics import recall_score,precision_score,f1_score,accuracy_score,roc_auc_score

    # (printed label, scoring function) pairs, in the order they are computed and printed.
    scorers=(
        ("Recall:",recall_score),
        ("Precision:",precision_score),
        ("F1 Score:",f1_score),
        ("Accuracy:",accuracy_score),
        ("AUC ROC:",roc_auc_score),
    )
    labelled=[(label,scorer(ytest,preds)) for label,scorer in scorers]
    recall,prec,f1,acc,roc=(value for _,value in labelled)

    # Append mode: each call adds one row, so results accumulate across models and runs.
    with open("results smote.csv",'a') as f:
        f.write(f'{acc},{prec},{recall},{f1},{roc}\n')

    for label,value in labelled:
        print(label,value)

    return recall,prec,f1,acc,roc

In [None]:
def trainModel(model):
    """Fit `model` on the training split and report its metrics on the test split.

    Uses the notebook-level variables xtrain, ytrain, xtest and ytest, and hands the
    test-set predictions to printAllResults, which prints and logs the metrics.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X). Estimators whose class
            name is 'SVC' are trained and evaluated on min-max scaled features.
    """
    train_features,test_features=xtrain,xtest

    if type(model).__name__=='SVC':
        #Scale all input data to ensure faster training for SVMs
        from sklearn.preprocessing import MinMaxScaler
        scaler=MinMaxScaler().fit(xtrain)
        train_features=scaler.transform(xtrain)
        # The test split must go through the scaler fitted on the training split.
        # Fitting a second scaler on xtest would map the test features with a
        # different min/max than the model saw during training.
        test_features=scaler.transform(xtest)

    model.fit(train_features,ytrain)
    preds=model.predict(test_features)
    printAllResults(ytest,preds)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import lightgbm as lgbm

In [None]:
from time import time
def readTime(s):
    """Print a duration of `s` seconds in the text form produced by datetime.timedelta."""
    from datetime import timedelta
    elapsed=timedelta(seconds=s)
    print(elapsed)

In [None]:
# Candidate classifiers, trained and evaluated one after another.
# - LogisticRegression: max_iter is raised because the default iteration budget was
#   exhausted before convergence (see the solver warning this cell produced).
#   NOTE(review): the features are unscaled, so convergence may still need scaling.
# - Tree/ensemble/boosting models are seeded so repeated runs give the same metrics.
# - KNN uses sqrt(number of training rows) as the neighbour count.
modelsToTrain=[LogisticRegression(max_iter=1000),GaussianNB(),
               DecisionTreeClassifier(random_state=42),RandomForestClassifier(random_state=42),
               KNeighborsClassifier(n_neighbors=int(np.sqrt(len(xtrain)))),
               SVC(kernel='poly',degree=3), SVC(kernel='rbf'),lgbm.LGBMClassifier(random_state=42)]

for model in modelsToTrain:
    print(f'Training {type(model).__name__}...')
    t0=time()
    trainModel(model)
    # Wall-clock time for fit + predict + metric reporting of this model.
    readTime(time()-t0)
    print('-'*90)

Training LogisticRegression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Recall: 0.43244088333007624
Precision: 0.6723792160437557
F1 Score: 0.5263558515699335
Accuracy: 0.6108733462312638
AUC ROC: 0.6108698593153157
0:00:01.438742
------------------------------------------------------------------------------------------
Training GaussianNB...
Recall: 0.9356263435606801
Precision: 0.5373891570322146
F1 Score: 0.682675032083274
Accuracy: 0.565105235387231
AUC ROC: 0.5651124760873829
0:00:00.203456
------------------------------------------------------------------------------------------
Training DecisionTreeClassifier...
Recall: 0.9177643150283369
Precision: 0.8826116373477673
F1 Score: 0.8998447948801472
Accuracy: 0.8978522991538176
AUC ROC: 0.8978526882731772
0:00:03.366994
------------------------------------------------------------------------------------------
Training RandomForestClassifier...
Recall: 0.9482900136798905
Precision: 0.9544077731009795
F1 Score: 0.9513390581500215
Accuracy: 0.9514959645111489
AUC ROC: 0.9514959018606597
0:00:47.958718
---