In [1]:
# import libraries
import numpy as np
import pandas as pd
import patsy as pt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
## for SVM
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
## for kNN
from sklearn.neighbors import KNeighborsClassifier
## for logistic
import statsmodels.api as sm
## for boosting
from sklearn.ensemble import GradientBoostingClassifier

  import pandas.util.testing as tm


In [2]:
# read in data
data = pd.read_csv('the_train.csv').drop('EQUIPMENT_COST_DIVISION_CD', axis = 1)
#data = pd.concat([data, pd.get_dummies(data.EQUIPMENT_COST_DIVISION_CD, prefix='Equip', drop_first=True)], axis=1).drop('EQUIPMENT_COST_DIVISION_CD', axis = 1)
data = pd.concat([data, pd.get_dummies(data.GENDER_CD, prefix='Gender', drop_first=True)], axis=1).drop('GENDER_CD', axis = 1)

test = pd.read_csv('the_test.csv').drop('EQUIPMENT_COST_DIVISION_CD', axis = 1)
#test = pd.concat([test, pd.get_dummies(test.EQUIPMENT_COST_DIVISION_CD, prefix='Equip', drop_first=True)], axis=1).drop('EQUIPMENT_COST_DIVISION_CD', axis = 1)
test = pd.concat([test, pd.get_dummies(test.GENDER_CD, prefix='Gender', drop_first=True)], axis=1).drop('GENDER_CD', axis = 1)

In [None]:
# calculate metrics for unbalanced data
## includes true positive rate, true negative rate, precision, f-measure, g-mean, and weighted accuracy

def measurements(real,pred,Name):
                 
    Confusion_Matrix = confusion_matrix(real,pred,labels = [0,1])
    TN= Confusion_Matrix[0][0]
    FN= Confusion_Matrix[1][0]
    FP= Confusion_Matrix[0][1]
    TP= Confusion_Matrix[1][1]
    
    # True Negative Rate 
    Acc_Negative= TN/(TN+FP)
    
    # True Positive Rate
    Acc_Positive = TP/(TP+FN)
    Recall = Acc_Positive
    
    # G-mean 
    G_mean = np.power(Acc_Negative * Acc_Positive, 0.5)
    
    # Precision
    Precision = TP/(TP+FP)
    
    # Weighted Accuracy
    Beta= 0.5 # Here we use equal weights for both true positive rate and true negative rate; i.e., β equals 0.5
    Weighted_Accuracy=  (Beta * Acc_Positive) + ((1-Beta)*Acc_Negative)
    
    # F-measure
    F_measure = (2 * Precision * Recall) /(Precision + Recall)
    
    performance_measures = [{"Method":Name,"Acc_Positive(Recall)":Acc_Positive, "Acc_Negative":Acc_Negative, "Precision":Precision,"F_measure":F_measure,"G_mean":G_mean,"Weighted_Accuracy":Weighted_Accuracy}]
    df = pd.DataFrame(performance_measures)
    
    return(df)

In [3]:
# independent and dependent variables 

## training set
y = data['quitIn30Days']
x = data.drop('quitIn30Days', axis = 1)

## validation set
yt = test['quitIn30Days']
xt = test.drop('quitIn30Days', axis = 1)

In [4]:
# kNN

## model
model = KNeighborsClassifier(n_neighbors=10, n_jobs = -1, metric='euclidean')

## fitted model
modelFit = model.fit(x,y)

## prediction
pred = modelFit.predict(x)

# Calculate accuracy score
accuracy_score(pred, y)

confusion_matrix(y,pred,labels = [0,1])

array([[41132,   208],
       [  803,  6396]])

In [8]:
measurements(y,pred,'kNN')

Unnamed: 0,Method,Acc_Positive(Recall),Acc_Negative,Precision,F_measure,G_mean,Weighted_Accuracy
0,kNN,0.888457,0.994969,0.968504,0.926755,0.940206,0.941713


In [9]:
# logistic

## model
model = sm.Logit(y, x)

## fitted model
modelFit = model.fit()

## prediction
pred = modelFit.predict(x)
pred[pred > 0.5] = 1
pred[pred <= 0.5] = 0


confusion_matrix(y,pred,labels = [0,1])

         Current function value: 0.365128
         Iterations: 35




array([[41109,   231],
       [ 6557,   642]])

In [10]:
measurements(y,pred,'logit')

Unnamed: 0,Method,Acc_Positive(Recall),Acc_Negative,Precision,F_measure,G_mean,Weighted_Accuracy
0,logit,0.089179,0.994412,0.735395,0.159068,0.297793,0.541796


In [9]:
# Boosting

# Generate the boosting model
model = GradientBoostingClassifier(n_estimators=100, max_depth=10, min_samples_leaf=10, random_state=42)

# Fit the model to the training data
modelFit = model.fit(x, y)

# Make predictions
pred = modelFit.predict(x)

# Confusion Matrix
print(confusion_matrix(y,pred,labels = [0,1]))

# Weighted Accuracy
measurements(y,pred,'boosting')

[[41180   160]
 [ 1181  6018]]


Unnamed: 0,Method,Acc_Positive(Recall),Acc_Negative,Precision,F_measure,G_mean,Weighted_Accuracy
0,boosting,0.835949,0.99613,0.974102,0.899753,0.912532,0.91604
