In [1]:
import pandas as pd
import numpy as np

In [77]:
X_train = pd.read_csv('data/train/x_train.csv')
X_test = pd.read_csv('data/test/x_test.csv')
y_train = pd.read_csv('data/train/y_train.csv')
y_test = pd.read_csv('data/test/y_test.csv')

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

## Training the Logistic Regression model

In [78]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=130)
lr_clf.fit(X_train, y_train)

print('Prediction of observation 5 in the test data: {}'.format(lr_clf.predict(X_test[[6]])))

Prediction of observation 5 in the test data: [0]


In [79]:
y_pred_lr = lr_clf.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

pd.DataFrame(
    confusion_matrix(y_test, y_pred_lr),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,7899,94
Actual,Positive,867,183


In [80]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lr).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_lr) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_lr) * 100))

7899 records were correctly predicted negative (true negative)
183 records were correctly predicted positive (true positives)
867 records were incorrectly predicted negative (false negative)
94 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 89.37%
Precision: 66.06%


## Training the K-NN model

In [86]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='minkowski', p=2)
knn_clf.fit(X_train, y_train)

print('Prediction of observation 5 in the test data: {}'.format(knn_clf.predict(X_test[[6]])))

Prediction of observation 5 in the test data: [0]


In [82]:
y_pred_knn = knn_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_knn),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,7811,182
Actual,Positive,852,198


In [84]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_knn).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_knn) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_knn) * 100))

7811 records were correctly predicted negative (true negative)
198 records were correctly predicted positive (true positives)
852 records were incorrectly predicted negative (false negative)
182 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 88.57%
Precision: 52.11%


## Training the SVM model

In [87]:
from sklearn.svm import SVC

svm_clf = SVC(kernel='linear', random_state=0)
svm_clf.fit(X_train, y_train)

print('Prediction of observation 5 in the test data: {}'.format(svm_clf.predict(X_test[[6]])))

Prediction of observation 5 in the test data: [0]


In [88]:
y_pred_svm = svm_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_svm),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,7883,110
Actual,Positive,846,204


In [89]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_svm).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_svm) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_svm) * 100))

7883 records were correctly predicted negative (true negative)
204 records were correctly predicted positive (true positives)
846 records were incorrectly predicted negative (false negative)
110 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 89.43%
Precision: 64.97%


## Training the Kernel SVM model

In [90]:
from sklearn.svm import SVC

ksvm_clf = SVC(kernel='rbf', random_state=0)
ksvm_clf.fit(X_train, y_train)

print('Prediction of observation 5 in the test data: {}'.format(ksvm_clf.predict(X_test[[6]])))

Prediction of observation 5 in the test data: [0]


In [91]:
y_pred_ksvm = ksvm_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_ksvm),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,7885,108
Actual,Positive,846,204


In [92]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_ksvm).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_ksvm) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_ksvm) * 100))

7885 records were correctly predicted negative (true negative)
204 records were correctly predicted positive (true positives)
846 records were incorrectly predicted negative (false negative)
108 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 89.45%
Precision: 65.38%


## Training the Naive Bayes model

In [93]:
from sklearn.naive_bayes import GaussianNB

gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)

print('Prediction of observation 5 in the test data: {}'.format(gnb_clf.predict(X_test[[6]])))

Prediction of observation 5 in the test data: [0]


In [94]:
y_pred_gnb = gnb_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_gnb),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,6691,1302
Actual,Positive,569,481


In [95]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_gnb).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_gnb) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_gnb) * 100))

6691 records were correctly predicted negative (true negative)
481 records were correctly predicted positive (true positives)
569 records were incorrectly predicted negative (false negative)
1302 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 79.31%
Precision: 26.98%


## Training the Decision Tree model

In [99]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_clf.fit(X_train, y_train)

print('Prediction of observation 5 in the test data: {}'.format(dt_clf.predict(X_test[[6]])))

Prediction of observation 5 in the test data: [0]


In [100]:
y_pred_dt = dt_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_dt),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,7203,790
Actual,Positive,676,374


In [101]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_dt).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_dt) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_dt) * 100))

7203 records were correctly predicted negative (true negative)
374 records were correctly predicted positive (true positives)
676 records were incorrectly predicted negative (false negative)
790 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 83.79%
Precision: 32.13%


## Training the Random Forest model

In [102]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rf_clf.fit(X_train, y_train)

print('Prediction of observation 5 in the test data: {}'.format(rf_clf.predict(X_test[[6]])))

Prediction of observation 5 in the test data: [0]


In [103]:
y_pred_rf = dt_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_rf),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,7203,790
Actual,Positive,676,374


In [104]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_rf) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_rf) * 100))

7203 records were correctly predicted negative (true negative)
374 records were correctly predicted positive (true positives)
676 records were incorrectly predicted negative (false negative)
790 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 83.79%
Precision: 32.13%
