In [1]:
import pandas as pd
import numpy as np

## Precision or Recall?

It is important to determine what measure of accuracy we want to use to validate our model. In our case I am going to choose to use precision.

Why?

Well, lets look at what precision measures. We would be looking at the number of observations that have been correctly subscribed against all of the predicted subscribed values <b>(total Y / total Y obs)</b>, i.e. of all the observations that were predicted as Y, which ones were correct?

If we were looking at recall, we would look at <b>total Y correct / total Y</b>, meaning we would also consider the observations that were predicted as Y but were actually N.

We don't particularly care if we incorrectly classify a Y because if we predict them to be a N and they turn out to be a Y, then happy days, but what we really care about are the clients that were predicted to be a Y but turned out to be a N because this will affect future planning for our revenue figures.

## Working with Standardized data

In [2]:
X_train = pd.read_csv('data/train/x_train_stand.csv').values
y_train = pd.read_csv('data/train/y_train_stand.csv').values.flatten()

X_test = pd.read_csv('data/test/x_test_stand.csv').values
y_test = pd.read_csv('data/test/y_test_stand.csv').values.flatten()

### Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=130)
lr_clf.fit(X_train, y_train)

y_pred_lr = lr_clf.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

pd.DataFrame(
    confusion_matrix(y_test, y_pred_lr),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,4993,3000
Actual,Positive,385,665


In [18]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lr).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_lr) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_lr) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred_lr) * 100))

4993 records were correctly predicted negative (true negative)
665 records were correctly predicted positive (true positives)
385 records were incorrectly predicted negative (false negative)
3000 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 62.57%
Precision: 18.14%
Recall: 63.33%


### K-NN

In [5]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='minkowski', p=2)
knn_clf.fit(X_train, y_train)

y_pred_knn = knn_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_knn),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,7053,940
Actual,Positive,714,336


In [19]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_knn).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_knn) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_knn) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred_knn) * 100))

7053 records were correctly predicted negative (true negative)
336 records were correctly predicted positive (true positives)
714 records were incorrectly predicted negative (false negative)
940 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 81.71%
Precision: 26.33%
Recall: 32.00%


### SVM

In [7]:
from sklearn.svm import SVC

svm_clf = SVC(kernel='linear', random_state=0)
svm_clf.fit(X_train, y_train)

y_pred_svm = svm_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_svm),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,4678,3315
Actual,Positive,379,671


In [20]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_svm).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_svm) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_svm) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred_svm) * 100))

4678 records were correctly predicted negative (true negative)
671 records were correctly predicted positive (true positives)
379 records were incorrectly predicted negative (false negative)
3315 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 59.15%
Precision: 16.83%
Recall: 63.90%


## Kernel SVM

In [9]:
from sklearn.svm import SVC

ksvm_clf = SVC(kernel='rbf', random_state=0)
ksvm_clf.fit(X_train, y_train)

y_pred_ksvm = ksvm_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_ksvm),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,7560,433
Actual,Positive,844,206


In [21]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_ksvm).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_ksvm) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_ksvm, zero_division=0) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred_ksvm) * 100))

7560 records were correctly predicted negative (true negative)
206 records were correctly predicted positive (true positives)
844 records were incorrectly predicted negative (false negative)
433 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 85.88%
Precision: 32.24%
Recall: 19.62%


## Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)

y_pred_gnb = gnb_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_gnb),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,4331,3662
Actual,Positive,370,680


In [22]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_gnb).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_gnb) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_gnb) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred_gnb) * 100))

4331 records were correctly predicted negative (true negative)
680 records were correctly predicted positive (true positives)
370 records were incorrectly predicted negative (false negative)
3662 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 55.41%
Precision: 15.66%
Recall: 64.76%


## Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_clf.fit(X_train, y_train)

y_pred_dt = dt_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_dt),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,7220,773
Actual,Positive,701,349


In [23]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_dt).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_dt) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_dt) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred_dt) * 100))

7220 records were correctly predicted negative (true negative)
349 records were correctly predicted positive (true positives)
701 records were incorrectly predicted negative (false negative)
773 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 83.70%
Precision: 31.11%
Recall: 33.24%


## Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rf_clf.fit(X_train, y_train)

y_pred_rf = dt_clf.predict(X_test)

pd.DataFrame(
    confusion_matrix(y_test, y_pred_rf),
    columns=pd.MultiIndex.from_product([['Prediction'], ['Negative', 'Positive']]),
    index=pd.MultiIndex.from_product([['Actual'], ['Negative', 'Positive']])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Negative,Positive
Actual,Negative,7220,773
Actual,Positive,701,349


In [24]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
print('{} records were correctly predicted negative (true negative)'.format(tn))
print('{} records were correctly predicted positive (true positives)'.format(tp))
print('{} records were incorrectly predicted negative (false negative)'.format(fn))
print('{} records were incorrectly predicted positive (false positive)'.format(fp))
print('-----------------------------')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred_rf) * 100))
print('Precision: {:.2f}%'.format(precision_score(y_test, y_pred_rf) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred_rf) * 100))

7220 records were correctly predicted negative (true negative)
349 records were correctly predicted positive (true positives)
701 records were incorrectly predicted negative (false negative)
773 records were incorrectly predicted positive (false positive)
-----------------------------
Accuracy: 83.70%
Precision: 31.11%
Recall: 33.24%


The results have not differed much between the two methods of scaling. The best performing models remain to be:
- Decision Tree
- Random Forest

<b>How can I amend the threshold of my model to focus on precision?</b>