# Machine Learning in Python - Predictive Modelling

### Lots of Imports

To build predictive models in Python we use a set of libraries that are imported here. **pandas** and **sklearn** are particularly important.

In [None]:
import os
import subprocess
from IPython.display import display, HTML, Image
import io
from operator import itemgetter

from TAS_Python_Utilities import data_viz
from TAS_Python_Utilities import data_viz_target
from TAS_Python_Utilities import visualize_tree

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn import metrics
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn import linear_model
from sklearn import neighbors

from xgboost import XGBClassifier

%matplotlib inline
%qtconsole

### Load & Partition Data

In [None]:
# Read the customer table from the CSV file in the working directory and
# preview its first rows.
csv_path = 'ACMETelephoneABT.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Examine the distribution of the two classes

In [None]:
# Count how many rows carry each value of the target column.
churn_labels = dataset["churn"]
churn_labels.value_counts()

In [None]:
# Summary statistics for the numeric columns.
display(dataset.select_dtypes(include=[np.number]).describe())

# Summary of the object-typed (non-numeric) columns. The `np.object` alias was
# deprecated in NumPy 1.20 and removed in 1.24, so the dtype is selected by
# its name instead.
display(dataset.select_dtypes(include=["object"]).describe())

In [None]:
# Visualise the dataset with the project helper imported from
# TAS_Python_Utilities (presumably one plot per feature — confirm against
# the helper's source).
data_viz(dataset)

In [None]:
# Visualise the dataset against the "churn" column with the project helper
# imported from TAS_Python_Utilities (presumably feature-vs-target plots —
# confirm against the helper's source).
data_viz_target(dataset, "churn")

Isolate the descriptive features we are interested in

In [None]:
# Columns used as descriptive features (model inputs).
descriptive_features = [
    'age',
    'income',
    'numHandsets',
    'handsetAge',
    'smartPhone',
    'currentHandsetPrice',
    'avgBill',
    'avgOverBundleMins',
    'avgRoamCalls',
    'callMinutesChangePct',
    'billAmountChangePct',
    'avgReceivedMins',
    'avgOutCalls',
    'avgInCalls',
    'peakOffPeakRatio',
    'peakOffPeakRatioChangePct',
    'avgDroppedCalls',
    'lifeTime',
    'newFrequentNumbers',
]

# X holds the descriptive features, Y the target column.
X = dataset[descriptive_features]
Y = dataset["churn"]

Split the data into a **training set**, a **validation set**, and a **test set**

In [None]:
# First split: 70% of the rows go to train+validation, the rest is held out
# as the test partition.
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(
    X, Y, train_size=0.7, random_state=0
)

# Second split: 0.5/0.7 of the train+validation rows become training data
# (about half of the full dataset); the remainder is the validation partition.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_plus_valid,
    y_train_plus_valid,
    train_size=0.5 / 0.7,
    random_state=0,
)

### A Very Simple Decision Tree

Train a decision tree

In [None]:
# Entropy-based decision tree with no depth limit, fitted on the training partition.
my_tree = tree.DecisionTreeClassifier(criterion="entropy")
my_tree.fit(X=X_train, y=y_train)

Visualise the decision tree so we can see what it is doing!

In [None]:
# Render the fitted tree to a PNG via the project helper, then show the image inline.
feature_names = X_train.columns.tolist()
visualize_tree(my_tree, feature_names, fileName='dt_over.png')
Image(filename='dt_over.png')

### Evaluating Model Performance

Assess the performance of the decision tree on the training set

In [None]:
# Predict labels for the rows the tree was trained on
y_pred = my_tree.predict(X_train)

# Overall accuracy followed by the classification report
accuracy = metrics.accuracy_score(y_true=y_train, y_pred=y_pred)
print("Accuracy: " + str(accuracy))

print(metrics.classification_report(y_true=y_train, y_pred=y_pred))

# Raw confusion matrix as an array
print(metrics.confusion_matrix(y_true=y_train, y_pred=y_pred))

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_train,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

Assess the performance of the tree on the validation dataset

In [None]:
# Predict labels for the validation partition
y_pred = my_tree.predict(X_valid)

# Overall accuracy followed by the classification report
accuracy = metrics.accuracy_score(y_true=y_valid, y_pred=y_pred)
print("Accuracy: " + str(accuracy))
print(metrics.classification_report(y_true=y_valid, y_pred=y_pred))

# Raw confusion matrix as an array
print(metrics.confusion_matrix(y_true=y_valid, y_pred=y_pred))

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_valid,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

Generate probability score based ROC evaluation metrics

In [None]:
# Class probabilities and hard predictions for the validation partition
y_pred_score = my_tree.predict_proba(X_valid)
y_pred = my_tree.predict(X_valid)

# Column 1 of the probability matrix is used as the ranking score
positive_scores = y_pred_score[:, 1]
print(metrics.roc_auc_score(y_valid, positive_scores))

# ROC curve points, and the area under them
fpr, tpr, thresh = metrics.roc_curve(y_valid, positive_scores)
roc_auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr)

### Alternative Trees

Train a decision tree, limiting its depth to 2

In [None]:
# Entropy-based tree capped at depth 2; fit() returns the estimator itself,
# so construction and fitting are chained.
my_tree = tree.DecisionTreeClassifier(
    criterion="entropy",
    max_depth=2,
).fit(X_train, y_train)

Assess the performance of the decision tree on the **training set**

In [None]:
# Predict labels for the rows the tree was trained on
y_pred = my_tree.predict(X_train)

# Overall accuracy followed by the classification report
accuracy = metrics.accuracy_score(y_true=y_train, y_pred=y_pred)
print("Accuracy: " + str(accuracy))
print(metrics.classification_report(y_true=y_train, y_pred=y_pred))

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_train,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

Assess the performance of the decision tree on the **validation set**

In [None]:
# Predict labels for the validation partition
y_pred = my_tree.predict(X_valid)

# Overall accuracy followed by the classification report
accuracy = metrics.accuracy_score(y_true=y_valid, y_pred=y_pred)
print("Accuracy: " + str(accuracy))
print(metrics.classification_report(y_true=y_valid, y_pred=y_pred))

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_valid,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

Generate probability score based ROC evaluation metrics

In [None]:
# Class probabilities for the validation partition
y_pred_score = my_tree.predict_proba(X_valid)

# Column 1 of the probability matrix is used as the ranking score
positive_scores = y_pred_score[:, 1]
print(metrics.roc_auc_score(y_valid, positive_scores))

# ROC curve points, and the area under them
fpr, tpr, thresh = metrics.roc_curve(y_valid, positive_scores)
roc_auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr)

Visualise the decision tree so we can see what it is doing!

In [None]:
# Render the depth-limited tree to a PNG via the project helper, then show it inline.
feature_names = X_train.columns.tolist()
visualize_tree(my_tree, feature_names, fileName="dt_under.png")
Image(filename='dt_under.png')

## Perform a Cross Validation Experiment

Use cross validation to perform an evaluation

In [None]:
# 10-fold cross validation of a depth-12 tree on the combined
# training + validation data (the test partition is not used here).
my_tree = tree.DecisionTreeClassifier(max_depth = 12)
scores = cross_val_score(my_tree, X_train_plus_valid, y_train_plus_valid, cv=10)
print(scores)

# Wrap the per-fold scores in a Series to report their mean and standard deviation.
scores = pd.Series(scores)
# The separator is "+/-"; the previous " =/- " was a typo in the printed output.
print(scores.mean(), " +/- ", scores.std())

## Choosing Parameters Using a Grid Search

In [None]:
# Fit on the combined training + validation data and show the class labels
# the tree learned; fit() returns the estimator, so the attribute is read
# directly from its result.
my_tree.fit(X_train_plus_valid, y_train_plus_valid).classes_

An alternative to using post pruning explicitly is to use a grid search through a set of possible parameters. Here we try both split criteria and maximum depths from 3 to 18 (in steps of 3), with the minimum number of samples per split fixed at 50.

In [None]:
# Set up the parameter grid to search
param_grid = dict(
    criterion=['gini', "entropy"],
    max_depth=list(range(3, 20, 3)),
    min_samples_split=[50],
)

# Perform the search with 2-fold cross validation on the development
# (training + validation) data, also recording training scores
my_tuned_tree = GridSearchCV(
    tree.DecisionTreeClassifier(),
    param_grid,
    cv=2,
    verbose=0,
    return_train_score=True,
)
my_tuned_tree.fit(X_train_plus_valid, y_train_plus_valid)

# Print details: best parameters, best score, then the full results table
print("Best parameters set found on development set:")
display(
    my_tuned_tree.best_params_,
    my_tuned_tree.best_score_,
    my_tuned_tree.cv_results_,
)

In [None]:
# Predict labels for the held-out test partition with the tuned tree
y_pred = my_tuned_tree.predict(X_test)

# Overall accuracy followed by the classification report
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: " + str(accuracy))
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

In [None]:
# Take the tree selected by the grid search instead of re-typing its
# hyper-parameters by hand: hard-coded values can disagree with best_params_
# (max_depth=8, used previously, is not even in the searched grid of
# 3, 6, ..., 18). GridSearchCV was created with the default refit=True, so
# best_estimator_ is already fitted on the data passed to its fit() call.
best_tree = my_tuned_tree.best_estimator_

# visualise the decision tree
feature_names = list(X_train_plus_valid.columns)
# fileName is passed by keyword, matching the other visualize_tree calls.
visualize_tree(best_tree, feature_names, fileName='dt_tuned.png')
Image(filename='dt_tuned.png')

### Final Evaluation on Test Set

Evaluate the tuned model on the held-out test set (note that the split above is random, not stratified)

In [None]:
# Predict labels for the held-out test partition with the tuned tree
y_pred = my_tuned_tree.predict(X_test)

# Classification report for the test predictions
test_report = metrics.classification_report(y_test, y_pred)
print(test_report)

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

### Other Models

We can easily use the same patterns to train other types of models.

#### Random Forests

In [None]:
# Random forest: 300 trees, 3 candidate features per split, and at least
# 200 samples required before a node may be split.
forest_settings = dict(n_estimators=300, max_features=3, min_samples_split=200)
my_model = ensemble.RandomForestClassifier(**forest_settings)
my_model.fit(X=X_train, y=y_train)

Assess the performance of the model on the **validation set**

In [None]:
# Predict labels for the validation partition
y_pred = my_model.predict(X_valid)

# Classification report for the validation predictions
validation_report = metrics.classification_report(y_valid, y_pred)
print(validation_report)

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_valid,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

Generate probability score based ROC evaluation metrics

In [None]:
# Class probabilities for the validation partition
y_pred_score = my_model.predict_proba(X_valid)

# Column 1 of the probability matrix is used as the ranking score
positive_scores = y_pred_score[:, 1]
print(metrics.roc_auc_score(y_valid, positive_scores))

# ROC curve points, and the area under them
fpr, tpr, thresh = metrics.roc_curve(y_valid, positive_scores)
roc_auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr)

#### Bagging

In [None]:
# Do the same job with bagging: 10 entropy-based trees, each requiring at
# least 50 samples per leaf.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the positional form works before and after that change.
bagging_base_tree = tree.DecisionTreeClassifier(criterion="entropy", min_samples_leaf=50)
my_model = ensemble.BaggingClassifier(bagging_base_tree, n_estimators=10)
my_model.fit(X_train, y_train)

In [None]:
# Predict labels for the validation partition
y_pred = my_model.predict(X_valid)

# Classification report for the validation predictions
validation_report = metrics.classification_report(y_valid, y_pred)
print(validation_report)

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_valid,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

Generate probability score based ROC evaluation metrics

In [None]:
# Class probabilities for the validation partition
y_pred_score = my_model.predict_proba(X_valid)

# Column 1 of the probability matrix is used as the ranking score
positive_scores = y_pred_score[:, 1]
print(metrics.roc_auc_score(y_valid, positive_scores))

# ROC curve points, and the area under them
fpr, tpr, thresh = metrics.roc_curve(y_valid, positive_scores)
roc_auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr)

#### AdaBoost

In [None]:
# Do the same job with AdaBoost: 10 boosting rounds over entropy-based trees
# that require at least 50 samples per leaf.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the positional form works before and after that change.
boosting_base_tree = tree.DecisionTreeClassifier(criterion="entropy", min_samples_leaf=50)
my_model = ensemble.AdaBoostClassifier(boosting_base_tree, n_estimators=10)
my_model.fit(X_train, y_train)

In [None]:
# Predict labels for the validation partition
y_pred = my_model.predict(X_valid)

# Classification report for the validation predictions
validation_report = metrics.classification_report(y_valid, y_pred)
print(validation_report)

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_valid,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

Generate probability score based ROC evaluation metrics

In [None]:
# Class probabilities for the validation partition
y_pred_score = my_model.predict_proba(X_valid)

# Column 1 of the probability matrix is used as the ranking score
positive_scores = y_pred_score[:, 1]
print(metrics.roc_auc_score(y_valid, positive_scores))

# ROC curve points, and the area under them
fpr, tpr, thresh = metrics.roc_curve(y_valid, positive_scores)
roc_auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr)

#### Logistic Regression

In [None]:
# Logistic regression with the library's default settings, fitted on the
# training partition.
my_model = linear_model.LogisticRegression()
my_model.fit(X=X_train, y=y_train)

Assess the performance of the model on the **validation set**

In [None]:
# Predict labels for the validation partition
y_pred = my_model.predict(X_valid)

# Classification report for the validation predictions
validation_report = metrics.classification_report(y_valid, y_pred)
print(validation_report)

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_valid,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

Generate probability score based ROC evaluation metrics

In [None]:
# Class probabilities for the validation partition
y_pred_score = my_model.predict_proba(X_valid)

# Column 1 of the probability matrix is used as the ranking score
positive_scores = y_pred_score[:, 1]
print(metrics.roc_auc_score(y_valid, positive_scores))

# ROC curve points, and the area under them
fpr, tpr, thresh = metrics.roc_curve(y_valid, positive_scores)
roc_auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr)

#### Nearest Neighbour

In [None]:
# k-nearest-neighbour classifier with the library's default settings;
# fit() returns the estimator itself, so construction and fitting are chained.
my_model = neighbors.KNeighborsClassifier().fit(X_train, y_train)

Assess the performance of the model on the **validation set**

In [None]:
# Predict labels for the validation partition
y_pred = my_model.predict(X_valid)

# Classification report for the validation predictions
validation_report = metrics.classification_report(y_valid, y_pred)
print(validation_report)

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_valid,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

Generate probability score based ROC evaluation metrics

In [None]:
# Class probabilities for the validation partition
y_pred_score = my_model.predict_proba(X_valid)

# Column 1 of the probability matrix is used as the ranking score
positive_scores = y_pred_score[:, 1]
print(metrics.roc_auc_score(y_valid, positive_scores))

# ROC curve points, and the area under them
fpr, tpr, thresh = metrics.roc_curve(y_valid, positive_scores)
roc_auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr)

#### XGBoost

In [None]:
# XGBoost classifier with the library's default settings;
# fit() returns the estimator itself, so construction and fitting are chained.
my_model = XGBClassifier().fit(X_train, y_train)

Assess the performance of the model on the **validation set**

In [None]:
# Predict labels for the validation partition
y_pred = my_model.predict(X_valid)

# Classification report for the validation predictions
validation_report = metrics.classification_report(y_valid, y_pred)
print(validation_report)

# Labelled confusion matrix with margins, shown as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=y_valid,
    columns=y_pred,
    rownames=["True"],
    colnames=["Predicted"],
    margins=True,
)

Generate probability score based ROC evaluation metrics

In [None]:
# Class probabilities for the validation partition
y_pred_score = my_model.predict_proba(X_valid)

# Column 1 of the probability matrix is used as the ranking score
positive_scores = y_pred_score[:, 1]
print(metrics.roc_auc_score(y_valid, positive_scores))

# ROC curve points, and the area under them
fpr, tpr, thresh = metrics.roc_curve(y_valid, positive_scores)
roc_auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr)