In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files

In [0]:
uploaded = files.upload()

Saving finalCleaned3.csv to finalCleaned3.csv


In [0]:
features = pd.read_csv('finalCleaned3.csv')

In [0]:
features.head()

In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

In [0]:
features = features.drop(columns=['user_session','USA','CHN','SK','JPN','UK','NED','FIN','SPN','RUS','CYP','TAI','CAN','IND','MAL','BRZ','user_id'])

In [0]:
features['colPurchase'].value_counts()

0    1385563
1     134196
Name: colPurchase, dtype: int64

In [0]:
#xiaomi, apple, samsung, huawei, oppo, vivo, meizu, tp-link, nokia

In [0]:
#rebalance data set using sklearn resample
df_majority = features[features.colPurchase==0] #majority of colPurchase column
df_minority = features[features.colPurchase==1] #minority of colPurchase column

In [0]:
#resample data to have minority class equal majority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=1385563,    # to match majority class
                                 random_state=123) # reproducible results

In [0]:
#concatenate df_majority and df_minority_upsampled together after rebalance
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [0]:
#verifiction of resampling; 0 and 1 count now equal; data rebalanced
df_upsampled.colPurchase.value_counts()

1    1385563
0    1385563
Name: colPurchase, dtype: int64

In [0]:
#reassign the resampled data back into the features variable
features = df_upsampled

In [0]:
#column transformer

numeric_features = ['price']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['brand']

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [0]:
pip install eli5

In [0]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt

In [0]:
classifiers = [
    #DecisionTreeClassifier(),
    #KNeighborsClassifier(),
    #RandomForestClassifier(n_estimators = 100, verbose=3, n_jobs=-1, max_depth=20, random_state=42),
    #MLPClassifier()
    #XGBClassifier(),
    LogisticRegression(max_iter=4000)
]

In [0]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('clf', None)])

In [0]:
# y are the values we want to predict
y = np.array(features['colPurchase'])
# Remove the labels from the features
# axis 1 refers to the columns
X = features.drop('colPurchase', axis = 1)
# Saving feature names for later use
X_list = list(X.columns)

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [0]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (2216900, 11)
Training Labels Shape: (2216900,)
Testing Features Shape: (554226, 11)
Testing Labels Shape: (554226,)


In [0]:
roc_things = []
precision_recall_things = []

In [0]:
def get_transformer_feature_names(columnTransformer):
  output_features = []

  for name, pipe, features in columnTransformer.transformers_:
    if name!='remainder':
      for i in pipe:
        trans_features = []
        if hasattr(i, 'categories_'):
          trans_features.extend(i.get_feature_names(features))
        else:
          trans_features = features
      output_features.extend(trans_features)

  return np.array(output_features)

In [0]:
for classifier in classifiers:
    clf.set_params(clf=classifier).fit(X_train, y_train)
    classifier_name = classifier.__class__.__name__
    print(str(classifier))

    y_score = clf.predict_proba(X_test)[:,1]

    y_pred = clf.predict(X_test)
    
    roc_auc = roc_auc_score(y_test, y_score)
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_things.append((fpr, tpr, '{} AUC: {:.3f}'.format(classifier_name, roc_auc)))
    
    precision, recall, thresholds = precision_recall_curve(y_test, y_score)
    pr_auc = auc(recall, precision)
    precision_recall_things.append((recall, precision, thresholds, '{} AUC: {:.3f}'.format(classifier_name, pr_auc)))
    #plot_precision_recall_curve(clf, X_test, y_test)
    
    feature_names = get_transformer_feature_names(clf.named_steps['preprocessor'])
    
    try:
      importances = classifier.feature_importances_
      indices = np.argsort(importances)[::-1]
      print('~Feature Ranking:')

      for f in range (X_test.shape[1]):
        print ('{}. {} {} ({:.3f})'.format(f + 1, feature_names[indices[f]], indices[f], importances[indices[f]]))
    except:
      pass

    print('~Model Score: %.3f' % clf.score(X_test, y_test))

    scores = cross_val_score(clf, X, y, cv=5)
    print('~Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

    print('~Confusion Matrix:''\n',
    confusion_matrix(y_test, y_pred))
    print('~Classification Report:''\n',
    classification_report(y_test, y_pred,labels=np.unique(y_pred)))
   
    print('~Average Precision Score: {:.3f}'.format(average_precision_score(y_test, y_score)))
    print('~roc_auc_score: {:.3f}'.format(roc_auc))
    print('~precision-recall AUC: {:.3f}'.format(pr_auc))
    print()

In [0]:
roc_plt = plt.figure()
lw = 4
for roc_thing in roc_things:
    fpr, tpr, label = roc_thing
    plt.plot(fpr, tpr, lw=lw, label=label)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.legend()
plt.title('ROC curve')

In [0]:
pr_plt = plt.figure()
for pr_thing in precision_recall_things:
    recall, precision, _, label = pr_thing
    plt.plot(recall, precision, lw=lw, label=label)
ratio = y_test[y_test].shape[0] / y_test.shape[0]
plt.hlines(y=ratio, xmin=0, xmax=1, color='navy', lw=lw, linestyle='--')
plt.title('Precision-recall plot')
plt.legend()

In [0]:
from scipy.stats import hmean
import numpy.ma as ma

recall, precision, thresholds, _ = precision_recall_things[1]

a = np.column_stack((recall,precision))

a = ma.masked_less_equal(a, 0)
a = ma.mask_rows(a)
f1 = hmean(a,axis=1)

threshold_that_maximizes_f1 = thresholds[np.argmax(f1)]
print('threshold that optimizes f1: {}'.format(threshold_that_maximizes_f1))