In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

pd.set_option('display.precision', 3)

# Extra imports
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import graphviz

from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,GradientBoostingClassifier,StackingClassifier,ExtraTreesClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB

from time import time
from datetime import timedelta

import seaborn as sns
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

np.random.seed(1000)

In [4]:
def confusion(true, pred):
    """
    Function for pretty printing confusion matrices
    """
    pred = pd.Series(pred)
    true = pd.Series(true)
    
    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    cm = cm[cm.index]
    return cm

In [5]:
datasetTrain = pd.read_csv('../Dades/X_train_modified.csv')
datasetTest  = pd.read_csv('../Dades/X_test_modified.csv')

datasetTrain['Installs'] = datasetTrain['Installs'].astype('object')
datasetTest['Installs'] = datasetTest['Installs'].astype('object')

columsToDrop = ['Maximum Installs', 'Price', 'Size', 'Download', 'Last Updated', 'ModInstalls', 'ModMaximumInstalls', 'Rating', 'ModExit']
categoricalColumns = ['Installs', 'Category', 'Free', 'Content Rating', 'Ad Supported', 'In App Purchases', 'Editors Choice']

'''
# Convert to categorical variables
for column in categoricalColumns:
    datasetTrain[column] = datasetTrain[column].astype('category')
    datasetTest[column] = datasetTest[column].astype('category')
'''
# Drop repeated or unwanted data (Installs is the catageorial version of "Maximum installs")
datasetTrain = datasetTrain.drop(columns=columsToDrop)
datasetTest  = datasetTest.drop(columns=columsToDrop)

datasetTrain = datasetTrain.drop(columns=['Exit'])
datasetTest = datasetTest.drop(columns=['Exit'])

datasetTrain.dtypes

Category             object
Free                   bool
Released            float64
Content Rating       object
Ad Supported           bool
In App Purchases       bool
Editors Choice         bool
Installs             object
ModRating           float64
ModPrice            float64
ModSize             float64
ModLast Updated     float64
dtype: object

In [6]:
 datasetTrain['Installs'].unique()

array([10000.0, 10000000.0, 100000.0, 50000.0, 500.0, 1000.0, 5000.0,
       1000000.0, 500000.0, 100.0, 5000000.0, 50000000.0, 100000000.0,
       1000000000.0, 500000000.0], dtype=object)

In [8]:
Xtrain = datasetTrain.loc[:, datasetTrain.columns != 'Installs']
Ytrain = datasetTrain['Installs']

Xtest = datasetTest.loc[:, datasetTest.columns != 'Installs']
Ytest = datasetTest['Installs']

In [9]:
for column in Xtrain.columns:
        if Xtrain[column].dtype.kind == 'O':
            Xtrain_one_hot = pd.get_dummies(Xtrain[column], prefix=column)
            Xtrain = Xtrain.merge(Xtrain_one_hot,left_index=True,right_index=True)
            Xtrain = Xtrain.drop(columns=[column])
            
for column in Xtest.columns:
        if Xtest[column].dtype.kind == 'O':
            Xtest_one_hot = pd.get_dummies(Xtest[column], prefix=column)
            Xtest = Xtest.merge(Xtest_one_hot,left_index=True,right_index=True)
            Xtest = Xtest.drop(columns=[column])
            
print(Ytrain.head())


0       +10000.0
1       +10000.0
2    +10000000.0
3      +100000.0
4       +50000.0
Name: Installs, dtype: object


For the moment, we won't consider that there is a class imbalance

The error is symmetric, we will consider as our metric the F1-score macro average

In [10]:
def compute_metrics(y_true,y_pred):
    accuracy = accuracy_score(y_true,y_pred)
    f1_score_macro = f1_score(y_true,y_pred,average='macro')
    return [accuracy,f1_score_macro]

results = pd.DataFrame(columns=['Accuracy', 'F1-score (macro avg)'])


We split the training data to create a validation set

In [11]:
Xtrain, Xval, Ytrain, Yval = train_test_split(Xtrain, Ytrain, test_size=0.25, stratify=Ytrain, random_state=1)

## DecisionTreeClassifier

In [11]:
model_tree = DecisionTreeClassifier().fit(Xtrain, Ytrain)

In [12]:
Ypredtrain = model_tree.predict(Xtrain)

results.loc['DT-default',:] = compute_metrics(Ytrain, Ypredtrain)

confusion(Ytrain, Ypredtrain)

predicted,+100.0,+1000.0,+10000.0,+100000.0,+1000000.0,+10000000.0,+100000000.0,+1000000000.0,+500.0,+5000.0,+50000.0,+500000.0,+5000000.0,+50000000.0,+500000000.0
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100.0,3956,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000.0,1,33350,2,0,0,0,0,0,0,3,0,0,0,0,0
10000.0,0,16,57486,1,0,0,0,0,0,1,1,0,0,0,0
100000.0,0,0,6,25801,0,0,0,0,0,0,1,0,0,0,0
1000000.0,0,0,0,1,8295,0,0,0,0,0,0,0,0,0,0
10000000.0,0,0,0,0,0,1532,0,0,0,0,0,0,0,0,0
100000000.0,0,0,0,0,0,0,87,0,0,0,0,0,0,0,0
1000000000.0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0
500.0,3,2,0,0,0,0,0,0,4844,0,0,0,0,0,0
5000.0,0,27,33,0,0,0,0,0,3,25834,0,0,0,0,0


In [13]:
print(classification_report(Ytrain, Ypredtrain, target_names=['1000000.0', '100000.0', '10000.0', '50000.0', '5000.0', '500000.0', '1000.0', '500.0', '100.0', '5000000.0', '50000000.0', '10000000.0', '100000000.0', '1000000000.0', '500000000.0'],))

results

              precision    recall  f1-score   support

   1000000.0       1.00      1.00      1.00      3956
    100000.0       1.00      1.00      1.00     33356
     10000.0       1.00      1.00      1.00     57505
     50000.0       1.00      1.00      1.00     25808
      5000.0       1.00      1.00      1.00      8296
    500000.0       1.00      1.00      1.00      1532
      1000.0       1.00      1.00      1.00        87
       500.0       1.00      1.00      1.00         3
       100.0       1.00      1.00      1.00      4849
   5000000.0       1.00      1.00      1.00     25897
  50000000.0       1.00      1.00      1.00     17347
  10000000.0       1.00      1.00      1.00      6568
 100000000.0       1.00      1.00      1.00      1646
1000000000.0       1.00      1.00      1.00       179
 500000000.0       1.00      1.00      1.00         5

    accuracy                           1.00    187034
   macro avg       1.00      1.00      1.00    187034
weighted avg       1.00   

Unnamed: 0,Accuracy,F1-score (macro avg)
DT-default,0.999,1.0


In [14]:
Ypred = model_tree.predict(Xval)

results.loc['DT-default',:] = compute_metrics(Yval, Ypred)


In [15]:
print(classification_report(Yval, Ypred, target_names=['1000000.0', '100000.0', '10000.0', '50000.0', '5000.0', '500000.0', '1000.0', '500.0', '100.0', '5000000.0', '50000000.0', '10000000.0', '100000000.0', '1000000000.0', '500000000.0'],))

results

              precision    recall  f1-score   support

   1000000.0       0.28      0.27      0.27      1318
    100000.0       0.36      0.36      0.36     11119
     10000.0       0.41      0.41      0.41     19169
     50000.0       0.35      0.35      0.35      8603
      5000.0       0.19      0.19      0.19      2765
    500000.0       0.11      0.11      0.11       511
      1000.0       0.03      0.03      0.03        29
       500.0       0.00      0.00      0.00         1
       100.0       0.13      0.12      0.13      1616
   5000000.0       0.26      0.26      0.26      8632
  50000000.0       0.25      0.26      0.25      5783
  10000000.0       0.12      0.12      0.12      2189
 100000000.0       0.07      0.06      0.06       548
1000000000.0       0.03      0.03      0.03        60
 500000000.0       0.00      0.00      0.00         2

    accuracy                           0.32     62345
   macro avg       0.17      0.17      0.17     62345
weighted avg       0.32   

Unnamed: 0,Accuracy,F1-score (macro avg)
DT-default,0.322,0.173


It's clearly overfitting with the initial hyper parameters, let's change them

In [16]:
# Too slow
'''
criterion = ['gini', 'entropy']

max_dephts = [None, 5, 10, 15, 20]
min_samples_split = [1, 2, 3, 4, 5]
min_samples_leaf = [1, 2, 3, 4, 5]
max_features = ['auto', 'sqrt', 'log2', None]

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

init_time = time()
model_tree = DecisionTreeClassifier()


scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}

trc = GridSearchCV(estimator=model_tree,
                   scoring=scoring_dict,
                   param_grid={
                       'criterion': criterion,
                       'max_depth': max_dephts,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'max_features': max_features
                   },
                   cv=5,
                   return_train_score=False,
                   refit='f1_mac')

model_5CV = trc.fit(Xtrain, Ytrain)
print(timedelta(seconds=(time() - init_time)))
'''

"\ncriterion = ['gini', 'entropy']\n\nmax_dephts = [None, 5, 10, 15, 20]\nmin_samples_split = [1, 2, 3, 4, 5]\nmin_samples_leaf = [1, 2, 3, 4, 5]\nmax_features = ['auto', 'sqrt', 'log2', None]\n\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import make_scorer\n\ninit_time = time()\nmodel_tree = DecisionTreeClassifier()\n\n\nscoring_dict = {\n    'f1_mac': 'f1_macro',\n    'acc': 'accuracy'\n}\n\ntrc = GridSearchCV(estimator=model_tree,\n                   scoring=scoring_dict,\n                   param_grid={\n                       'criterion': criterion,\n                       'max_depth': max_dephts,\n                       'min_samples_split': min_samples_split,\n                       'min_samples_leaf': min_samples_leaf,\n                       'max_features': max_features\n                   },\n                   cv=5,\n                   return_train_score=False,\n                   refit='f1_mac')\n\nmodel_5CV = trc.fit(Xtrain, Ytrain)\nprint(timede

In [17]:
'''
scoring_cols = [
    'param_criterion', 'param_max_depth', 'param_max_features',
    'param_min_samples_leaf', 'param_min_samples_split', 'mean_test_f1_mac',
    'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_acc'
]

pd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac', ascending=False)[scoring_cols].head()
'''

"\nscoring_cols = [\n    'param_criterion', 'param_max_depth', 'param_max_features',\n    'param_min_samples_leaf', 'param_min_samples_split', 'mean_test_f1_mac',\n    'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_acc'\n]\n\npd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac', ascending=False)[scoring_cols].head()\n"

Since we still have an overfitting problem, let's take a look at random forest. Random Forest is an ensemble of Decision Trees; the idea is that by averaging high-variance but decorrelated individual decision trees we will avoid their tendency to overfitting.

In [13]:
model_rf1 = RandomForestClassifier(oob_score=True).fit(Xtrain, Ytrain)

pred = model_rf1.predict(Xtrain)

print(classification_report(Ytrain, pred, target_names=['1000000.0', '100000.0', '10000.0', '50000.0', '5000.0', '500000.0', '1000.0', '500.0', '100.0', '5000000.0', '50000000.0', '10000000.0', '100000000.0', '1000000000.0', '500000000.0'],))
print('OOB accuracy=', model_rf1.oob_score_)


              precision    recall  f1-score   support

   1000000.0       1.00      1.00      1.00      3956
    100000.0       1.00      1.00      1.00     33356
     10000.0       1.00      1.00      1.00     57505
     50000.0       1.00      1.00      1.00     25808
      5000.0       1.00      1.00      1.00      8296
    500000.0       1.00      1.00      1.00      1532
      1000.0       1.00      1.00      1.00        87
       500.0       1.00      1.00      1.00         3
       100.0       1.00      1.00      1.00      4849
   5000000.0       1.00      1.00      1.00     25897
  50000000.0       1.00      1.00      1.00     17347
  10000000.0       1.00      1.00      1.00      6568
 100000000.0       1.00      1.00      1.00      1646
1000000000.0       1.00      1.00      1.00       179
 500000000.0       1.00      1.00      1.00         5

    accuracy                           1.00    187034
   macro avg       1.00      1.00      1.00    187034
weighted avg       1.00   

In [None]:
Ypred = model_rf1.predict(Xval)
print('Validation Accuracy:{}'.format(model_rf1.score(Xval,Yval)))
results.loc['RF-default',:] = compute_metrics(Yval,Ypred)
results

We are still overfitting our model by a lot

In [None]:
model_rf2 = RandomForestClassifier(n_estimators=100, 
                                   oob_score=True, 
                                   class_weight='balanced').fit(Xtrain, Ytrain)

pred = model_rf2.predict(Xtrain)

confusion(Ytrain, pred)
print(classification_report(Ytrain, pred, target_names=['1000000.0', '100000.0', '10000.0', '50000.0', '5000.0', '500000.0', '1000.0', '500.0', '100.0', '5000000.0', '50000000.0', '10000000.0', '100000000.0', '1000000000.0', '500000000.0'],))
print('OOB accuracy=', model_rf2.oob_score_)

In [None]:
Ypred = model_rf2.predict(Xval)

results.loc['RF-balance',:] = compute_metrics(Yval,Ypred)
results.sort_values(by='F1-score (macro avg)', ascending=False)

After a class balance, the original random forest performs better, this makes sense since classes are already balanced enough.

In [None]:
Ytest.value_counts()

Definetly, we should try to change hyper parameters, trhoughout all this models, we have faced the same problem overfitting, this can be solve with a hyperparameter search.

Notice that the model computations takes too long (about a minute for each model) thus, we will constraint our seach to few possibilities.

Also, we will cut our dataset to 3000 samples.

In [None]:
XtrainCut = Xtrain[:3000]
YtrainCut = Ytrain[:3000]

In [None]:
'''init_time = time()

rf_model = RandomForestClassifier()

ntrees = [200, None]
max_depth = [100,None]
min_samples_split = [4,6]
min_samples_leaf = [4,6]
balance = [None, 'balanced', 'balanced_subsample']

scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}


trc = GridSearchCV(estimator=rf_model,
                   scoring=scoring_dict,
                   param_grid={
                       'n_estimators': ntrees,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'min_samples_leaf':min_samples_leaf, 
                       'class_weight':balance
                   },
                   cv=5,
                   return_train_score=False,
                   refit=False,
                   n_jobs=-1)

model_5CV = trc.fit(XtrainCut, YtrainCut)
print(timedelta(seconds=(time() - init_time)))
'''

We will also try to do a random forest but for a binary variable which explains if the app was a success

In [15]:
datasetTrain = pd.read_csv('../Dades/X_train_modified.csv')
datasetTest  = pd.read_csv('../Dades/X_test_modified.csv')

datasetTrain['Installs'] = datasetTrain['Installs'].astype('object')
datasetTest['Installs'] = datasetTest['Installs'].astype('object')

columsToDrop = ['Maximum Installs', 'Price', 'Size', 'Download', 'Last Updated', 'ModInstalls', 'ModMaximumInstalls', 'Rating', 'ModExit']
categoricalColumns = ['Installs', 'Category', 'Free', 'Content Rating', 'Ad Supported', 'In App Purchases', 'Editors Choice']

'''
# Convert to categorical variables
for column in categoricalColumns:
    datasetTrain[column] = datasetTrain[column].astype('category')
    datasetTest[column] = datasetTest[column].astype('category')
'''
# Drop repeated or unwanted data (Installs is the catageorial version of "Maximum installs")
datasetTrain = datasetTrain.drop(columns=columsToDrop)
datasetTest  = datasetTest.drop(columns=columsToDrop)

datasetTrain = datasetTrain.drop(columns=['Installs'])
datasetTest = datasetTest.drop(columns=['Installs'])

datasetTrain.columns

Index(['Category', 'Free', 'Released', 'Content Rating', 'Ad Supported',
       'In App Purchases', 'Editors Choice', 'Exit', 'ModRating', 'ModPrice',
       'ModSize', 'ModLast Updated'],
      dtype='object')

In [19]:
datasetTrain['Exit'].unique()

array([False,  True])

In [20]:
Xtrain = datasetTrain.loc[:, datasetTrain.columns != 'Exit']
Ytrain = datasetTrain['Exit']

Xtest = datasetTest.loc[:, datasetTest.columns != 'Exit']
Ytest = datasetTest['Exit']

In [21]:
for column in Xtrain.columns:
        if Xtrain[column].dtype.kind == 'O':
            Xtrain_one_hot = pd.get_dummies(Xtrain[column], prefix=column)
            Xtrain = Xtrain.merge(Xtrain_one_hot,left_index=True,right_index=True)
            Xtrain = Xtrain.drop(columns=[column])
            
for column in Xtest.columns:
        if Xtest[column].dtype.kind == 'O':
            Xtest_one_hot = pd.get_dummies(Xtest[column], prefix=column)
            Xtest = Xtest.merge(Xtest_one_hot,left_index=True,right_index=True)
            Xtest = Xtest.drop(columns=[column])
            
print(Ytrain.head())

0    False
1    False
2     True
3    False
4    False
Name: Exit, dtype: bool


In [22]:
Xtrain, Xval, Ytrain, Yval = train_test_split(Xtrain, Ytrain, test_size=0.25, stratify=Ytrain, random_state=1)

In [25]:
model_rf1 = RandomForestClassifier(oob_score=True).fit(Xtrain, Ytrain)

pred = model_rf1.predict(Xtrain)

print(classification_report(Ytrain, pred, target_names=['True', 'False'],))
print('OOB accuracy=', model_rf1.oob_score_)

              precision    recall  f1-score   support

        True       1.00      1.00      1.00    173342
       False       1.00      1.00      1.00     13692

    accuracy                           1.00    187034
   macro avg       1.00      1.00      1.00    187034
weighted avg       1.00      1.00      1.00    187034

OOB accuracy= 0.9318359228803319


In [26]:
Ypred = model_rf1.predict(Xval)
print('Validation Accuracy:{}'.format(model_rf1.score(Xval,Yval)))
results.loc['RF-default',:] = compute_metrics(Yval,Ypred)
results

Validation Accuracy:0.9331943219183575


Unnamed: 0,Accuracy,F1-score (macro avg)
RF-default,0.933,0.632
