*Note: The result were obtained from the validation set, this set was independent from the trainning and there is no test leakage. We have found all hyperparameters with cross-validation on the training data, avoiding the use of validation data*

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

pd.set_option('display.precision', 3)

# Extra imports
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import graphviz

from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,GradientBoostingClassifier,StackingClassifier,ExtraTreesClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB

from time import time
from datetime import timedelta

import seaborn as sns
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

np.random.seed(1000)

In [16]:
def confusion(true, pred):
    """
    Function for pretty printing confusion matrices
    """
    pred = pd.Series(pred)
    true = pd.Series(true)
    
    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    cm = cm[cm.index]
    return cm

In [17]:
datasetTrain = pd.read_csv('../Dades/X_train_modified.csv')
datasetTest  = pd.read_csv('../Dades/X_test_modified.csv')

datasetTrain['Installs'] = datasetTrain['Installs'].astype('object')
datasetTest['Installs'] = datasetTest['Installs'].astype('object')

columsToDrop = ['Maximum Installs', 'Price', 'Size', 'Download', 'Last Updated', 'ModInstalls', 'ModMaximumInstalls', 'Rating', 'ModExit']
categoricalColumns = ['Installs', 'Category', 'Free', 'Content Rating', 'Ad Supported', 'In App Purchases', 'Editors Choice']

'''
# Convert to categorical variables
for column in categoricalColumns:
    datasetTrain[column] = datasetTrain[column].astype('category')
    datasetTest[column] = datasetTest[column].astype('category')
'''
# Drop repeated or unwanted data (Installs is the catageorial version of "Maximum installs")
datasetTrain = datasetTrain.drop(columns=columsToDrop)
datasetTest  = datasetTest.drop(columns=columsToDrop)

datasetTrain = datasetTrain.drop(columns=['Exit'])
datasetTest = datasetTest.drop(columns=['Exit'])

datasetTrain.dtypes

Category             object
Free                   bool
Released            float64
Content Rating       object
Ad Supported           bool
In App Purchases       bool
Editors Choice         bool
Installs             object
ModRating           float64
ModPrice            float64
ModSize             float64
ModLast Updated     float64
dtype: object

In [18]:
 datasetTrain['Installs'].unique()

array([10000.0, 10000000.0, 100000.0, 50000.0, 500.0, 1000.0, 5000.0,
       1000000.0, 500000.0, 100.0, 5000000.0, 50000000.0, 100000000.0,
       1000000000.0, 500000000.0], dtype=object)

In [19]:
Xtrain = datasetTrain.loc[:, datasetTrain.columns != 'Installs']
Ytrain = datasetTrain['Installs']

Xtest = datasetTest.loc[:, datasetTest.columns != 'Installs']
Ytest = datasetTest['Installs']


Ytrain = Ytrain.astype('str')
Ytest = Ytest.astype('str')

In [20]:
for column in Xtrain.columns:
        if Xtrain[column].dtype.kind == 'O':
            Xtrain_one_hot = pd.get_dummies(Xtrain[column], prefix=column)
            Xtrain = Xtrain.merge(Xtrain_one_hot,left_index=True,right_index=True)
            Xtrain = Xtrain.drop(columns=[column])
            
for column in Xtest.columns:
        if Xtest[column].dtype.kind == 'O':
            Xtest_one_hot = pd.get_dummies(Xtest[column], prefix=column)
            Xtest = Xtest.merge(Xtest_one_hot,left_index=True,right_index=True)
            Xtest = Xtest.drop(columns=[column])

print(Ytrain.head())


0       10000.0
1       10000.0
2    10000000.0
3      100000.0
4       50000.0
Name: Installs, dtype: object


For the moment, we won't consider that there is a class imbalance

The error is symmetric, we will consider as our metric the F1-score macro average

In [21]:
def compute_metrics(y_true,y_pred):
    accuracy = accuracy_score(y_true,y_pred)
    f1_score_macro = f1_score(y_true,y_pred,average='macro')
    return [accuracy,f1_score_macro]

results = pd.DataFrame(columns=['Accuracy', 'F1-score (macro avg)'])


We split the training data to create a validation set

In [22]:
Xtrain, Xval, Ytrain, Yval = train_test_split(Xtrain, Ytrain, test_size=0.25, stratify=Ytrain, random_state=1)

## DecisionTreeClassifier

In [23]:
model_tree = DecisionTreeClassifier().fit(Xtrain, Ytrain)

In [24]:
Ypredtrain = model_tree.predict(Xtrain)

results.loc['DT-default',:] = compute_metrics(Ytrain, Ypredtrain)

confusion(Ytrain, Ypredtrain)

predicted,100.0,1000.0,10000.0,100000.0,1000000.0,10000000.0,100000000.0,1000000000.0,500.0,5000.0,50000.0,500000.0,5000000.0,50000000.0,500000000.0
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100.0,3956,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000.0,1,33349,3,0,0,0,0,0,0,3,0,0,0,0,0
10000.0,0,17,57485,1,0,0,0,0,0,1,1,0,0,0,0
100000.0,0,1,14,25792,0,0,0,0,0,0,1,0,0,0,0
1000000.0,0,0,0,1,8295,0,0,0,0,0,0,0,0,0,0
10000000.0,0,0,0,0,0,1532,0,0,0,0,0,0,0,0,0
100000000.0,0,0,0,0,0,0,87,0,0,0,0,0,0,0,0
1000000000.0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0
500.0,3,2,0,0,0,0,0,0,4844,0,0,0,0,0,0
5000.0,0,27,38,2,0,0,0,0,3,25827,0,0,0,0,0


In [25]:
print(classification_report(Ytrain, Ypredtrain, target_names=['1000000.0', '100000.0', '10000.0', '50000.0', '5000.0', '500000.0', '1000.0', '500.0', '100.0', '5000000.0', '50000000.0', '10000000.0', '100000000.0', '1000000000.0', '500000000.0'],))

results

              precision    recall  f1-score   support

   1000000.0       1.00      1.00      1.00      3956
    100000.0       1.00      1.00      1.00     33356
     10000.0       1.00      1.00      1.00     57505
     50000.0       1.00      1.00      1.00     25808
      5000.0       1.00      1.00      1.00      8296
    500000.0       1.00      1.00      1.00      1532
      1000.0       1.00      1.00      1.00        87
       500.0       1.00      1.00      1.00         3
       100.0       1.00      1.00      1.00      4849
   5000000.0       1.00      1.00      1.00     25897
  50000000.0       1.00      1.00      1.00     17347
  10000000.0       1.00      1.00      1.00      6568
 100000000.0       1.00      1.00      1.00      1646
1000000000.0       1.00      1.00      1.00       179
 500000000.0       1.00      1.00      1.00         5

    accuracy                           1.00    187034
   macro avg       1.00      1.00      1.00    187034
weighted avg       1.00   

Unnamed: 0,Accuracy,F1-score (macro avg)
DT-default,0.999,1.0


In [28]:
Ypred = model_tree.predict(Xval)

results.loc['DT-default',:] = compute_metrics(Yval, Ypred)

confusion(Yval, Ypred)

predicted,100.0,1000.0,10000.0,100000.0,1000000.0,10000000.0,100000000.0,1000000000.0,500.0,5000.0,50000.0,500000.0,5000000.0,50000000.0,500000000.0
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100.0,354,436,146,22,5,0,0,0,203,121,27,3,1,0,0
1000.0,404,3121,3094,938,174,23,1,0,544,1840,756,187,36,1,0
10000.0,145,2958,7013,2566,615,85,6,1,282,2890,1997,508,92,9,2
100000.0,29,879,2520,1838,591,97,4,0,68,944,1058,461,97,16,1
1000000.0,4,202,596,645,428,97,7,0,12,189,255,228,96,6,0
10000000.0,0,21,63,109,96,56,7,0,3,34,37,42,36,7,0
100000000.0,0,1,2,4,6,6,0,0,0,2,3,1,3,1,0
1000000000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
500.0,218,567,271,80,22,2,0,0,177,206,58,13,1,1,0
5000.0,121,1743,2954,904,196,25,0,1,200,1526,754,178,28,2,0


In [27]:
print(classification_report(Yval, Ypred, target_names=['1000000.0', '100000.0', '10000.0', '50000.0', '5000.0', '500000.0', '1000.0', '500.0', '100.0', '5000000.0', '50000000.0', '10000000.0', '100000000.0', '1000000000.0', '500000000.0'],))

results

              precision    recall  f1-score   support

   1000000.0       0.27      0.27      0.27      1318
    100000.0       0.29      0.28      0.28     11119
     10000.0       0.36      0.37      0.36     19169
     50000.0       0.21      0.21      0.21      8603
      5000.0       0.16      0.15      0.16      2765
    500000.0       0.11      0.11      0.11       511
      1000.0       0.00      0.00      0.00        29
       500.0       0.00      0.00      0.00         1
       100.0       0.11      0.11      0.11      1616
   5000000.0       0.18      0.18      0.18      8632
  50000000.0       0.12      0.12      0.12      5783
  10000000.0       0.09      0.08      0.09      2189
 100000000.0       0.07      0.06      0.06       548
1000000000.0       0.04      0.03      0.03        60
 500000000.0       0.00      0.00      0.00         2

    accuracy                           0.25     62345
   macro avg       0.13      0.13      0.13     62345
weighted avg       0.25   

Unnamed: 0,Accuracy,F1-score (macro avg)
DT-default,0.248,0.132


It's clearly overfitting with the initial hyper parameters, let's change them, notice that our dataset is huge, and thus, we will reduce the number of hyperparamters tested, we will do this one by one, it is for sure not optimal but better than initial parameters

Test for depth

In [14]:
# Too slow
#criterion = ['gini', 'entropy']
criterion = ['gini']

max_dephts = [None, 5, 10, 15, 20]
#min_samples_split = [1, 2, 3, 4, 5]
min_samples_split = [3]
#min_samples_leaf = [1, 2, 3, 4, 5]
min_samples_leaf = [3]
#max_features = ['sqrt', 'log2', None]
max_features = ['sqrt']

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

init_time = time()
model_tree = DecisionTreeClassifier()


scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}

trc = GridSearchCV(estimator=model_tree,
                   scoring=scoring_dict,
                   param_grid={
                       'criterion': criterion,
                       'max_depth': max_dephts,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'max_features': max_features
                   },
                   cv=5,
                   return_train_score=False,
                   refit='f1_mac')

model_5CV = trc.fit(Xtrain, Ytrain)
print(timedelta(seconds=(time() - init_time)))


0:00:16.699132


In [15]:
scoring_cols = [
    'param_criterion', 'param_max_depth', 'param_max_features',
    'param_min_samples_leaf', 'param_min_samples_split', 'mean_test_f1_mac', 'mean_test_acc'
]

pd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac', ascending=False)[scoring_cols].head()


Unnamed: 0,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,mean_test_f1_mac,mean_test_acc
0,gini,,sqrt,3,3,0.124,0.299
4,gini,20.0,sqrt,3,3,0.108,0.329
3,gini,15.0,sqrt,3,3,0.101,0.337
2,gini,10.0,sqrt,3,3,0.08,0.333
1,gini,5.0,sqrt,3,3,0.044,0.312


We choose none

In [16]:
# Too slow
#criterion = ['gini', 'entropy']
criterion = ['gini']

max_dephts = [None]
min_samples_split = [1, 2, 3, 4, 5]
#min_samples_split = [3]
#min_samples_leaf = [1, 2, 3, 4, 5]
min_samples_leaf = [3]
#max_features = ['sqrt', 'log2', None]
max_features = ['sqrt']

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

init_time = time()
model_tree = DecisionTreeClassifier()


scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}

trc = GridSearchCV(estimator=model_tree,
                   scoring=scoring_dict,
                   param_grid={
                       'criterion': criterion,
                       'max_depth': max_dephts,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'max_features': max_features
                   },
                   cv=5,
                   return_train_score=False,
                   refit='f1_mac')

model_5CV = trc.fit(Xtrain, Ytrain)
print(timedelta(seconds=(time() - init_time)))

0:00:17.061521


In [17]:
scoring_cols = [
    'param_criterion', 'param_max_depth', 'param_max_features',
    'param_min_samples_leaf', 'param_min_samples_split', 'mean_test_f1_mac', 'mean_test_acc'
]

pd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac', ascending=False)[scoring_cols].head()


Unnamed: 0,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,mean_test_f1_mac,mean_test_acc
4,gini,,sqrt,3,5,0.128,0.296
2,gini,,sqrt,3,3,0.127,0.296
3,gini,,sqrt,3,4,0.127,0.3
1,gini,,sqrt,3,2,0.125,0.296
0,gini,,sqrt,3,1,,


We choose 4

In [18]:
# Too slow
#criterion = ['gini', 'entropy']
criterion = ['gini']

max_dephts = [None]
min_samples_split = [4]
min_samples_leaf = [1, 2, 3, 4, 5]
#min_samples_leaf = [3]
#max_features = ['sqrt', 'log2', None]
max_features = ['sqrt']

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

init_time = time()
model_tree = DecisionTreeClassifier()


scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}

trc = GridSearchCV(estimator=model_tree,
                   scoring=scoring_dict,
                   param_grid={
                       'criterion': criterion,
                       'max_depth': max_dephts,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'max_features': max_features
                   },
                   cv=5,
                   return_train_score=False,
                   refit='f1_mac')

model_5CV = trc.fit(Xtrain, Ytrain)
print(timedelta(seconds=(time() - init_time)))

0:00:20.729320


In [19]:
scoring_cols = [
    'param_criterion', 'param_max_depth', 'param_max_features',
    'param_min_samples_leaf', 'param_min_samples_split', 'mean_test_f1_mac', 'mean_test_acc'
]

pd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac', ascending=False)[scoring_cols].head()


Unnamed: 0,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,mean_test_f1_mac,mean_test_acc
0,gini,,sqrt,1,4,0.128,0.256
2,gini,,sqrt,3,4,0.127,0.3
4,gini,,sqrt,5,4,0.126,0.309
1,gini,,sqrt,2,4,0.124,0.285
3,gini,,sqrt,4,4,0.123,0.306


We choose 1

In [20]:
# Too slow
#criterion = ['gini', 'entropy']
criterion = ['gini']

max_dephts = [None]
min_samples_split = [4]
min_samples_leaf = [1]
max_features = ['sqrt', 'log2', None]
#max_features = ['sqrt']

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

init_time = time()
model_tree = DecisionTreeClassifier()


scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}

trc = GridSearchCV(estimator=model_tree,
                   scoring=scoring_dict,
                   param_grid={
                       'criterion': criterion,
                       'max_depth': max_dephts,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'max_features': max_features
                   },
                   cv=5,
                   return_train_score=False,
                   refit='f1_mac')

model_5CV = trc.fit(Xtrain, Ytrain)
print(timedelta(seconds=(time() - init_time)))

0:00:21.963624


In [21]:
scoring_cols = [
    'param_criterion', 'param_max_depth', 'param_max_features',
    'param_min_samples_leaf', 'param_min_samples_split', 'mean_test_f1_mac', 'mean_test_acc'
]

pd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac', ascending=False)[scoring_cols].head()


Unnamed: 0,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,mean_test_f1_mac,mean_test_acc
2,gini,,,1,4,0.135,0.251
0,gini,,sqrt,1,4,0.13,0.256
1,gini,,log2,1,4,0.127,0.257


we choose none

In [22]:
# Too slow
criterion = ['gini', 'entropy']

max_dephts = [None]
min_samples_split = [4]
min_samples_leaf = [1]
max_features = [None]

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

init_time = time()
model_tree = DecisionTreeClassifier()


scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}

trc = GridSearchCV(estimator=model_tree,
                   scoring=scoring_dict,
                   param_grid={
                       'criterion': criterion,
                       'max_depth': max_dephts,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'max_features': max_features
                   },
                   cv=5,
                   return_train_score=False,
                   refit='f1_mac')

model_5CV = trc.fit(Xtrain, Ytrain)
print(timedelta(seconds=(time() - init_time)))

0:00:29.950645


In [23]:
scoring_cols = [
    'param_criterion', 'param_max_depth', 'param_max_features',
    'param_min_samples_leaf', 'param_min_samples_split', 'mean_test_f1_mac', 'mean_test_acc'
]

pd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac', ascending=False)[scoring_cols].head()


Unnamed: 0,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,mean_test_f1_mac,mean_test_acc
0,gini,,,1,4,0.133,0.251
1,entropy,,,1,4,0.132,0.247


In [24]:
Ypred = model_5CV.predict(Xval)

results.loc['DT-best',:] = compute_metrics(Yval, Ypred)

confusion(Yval,Ypred)
results.sort_values(by='F1-score (macro avg)', ascending=False)

Unnamed: 0,Accuracy,F1-score (macro avg)
DT-default,0.246,0.129
DT-best,0.254,0.127


poor results

## Random forest

Since we still have an overfitting problem, let's take a look at random forest. Random Forest is an ensemble of Decision Trees; the idea is that by averaging high-variance but decorrelated individual decision trees we will avoid their tendency to overfitting.

In [25]:
model_rf1 = RandomForestClassifier(oob_score=True).fit(Xtrain, Ytrain)

pred = model_rf1.predict(Xtrain)

print(classification_report(Ytrain, pred, target_names=['1000000.0', '100000.0', '10000.0', '50000.0', '5000.0', '500000.0', '1000.0', '500.0', '100.0', '5000000.0', '50000000.0', '10000000.0', '100000000.0', '1000000000.0', '500000000.0'],))
print('OOB accuracy=', model_rf1.oob_score_)


              precision    recall  f1-score   support

   1000000.0       1.00      1.00      1.00      3956
    100000.0       1.00      1.00      1.00     33356
     10000.0       1.00      1.00      1.00     57505
     50000.0       1.00      1.00      1.00     25808
      5000.0       1.00      1.00      1.00      8296
    500000.0       1.00      1.00      1.00      1532
      1000.0       1.00      1.00      1.00        87
       500.0       1.00      1.00      1.00         3
       100.0       1.00      1.00      1.00      4849
   5000000.0       1.00      1.00      1.00     25897
  50000000.0       1.00      1.00      1.00     17347
  10000000.0       1.00      1.00      1.00      6568
 100000000.0       1.00      1.00      1.00      1646
1000000000.0       1.00      1.00      1.00       179
 500000000.0       1.00      1.00      1.00         5

    accuracy                           1.00    187034
   macro avg       1.00      1.00      1.00    187034
weighted avg       1.00   

In [26]:
Ypred = model_rf1.predict(Xval)
print('Validation Accuracy:{}'.format(model_rf1.score(Xval,Yval)))
results.loc['RF-default',:] = compute_metrics(Yval,Ypred)
results

Validation Accuracy:0.32894378057582807


Unnamed: 0,Accuracy,F1-score (macro avg)
DT-default,0.246,0.129
DT-best,0.254,0.127
RF-default,0.329,0.154


We are still overfitting our model by a lot

In [14]:
model_rf2 = RandomForestClassifier(n_estimators=100, 
                                   oob_score=True, 
                                   class_weight='balanced').fit(Xtrain, Ytrain)

pred = model_rf2.predict(Xtrain)

confusion(Ytrain, pred)
print(classification_report(Ytrain, pred, target_names=['1000000.0', '100000.0', '10000.0', '50000.0', '5000.0', '500000.0', '1000.0', '500.0', '100.0', '5000000.0', '50000000.0', '10000000.0', '100000000.0', '1000000000.0', '500000000.0'],))
print('OOB accuracy=', model_rf2.oob_score_)

ValueError: Number of classes, 2, does not match size of target_names, 15. Try specifying the labels parameter

In [None]:
Ypred = model_rf2.predict(Xval)

results.loc['RF-balance',:] = compute_metrics(Yval,Ypred)
results.sort_values(by='F1-score (macro avg)', ascending=False)
results

After a class balance, the original random forest performs better, this makes sense since classes are already balanced enough.

In [29]:
Ytest.value_counts()

Installs
10000.0         37768
1000.0          21966
5000.0          17100
100000.0        16770
50000.0         11378
1000000.0        5496
500000.0         4357
500.0            3249
100.0            2534
5000000.0        1063
10000000.0        960
50000000.0        113
100000000.0        59
500000000.0         2
1000000000.0        1
Name: count, dtype: int64

Definetly, we should try to change hyper parameters, trhoughout all this models, we have faced the same problem overfitting, this can be solve with a hyperparameter search.

In this case, this is impossible due to how much resources it takes, I have executed this code 4 times and my personal computer shut down tevary time I try to compute hyper parameters. If we had infinite resources, we could try to do this computation.

In [30]:
'''XtrainCut = Xtrain[:1000]
YtrainCut = Ytrain[:1000]
'''

'XtrainCut = Xtrain[:1000]\nYtrainCut = Ytrain[:1000]\n'

In [31]:
'''
init_time = time()

rf_model = RandomForestClassifier()

ntrees = [200, None]
max_depth = [100,None]
min_samples_split = [4,6]
min_samples_leaf = [4,6]
balance = [None, 'balanced', 'balanced_subsample']

scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}


trc = GridSearchCV(estimator=rf_model,
                   scoring=scoring_dict,
                   param_grid={
                       'n_estimators': ntrees,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'min_samples_leaf':min_samples_leaf, 
                       'class_weight':balance
                   },
                   cv=5,
                   return_train_score=False,
                   refit=False,
                   n_jobs=-1)

model_5CV = trc.fit(XtrainCut, YtrainCut)
print(timedelta(seconds=(time() - init_time)))
'''

"\ninit_time = time()\n\nrf_model = RandomForestClassifier()\n\nntrees = [200, None]\nmax_depth = [100,None]\nmin_samples_split = [4,6]\nmin_samples_leaf = [4,6]\nbalance = [None, 'balanced', 'balanced_subsample']\n\nscoring_dict = {\n    'f1_mac': 'f1_macro',\n    'acc': 'accuracy'\n}\n\n\ntrc = GridSearchCV(estimator=rf_model,\n                   scoring=scoring_dict,\n                   param_grid={\n                       'n_estimators': ntrees,\n                       'max_depth':max_depth,\n                       'min_samples_split':min_samples_split,\n                       'min_samples_leaf':min_samples_leaf, \n                       'class_weight':balance\n                   },\n                   cv=5,\n                   return_train_score=False,\n                   refit=False,\n                   n_jobs=-1)\n\nmodel_5CV = trc.fit(XtrainCut, YtrainCut)\nprint(timedelta(seconds=(time() - init_time)))\n"

Extra classifiers and boosting + random forest are calculated in a python script since Jupiter notebook caused many problems with such a large dataset


## ModEXIT prediction

We will also try every technique for the (balanced) exit variable

In [5]:
datasetTrain = pd.read_csv('../Dades/X_train_modified.csv')
datasetTest  = pd.read_csv('../Dades/X_test_modified.csv')

datasetTrain['Installs'] = datasetTrain['Installs'].astype('object')
datasetTest['Installs'] = datasetTest['Installs'].astype('object')

columsToDrop = ['Maximum Installs', 'Price', 'Size', 'Download', 'Last Updated', 'ModInstalls', 'ModMaximumInstalls', 'Rating', 'Exit']
categoricalColumns = ['Installs', 'Category', 'Free', 'Content Rating', 'Ad Supported', 'In App Purchases', 'Editors Choice']

'''
# Convert to categorical variables
for column in categoricalColumns:
    datasetTrain[column] = datasetTrain[column].astype('category')
    datasetTest[column] = datasetTest[column].astype('category')
'''
# Drop repeated or unwanted data (Installs is the catageorial version of "Maximum installs")
datasetTrain = datasetTrain.drop(columns=columsToDrop)
datasetTest  = datasetTest.drop(columns=columsToDrop)

datasetTrain = datasetTrain.drop(columns=['Installs'])
datasetTest = datasetTest.drop(columns=['Installs'])

datasetTrain.columns

Index(['Category', 'Free', 'Released', 'Content Rating', 'Ad Supported',
       'In App Purchases', 'Editors Choice', 'ModExit', 'ModRating',
       'ModPrice', 'ModSize', 'ModLast Updated'],
      dtype='object')

In [6]:
datasetTrain['ModExit'].unique()

array([False,  True])

In [7]:
Xtrain = datasetTrain.loc[:, datasetTrain.columns != 'ModExit']
Ytrain = datasetTrain['ModExit']

Xtest = datasetTest.loc[:, datasetTest.columns != 'ModExit']
Ytest = datasetTest['ModExit']

Xtrain.columns

Index(['Category', 'Free', 'Released', 'Content Rating', 'Ad Supported',
       'In App Purchases', 'Editors Choice', 'ModRating', 'ModPrice',
       'ModSize', 'ModLast Updated'],
      dtype='object')

In [8]:
for column in Xtrain.columns:
        if Xtrain[column].dtype.kind == 'O':
            Xtrain_one_hot = pd.get_dummies(Xtrain[column], prefix=column)
            Xtrain = Xtrain.merge(Xtrain_one_hot,left_index=True,right_index=True)
            Xtrain = Xtrain.drop(columns=[column])
            
for column in Xtest.columns:
        if Xtest[column].dtype.kind == 'O':
            Xtest_one_hot = pd.get_dummies(Xtest[column], prefix=column)
            Xtest = Xtest.merge(Xtest_one_hot,left_index=True,right_index=True)
            Xtest = Xtest.drop(columns=[column])
            
print(Ytrain.value_counts())

ModExit
False    145515
True     103864
Name: count, dtype: int64


In [9]:
Xtrain, Xval, Ytrain, Yval = train_test_split(Xtrain, Ytrain, test_size=0.25, stratify=Ytrain, random_state=1)

In [37]:
model_rf1 = RandomForestClassifier(oob_score=True).fit(Xtrain, Ytrain)

pred = model_rf1.predict(Xtrain)

print(classification_report(Ytrain, pred, target_names=['True', 'False'],))
print('OOB accuracy=', model_rf1.oob_score_)

              precision    recall  f1-score   support

        True       1.00      1.00      1.00    109136
       False       1.00      1.00      1.00     77898

    accuracy                           1.00    187034
   macro avg       1.00      1.00      1.00    187034
weighted avg       1.00      1.00      1.00    187034

OOB accuracy= 0.7000331490531133


In [38]:
Ypred = model_rf1.predict(Xval)
print('Validation Accuracy:{}'.format(model_rf1.score(Xval,Yval)))
results.loc['RF-default',:] = compute_metrics(Yval,Ypred)
results

Validation Accuracy:0.7036330098644639


Unnamed: 0,Accuracy,F1-score (macro avg)
DT-default,0.246,0.129
DT-best,0.254,0.127
RF-default,0.704,0.689
RF-balance,0.331,0.148


In [12]:
scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}

init_time = time()

rf_model = RandomForestClassifier()

ntrees = [200]
max_depth = [100,None]
#max_depth = [100]
#min_samples_split = [4,6]
min_samples_split = [4]
#min_samples_leaf = [4,6]
min_samples_leaf = [6]
#balance = [None, 'balanced', 'balanced_subsample']
balance = ['balanced_subsample']

trc = GridSearchCV(estimator=rf_model,
                   scoring=scoring_dict,
                   param_grid={
                       'n_estimators': ntrees,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'min_samples_leaf':min_samples_leaf, 
                       'class_weight':balance
                   },
                   cv=5,
                   return_train_score=False,
                   refit=False,
                   n_jobs=-1)

model_5CV = trc.fit(Xtrain, Ytrain)
print(timedelta(seconds=(time() - init_time)))

scoring_cols = [
     'n_estimators', 'max_depth', 'min_samples_split','min_samples_leaf', 
     'class_weight', 'mean_test_f1_mac', 'mean_test_acc'
]

pd.DataFrame(model_5CV.cv_results_)

0:02:22.984923


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,...,std_test_f1_mac,rank_test_f1_mac,split0_test_acc,split1_test_acc,split2_test_acc,split3_test_acc,split4_test_acc,mean_test_acc,std_test_acc,rank_test_acc
0,135.58,3.305,3.388,0.497,balanced_subsample,100.0,6,4,200,"{'class_weight': 'balanced_subsample', 'max_de...",...,0.001,1,0.702,0.703,0.701,0.701,0.701,0.701,0.001,1
1,131.854,1.518,3.759,0.382,balanced_subsample,,6,4,200,"{'class_weight': 'balanced_subsample', 'max_de...",...,0.001,2,0.701,0.703,0.7,0.701,0.7,0.701,0.001,2


In [15]:
scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}

init_time = time()

rf_model = RandomForestClassifier()

ntrees = [200]
max_depth = [100]
min_samples_split = [6]
min_samples_leaf = [4]
balance = [None, 'balanced', 'balanced_subsample']


trc = GridSearchCV(estimator=rf_model,
                   scoring=scoring_dict,
                   param_grid={
                       'n_estimators': ntrees,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'min_samples_leaf':min_samples_leaf, 
                       'class_weight':balance
                   },
                   cv=5,
                   return_train_score=False,
                   refit=False,
                   n_jobs=-1)

model_5CV = trc.fit(Xtrain, Ytrain)
print(timedelta(seconds=(time() - init_time)))

scoring_cols = [
     'n_estimators', 'max_depth', 'min_samples_split','min_samples_leaf', 
     'class_weight', 'mean_test_f1_mac', 'mean_test_acc'
]

pd.DataFrame(model_5CV.cv_results_)

0:03:29.727579


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,...,std_test_f1_mac,rank_test_f1_mac,split0_test_acc,split1_test_acc,split2_test_acc,split3_test_acc,split4_test_acc,mean_test_acc,std_test_acc,rank_test_acc
0,132.212,3.003,4.686,0.392,,100,4,6,200,"{'class_weight': None, 'max_depth': 100, 'min_...",...,0.002,3,0.711,0.714,0.71,0.709,0.709,0.711,0.002,1
1,134.596,1.863,4.11,0.592,balanced,100,4,6,200,"{'class_weight': 'balanced', 'max_depth': 100,...",...,0.001,1,0.703,0.706,0.703,0.704,0.703,0.704,0.001,2
2,99.938,38.385,2.059,0.56,balanced_subsample,100,4,6,200,"{'class_weight': 'balanced_subsample', 'max_de...",...,0.001,2,0.704,0.706,0.704,0.703,0.702,0.704,0.001,3


In [17]:
scoring_dict = {
    'f1_mac': 'f1_macro',
    'acc': 'accuracy'
}

init_time = time()

rf_model = RandomForestClassifier()

ntrees = [1000]
max_depth = [100]
min_samples_split = [6]
min_samples_leaf = [4]
balance = [None]

trc = GridSearchCV(estimator=rf_model,
                   scoring=scoring_dict,
                   param_grid={
                       'n_estimators': ntrees,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'min_samples_leaf':min_samples_leaf, 
                       'class_weight':balance
                   },
                   cv=5,
                   return_train_score=False,
                   refit=False,
                   n_jobs=-1)

model_5CV = trc.fit(Xtrain, Ytrain)
print(timedelta(seconds=(time() - init_time)))

scoring_cols = [
     'n_estimators', 'max_depth', 'min_samples_split','min_samples_leaf', 
     'class_weight', 'mean_test_f1_mac', 'mean_test_acc'
]

pd.DataFrame(model_5CV.cv_results_)

KeyboardInterrupt: 

In [10]:
model_rfI = RandomForestClassifier(oob_score=True, n_estimators = 1000, max_depth = 100,min_samples_split = 6, min_samples_leaf = 4, class_weight = None).fit(Xtrain, Ytrain)


Ypred = model_rfI.predict(Xval)

results.loc['RF-improved',:] = compute_metrics(Yval,Ypred)
results.sort_values(by='F1-score (macro avg)', ascending=False)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
RF-improved,0.712,0.695


In [11]:
model_tree = DecisionTreeClassifier().fit(Xtrain, Ytrain)

Ypred = model_tree.predict(Xval)

results.loc['DT-default',:] = compute_metrics(Yval, Ypred)

confusion(Yval, Ypred)

results

Unnamed: 0,Accuracy,F1-score (macro avg)
RF-improved,0.712,0.695
DT-default,0.635,0.625


Merging models

In [12]:
voting_hard = VotingClassifier([('dt', model_tree), ('rf', model_rfI)])
voting_hard.fit(Xtrain, Ytrain)

Ypred = voting_hard.predict(Xval)

results.loc['voting_hard', :] = compute_metrics(Yval, Ypred)

results

Unnamed: 0,Accuracy,F1-score (macro avg)
RF-improved,0.712,0.695
DT-default,0.635,0.625
voting_hard,0.686,0.643


# Exit

In [6]:
datasetTrain = pd.read_csv('../Dades/X_train_modified.csv')
datasetTest  = pd.read_csv('../Dades/X_test_modified.csv')

datasetTrain['Installs'] = datasetTrain['Installs'].astype('object')
datasetTest['Installs'] = datasetTest['Installs'].astype('object')

columsToDrop = ['Maximum Installs', 'Price', 'Size', 'Download', 'Last Updated', 'ModInstalls', 'ModMaximumInstalls', 'Rating', 'ModExit']
categoricalColumns = ['Installs', 'Category', 'Free', 'Content Rating', 'Ad Supported', 'In App Purchases', 'Editors Choice']

'''
# Convert to categorical variables
for column in categoricalColumns:
    datasetTrain[column] = datasetTrain[column].astype('category')
    datasetTest[column] = datasetTest[column].astype('category')
'''
# Drop repeated or unwanted data (Installs is the catageorial version of "Maximum installs")
datasetTrain = datasetTrain.drop(columns=columsToDrop)
datasetTest  = datasetTest.drop(columns=columsToDrop)

datasetTrain = datasetTrain.drop(columns=['Installs'])
datasetTest = datasetTest.drop(columns=['Installs'])

Xtrain = datasetTrain.loc[:, datasetTrain.columns != 'Exit']
Ytrain = datasetTrain['Exit']

Xtest = datasetTest.loc[:, datasetTest.columns != 'Exit']
Ytest = datasetTest['Exit']

for column in Xtrain.columns:
        if Xtrain[column].dtype.kind == 'O':
            Xtrain_one_hot = pd.get_dummies(Xtrain[column], prefix=column)
            Xtrain = Xtrain.merge(Xtrain_one_hot,left_index=True,right_index=True)
            Xtrain = Xtrain.drop(columns=[column])
            
for column in Xtest.columns:
        if Xtest[column].dtype.kind == 'O':
            Xtest_one_hot = pd.get_dummies(Xtest[column], prefix=column)
            Xtest = Xtest.merge(Xtest_one_hot,left_index=True,right_index=True)
            Xtest = Xtest.drop(columns=[column])
            
Xtrain, Xval, Ytrain, Yval = train_test_split(Xtrain, Ytrain, test_size=0.25, stratify=Ytrain, random_state=1)

In [7]:
model_rf1 = RandomForestClassifier(oob_score=True).fit(Xtrain, Ytrain)

pred = model_rf1.predict(Xtrain)

print(classification_report(Ytrain, pred, target_names=['True', 'False'],))
print('OOB accuracy=', model_rf1.oob_score_)

              precision    recall  f1-score   support

        True       1.00      1.00      1.00    173342
       False       1.00      1.00      1.00     13692

    accuracy                           1.00    187034
   macro avg       1.00      1.00      1.00    187034
weighted avg       1.00      1.00      1.00    187034

OOB accuracy= 0.9322850390838029


In [8]:
Ypred = model_rf1.predict(Xval)
print('Validation Accuracy:{}'.format(model_rf1.score(Xval,Yval)))
results.loc['RF-default',:] = compute_metrics(Yval,Ypred)
results

Validation Accuracy:0.9325527307723154


Unnamed: 0,Accuracy,F1-score (macro avg)
RF-default,0.933,0.628


In [9]:
model_tree = DecisionTreeClassifier().fit(Xtrain, Ytrain)

Ypred = model_tree.predict(Xval)

results.loc['DT-default',:] = compute_metrics(Yval, Ypred)

confusion(Yval, Ypred)

results

Unnamed: 0,Accuracy,F1-score (macro avg)
RF-default,0.933,0.628
DT-default,0.893,0.621
