In [1]:
# Data handling
import numpy as np
import pandas as pd

# For visualization
import altair as alt

# Feature Selection
from sklearn.feature_selection import RFE

# Oversampling
from imblearn.over_sampling import SMOTE

# Models
from sklearn import tree 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


# Split
from sklearn.model_selection import train_test_split

# Evaluation
from sklearn.metrics import confusion_matrix

import time
import pickle

# Data

In [4]:
# Training data
train = pd.read_csv("../data/processed_data/mist_train.csv") 
X = train.drop(columns = ['Y', 'Unnamed: 0'])
y = train['Y']

# Resampling due to the class imbalance using SMOTE: https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html?highlight=smote#smote-variants
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X,y)

# Splitting the data
X_train, X_valid, y_train, y_valid = train_test_split(X_sm, y_sm, test_size=0.20, random_state=1234)

In [6]:
print(f"X train shape: {X_train.shape}")
print(f"X valid shape: {X_valid.shape}")
print(f"y train shape: {y_train.shape}")
print(f"y valid shape: {y_valid.shape}")

X train shape: (11024, 122)
X valid shape: (2756, 122)
y train shape: (11024,)
y valid shape: (2756,)


In [18]:
print(f'Y vector without synthetic oversampling: \n {y.value_counts()}')
print(f'Y vector with synthetic oversampling using SMOTE: \n {y_sm.value_counts()}')

Y vector without synthetic oversampling: 
 0    6890
1     109
Name: Y, dtype: int64
Y vector with synthetic oversampling using SMOTE: 
 1    6890
0    6890
Name: Y, dtype: int64


# Feature selection

In this section I will select the features, I am using RFE which is a method that gradually evaluates the features adding the one by one

In [15]:
def fit_and_report(model, X, y, Xv, yv, mode = 'regression'):
    """
    Fits a given model and calculates its score in the training and the validation set. If the model is a regression it uses the MSE. 
    if the model is a classification model it uses the accuracy
    ----------------------------------------------------
    
    Paramaters:
    ----------------------------------------------------
    model: model to fit
    X: Training X matrix
    y: Training response vector
    Xv: Validation X matrix
    yv: Validation response vector
    mode: Type of estimation classification and
    
    Returns:
    ---------------------------------------------------
    errors: list with the training and validation error
    
    Example:
    --------------------------------------------------
    fit_and_report(LogisticRegression(), X, y, Xv, yv, mode = 'classification')
    """
    model.fit(X, y)
    if mode.lower().startswith('regress'):
        errors = [mean_squared_error(y, model.predict(X)), mean_squared_error(yv, model.predict(Xv))]
    if mode.lower().startswith('classif'):
        errors = [1 - model.score(X,y), 1 - model.score(Xv,yv)]        
    return errors

In [94]:
# This section is using the RFE function to select the most important variables
# I chose 70 variables to test but I am interested in the order and the error
n_features = 70

# This dictionary stores the results
results = {'Stage':[],
           'N_features':[],
           'Score':[]}

for i in range(1, n_features + 1):
    
    start = time.time()
    
    print(f"iteration {i}")
    # I am using a logistic regression as a 
    lr = LogisticRegression(solver = "liblinear", class_weight='balanced')
    rfe = RFE(estimator = lr, n_features_to_select = i)
    rfe.fit(X_train, y_train)
    
    selected_features = rfe.support_
    print(f"Selected Features: {X_train.columns[selected_features]}")
    
    
    scores = fit_and_report(lr, 
                            X_train.iloc[:,selected_features], 
                            y_train, 
                            X_valid.iloc[:,selected_features], 
                            y_valid,
                            mode="classification")
    
    end = time.time() - start
    print(f"Time: {end}")
    print("-------------------")
    results['Stage'].append('Train')
    results['N_features'].append(i)
    results['Score'].append(scores[0])
    results['Stage'].append('Validation')
    results['N_features'].append(i)
    results['Score'].append(scores[1])

iteration 1
Selected Features: Index(['Phone OS version_5.1.1'], dtype='object')
Time: 9.273712396621704
-------------------
iteration 2
Selected Features: Index(['Phone OS version_5.1.1', 'User source sh_YouTube'], dtype='object')
Time: 8.72529411315918
-------------------
iteration 3
Selected Features: Index(['Phone OS version_5.1.1', 'Countries_CA_US', 'User source sh_YouTube'], dtype='object')
Time: 8.618023157119751
-------------------
iteration 4
Selected Features: Index(['Phone OS version_5.1.1', 'Countries_CA_US', 'User source sh_YouTube',
       'Phone brand sh_Google'],
      dtype='object')
Time: 8.155201196670532
-------------------
iteration 5
Selected Features: Index(['Phone OS version_5.1.1', 'Countries_CA_US', 'User source sh_YouTube',
       'Phone brand sh_Google', 'Phone brand sh_OPPO'],
      dtype='object')
Time: 8.050000429153442
-------------------
iteration 6
Selected Features: Index(['Phone OS version_5.1.1', 'Countries_CA_US', 'Countries_MY',
       'User sour

In [95]:
variable_selection = pd.DataFrame(results)

In [96]:
# This section shows the results of the variable selection process

alt.Chart(variable_selection).mark_line().encode(
    alt.Y('Score:Q'),
    alt.X('N_features:O', title = 'Number of features'),
    alt.Color('Stage:N')
).properties(
    width = 600,
    height = 400,
    title = 'Error by number of features using RFE'
)

I am not really sure here why the validation error keeps decreasing. I am choosing 45 variables 

In [97]:
variable_selection.to_csv("../results/data/variable_selection.csv")

## Modeling

Given it is a classification model, I am going to fit a:
- Decision Tree
- KNN classifier
- Logistic regression
- Support vector classifier
- Random forest
- XGboost
- lgbm

In [125]:
# Chosen variables
variables = ['Phone OS version_5.0.1', 'Phone OS version_5.0.2',
       'Phone OS version_5.1', 'Phone OS version_5.1.1', 'Countries_CA_US',
       'Countries_FR', 'Countries_MY', 'Countries_NO',
       'User source sh_Fyber - Android', 'User source sh_YouTube',
       'Phone brand sh_ALCATEL', 'Phone brand sh_AlcatelOneTouch',
       'Phone brand sh_Alco', 'Phone brand sh_Google', 'Phone brand sh_HTC',
       'Phone brand sh_LENOVO', 'Phone brand sh_OPPO', 'Phone brand sh_TCL',
       'Phone brand sh_Yulong', 'Phone brand sh_google']

In [126]:
# Helper function
def evaluate_model(X_train, y_train, X_valid, y_valid, models):
    """
    Evaluates a group of models

    Parameters:
    X_valid -- (dataframe) validation X
    X_train -- (dataframe) train X
    y_valid -- (series) validation y
    y_train -- (series) train y
    models -- (dictionary) models dictionary
    
    Returns:
    results -- (dictionary) dictionary containing model, train error, validation error
    and elapsed training and validation time
    
    importances -- (dataframe) feature importances
    """
    results = {}
        
    for model_name, model in models.items():
        
        # Timing the model
        print(f"Fitting model: {model_name}")
        t = time.time()

        # Fitting the model as a pipeline

        model.fit(X_train, y_train);
        tr_err, valid_err = model.score(X_train, y_train), model.score(X_valid, y_valid)
        
        elapsed_time = time.time() - t
        
        results[model_name] = [model, round(tr_err,3), round(valid_err,3), round(elapsed_time,4)]
    
    return results


In [127]:
# List of models to use
models = {
          'decision tree': DecisionTreeClassifier(),
          'kNN': KNeighborsClassifier(),
          'logistic regression': LogisticRegression(solver ='liblinear'),
          'RBF SVM' :  SVC(gamma = 'scale'), 
          'random forest' : RandomForestClassifier(), 
          'xgboost' : XGBClassifier(),
          'lgbm': LGBMClassifier(),
          'Dummy': DummyClassifier(strategy='stratified')
         }

In [128]:
results = evaluate_model(X_train.loc[:,variables], y_train, X_valid.loc[:,variables], y_valid, models)

Fitting model: decision tree
Fitting model: kNN
Fitting model: logistic regression
Fitting model: RBF SVM
Fitting model: random forest
Fitting model: xgboost
Fitting model: lgbm
Fitting model: Dummy


In [129]:
# Here, I rather check the recall or the precision too evaluate the model.
# Given that I do not know the context of the metric, I am not sure which one to pick because the impact could be different
# the f1 score summarizes both so I am going to use it as evaluation

for model_name, model in results.items():
    print(f"Model Name: {model_name}")
    cm = confusion_matrix(y_valid, model[0].predict(X_valid[variables]))
    cm_train = confusion_matrix(y_train, model[0].predict(X_train[variables]))
    print(f"Confusion matrix: {cm_train}")

    tp = cm[1,1]
    fn = cm[1,0]
    fp = cm[0,1]
    
    recall = tp/(tp+fn)
    precision = tp/(tp+fp)
    f1 = 2 * precision * recall/(precision + recall)
    
    results[model_name].append(recall)
    results[model_name].append(precision)
    results[model_name].append(f1)
    
    print(f"The recall is {recall:.3f}")
    print(f"The precision is {precision:.3f}")
    print(f"The f1 is {f1:.3f}")
    print("--------")

Model Name: decision tree
Confusion matrix: [[1025 4472]
 [   4 5523]]
The recall is 0.999
The precision is 0.547
The f1 is 0.707
--------
Model Name: kNN
Confusion matrix: [[5497    0]
 [4995  532]]
The recall is 0.092
The precision is 1.000
The f1 is 0.168
--------
Model Name: logistic regression
Confusion matrix: [[ 995 4502]
 [  82 5445]]
The recall is 0.988
The precision is 0.542
The f1 is 0.700
--------
Model Name: RBF SVM
Confusion matrix: [[1019 4478]
 [  13 5514]]
The recall is 0.999
The precision is 0.547
The f1 is 0.707
--------
Model Name: random forest
Confusion matrix: [[1025 4472]
 [   4 5523]]
The recall is 0.999
The precision is 0.547
The f1 is 0.707
--------
Model Name: xgboost
Confusion matrix: [[1022 4475]
 [   4 5523]]
The recall is 0.999
The precision is 0.547
The f1 is 0.707
--------
Model Name: lgbm
Confusion matrix: [[ 962 4535]
 [   3 5524]]
The recall is 1.000
The precision is 0.545
The f1 is 0.705
--------
Model Name: Dummy
Confusion matrix: [[2701 2796]
 [2

In [130]:
# The Final model is xgboost given its higher f1 score
fm = results['xgboost'][0]

In [131]:
results_df = pd.DataFrame(results, index=['model', 'Train_score', 'Test_score', 'Train_test_time', 'Recall', 'Precision', 'F1']).T
results_df.to_csv("../results/data/results_df.csv")

In [132]:
# Dumping the model
filename = '../results/final_model.sav'
pickle.dump(fm, open(filename, 'wb'))

# Final model evaluation

Testing the model in the test dataset

In [133]:
test_final = pd.read_csv("../data/processed_data/mist_test.csv") 
X_test_final = test_final.loc[:,variables]
y_test_final = test_final['Y']

In [134]:
y_test_predict = fm.predict(X_test_final)

In [135]:
cm = confusion_matrix(y_test_final, y_test_predict)

tp = cm[1,1]
fn = cm[1,0]
fp = cm[0,1]
tn = cm[0,0]
    
recall = tp/(tp+fn)
precision = tp/(tp+fp)
f1 = 2 * precision * recall/(precision + recall)

score = fm.score(X_test_final,y_test_final)

print(f"The recall is {recall:.3f}")
print(f"The precision is {precision:.3f}")
print(f"The f1 is {f1:.3f}")
print(f"The score {score:.3f}")

The recall is 0.975
The precision is 0.016
The f1 is 0.032
The score 0.201


I think I am still having an imbalance issue here, more oversampling techniques should be evaluated

In [136]:
results_test = {
    'tp':tp,
    'fp':fp,
    'tn':tn,
    'fn':fn,
    'recall':recall,
    'precision':precision,
    'f1': f1,
    'score':score
}

results_test_df = pd.DataFrame(results_test, index=[0])

In [137]:
results_test_df.to_csv("../results/data/results_test.csv")