## SMS to CONVERSION MODEL

In [10]:
! aws s3 cp s3://aws-athena-query-results-101063123548-eu-west-1/Unsaved/2020/05/07/8a853c10-ffcf-4f58-b83c-2e2513a5e51e.csv s3://datateam-ml/CVR_FSI/sms.csv

copy: s3://aws-athena-query-results-101063123548-eu-west-1/Unsaved/2020/05/07/8a853c10-ffcf-4f58-b83c-2e2513a5e51e.csv to s3://datateam-ml/CVR_FSI/sms.csv


In [2]:
import pandas as pd
df = pd.read_csv("s3://datateam-ml/CVR_FSI/conversions.csv")

In [146]:
df2 = pd.read_csv("s3://datateam-ml/CVR_FSI/sms.csv")

In [218]:
features = yaml.safe_load(open("Attributes_yaml/features.yml"))
input_col = features['input_col']+features['target']
data = data[input_col]
data = data.reindex(columns = input_col)

## BUILDING OF THE PRE-PROCESSING PIPELINE AND MODELS

In [154]:
#important libraries
import pandas as pd
import numpy as np
import yaml
from sklearn.metrics import silhouette_score
import scipy as sci
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split

In [135]:
!pwd

/home/ec2-user/SageMaker/FSI_SMS_TO_CONVERSION


In [219]:
class processing():
    
    features = yaml.safe_load(open("Attributes_yaml/features.yml"))
    input_col = features['input_col']
    num = features['num_features']
    cat = features['cat_features']
    target = features['target']
    low_cat = features['low_cat']
    
    def __init__(self, data):
        self.data = data
    def map_values(self,column = 'event_type'):
        self.data['event_type'] = self.data['event_type'].apply(lambda x: 1 if x == 'install' else 0)
#     def map_values(self):
#         self.dictionary = {'sms': 0, 'install':1}
#         self.data[processing.target] = self.data[processing.target].index.map(self.dictionary.get)
#         #self.data[processing.target] = self.data[processing.target].index.apply(lambda x: 1 if x == 'install' else 0)
    def fill_na(self):
        for item in self.data[processing.num]:
            self.data[item] = self.data[item].fillna(self.data[item].mean())
        for item in self.data[processing.cat]:
            self.data[item] = self.data[item].fillna(self.data[item].value_counts().index[0])
    def hash_list(self):
        self.hash_features = []
        for item in processing.cat:
            if item not in processing.low_cat:
                self.hash_features.append(item)
    def pipeline(self, hash_size):
        self.num_pipeline = Pipeline(steps= [('imputer', SimpleImputer(strategy='mean')), ('std_scaler', MinMaxScaler())])
        self.cat_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
                                       ('one_hot_encoding', OneHotEncoder(handle_unknown = "ignore", sparse = False))])
        self.hash_pipeline = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
                                  ('hasher', FeatureHasher(n_features=hash_size, input_type='string'))])
        
    
    def build_pipe(self, hash_size = 500, test_size = 0.2): 
        self.fill_na()
        self.map_values()
        self.data.drop(['msisdn'],axis=1,inplace=True)
        self.hash_list()
        self.pipeline(hash_size)
        
        self.full_pipeline = ColumnTransformer(
        transformers=[
            ('num', self.num_pipeline, processing.num),
            ('cat', self.cat_pipeline, processing.low_cat),
            ('hash', self.hash_pipeline, self.hash_features)
        ])
        
        self.X = self.data.drop(processing.target, axis=1)
        self.y = self.data[processing.target].copy()
        
        self.full_pipeline.fit(self.X)
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=test_size, stratify = self.y)
        
        self.X_train = self.full_pipeline.transform(self.X_train)
        self.X_test = self.full_pipeline.transform(self.X_test)
        
        print(self.X_train.shape)
        return self.X, self.y, self.X_train, self.X_test, self.y_train, self.y_test, self.full_pipeline

In [220]:
processed = processing(data)

In [221]:
X, y, X_train, X_test, y_train, y_test, full_pipeline = processed.build_pipe(hash_size = 500, test_size = 0.2)

(212245, 517)


## MODELLING (LINEAR + DEEP MODELS)

In [232]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [240]:
classifiers = [
               KNeighborsClassifier(3),
               DecisionTreeClassifier(),
               RandomForestClassifier(),
               AdaBoostClassifier(),
               GradientBoostingClassifier(),
               LogisticRegression(C=1,random_state=1234,solver = 'lbfgs',class_weight={0:0.1, 1:0.90})
]

In [241]:
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print(classifier)
    print("model score: %.3f" % classifier.score(X_test, y_test))
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('classification report')
    print(metrics.classification_report(y_test, y_pred))
    print('Accuracy : %f' % (metrics.accuracy_score(y_test, y_pred)))
    print('f1 score : %f' % (metrics.fbeta_score(y_test, y_pred, beta=0.5)))

  from ipykernel import kernelapp as app


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')
model score: 0.939
confusion matrix
[[49134   897]
 [ 2363   668]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.98      0.97     50031
           1       0.43      0.22      0.29      3031

   micro avg       0.94      0.94      0.94     53062
   macro avg       0.69      0.60      0.63     53062
weighted avg       0.92      0.94      0.93     53062

Accuracy : 0.938562
f1 score : 0.359488
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
model score: 0.957
confusio

  from ipykernel import kernelapp as app


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
model score: 0.965
confusion matrix
[[49941    90]
 [ 1772  1259]]
classification report
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     50031
           1       0.93      0.42      0.57      3031

   micro avg       0.96      0.96      0.96     53062
   macro avg       0.95      0.71      0.78     53062
weighted avg       0.96      0.96      0.96     53062

Accuracy : 0.964909
f1 score : 0.747004


  y = column_or_1d(y, warn=True)


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
model score: 0.942
confusion matrix
[[49881   150]
 [ 2940    91]]
classification report
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     50031
           1       0.38      0.03      0.06      3031

   micro avg       0.94      0.94      0.94     53062
   macro avg       0.66      0.51      0.51     53062
weighted avg       0.91      0.94      0.92     53062

Accuracy : 0.941766
f1 score : 0.113892


  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
model score: 0.943
confusion matrix
[[49993    38]
 [ 2969    62]]
classification report
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     50031
           1       0.62      0.02      0.04      3031

   micro avg       0.94      0.94      0.94     53062
   macro avg       0.78      0.51      0.51     53062
weighted avg       0.93      0.94      0.92     53062

Accuracy : 0.943330
f1 score : 0.090353



In [239]:
from sklearn.neural_network import MLPClassifier #implements a multi-layer perceptron (MLP) algorithm
MLP = MLPClassifier(solver='lbfgs', alpha=0.001,
                          hidden_layer_sizes=(256,), random_state=1)
MLP.fit(X_train, y_train)
y_pred = MLP.predict(X_test)
print(MLP)
print("model score: %.3f" % MLP.score(X_test, y_test))
print('confusion matrix')
print(metrics.confusion_matrix(y_test, y_pred))
print('classification report')
print(metrics.classification_report(y_test, y_pred))
print('Accuracy : %f' % (metrics.accuracy_score(y_test, y_pred)))
print('f1 score : %f' % (metrics.fbeta_score(y_test, y_pred, beta=0.5)))

  y = column_or_1d(y, warn=True)


MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(256,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
model score: 0.942
confusion matrix
[[49898   133]
 [ 2929   102]]
classification report
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     50031
           1       0.43      0.03      0.06      3031

   micro avg       0.94      0.94      0.94     53062
   macro avg       0.69      0.52      0.52     53062
weighted avg       0.92      0.94      0.92     53062

Accuracy : 0.942294
f1 score : 0.128431


In [248]:
import pickle
filename = '/home/ec2-user/SageMaker/FSI_SMS_TO_CONVERSION/model_and_pipeline/model_KNN.pkl'
pickle.dump(KNN, open(filename, 'wb'))

## DEEP LEARNING MODEL