In [24]:
import pandas as pd
import numpy as np
import os
import glob
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.model_selection import train_test_split,GridSearchCV
import pickle
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import accuracy_score, roc_curve, precision_score, recall_score, confusion_matrix, roc_auc_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import json
# from sklearn.ensemble import CatBoostClassifier

In [25]:
client = bigquery.Client()

print("Client creating using default project: {}".format(client.project))
query = """
 SELECT * FROM `jm-ebg.EDA_MODEL.Final_Label_data`
 """
query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    # location="US",
)  # API request - starts the query
df = query_job.to_dataframe()
df.shape

Client creating using default project: jm-ebg


(86197, 44)

In [26]:
df['label'].value_counts()

0    77264
1     8933
Name: label, dtype: Int64

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86197 entries, 0 to 86196
Data columns (total 44 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ProspectId               86197 non-null  object 
 1   user_pseudo_id           86197 non-null  object 
 2   visitStartTime           86197 non-null  Int64  
 3   event_date               86197 non-null  object 
 4   day_of_week              86197 non-null  Int64  
 5   timeOnSite               86197 non-null  float64
 6   device_category          86197 non-null  object 
 7   mobile_brand_name        85804 non-null  object 
 8   operating_system         86193 non-null  object 
 9   os_version               86197 non-null  object 
 10  browser                  86193 non-null  object 
 11  region                   86197 non-null  object 
 12  city                     86197 non-null  object 
 13  ft_campaign              86144 non-null  object 
 14  ft_medium             

In [28]:
int_cols = list(df.select_dtypes(include=[int]).columns)
int_cols.remove('label')

In [29]:
cat_cols = list(df.select_dtypes(include=['O']).columns)
float_cols = list(df.select_dtypes(include=[float]).columns)

In [30]:
data={}
unwanted = {'(none)','(direct)','(not set)','(Other)'}  

In [31]:
def analysis_count(column_name,number):
    return df.groupby([column_name])[column_name].size().reset_index(name='count') \
                     .sort_values(['count'], ascending=False) \
                     .head(number).reset_index(drop=True)
def analysis_column(column_name,number):
    return df.groupby([column_name])[column_name].size().reset_index(name='count') \
                     .sort_values(['count'], ascending=False) \
                     .head(number)[column_name].tolist()

In [32]:
##data -> dictionary 
data['device_category'] =  analysis_column('device_category',5)
data['device_category'] = [i for i in data['device_category']  if i not in unwanted] 
data['mobile_brand_name'] =  analysis_column('mobile_brand_name',10)
data['mobile_brand_name'] = [i for i in data['mobile_brand_name']  if i not in unwanted] 
data['operating_system'] =  analysis_column('operating_system',5)
data['operating_system'] = [i for i in data['operating_system']  if i not in unwanted] 
data['os_version'] =  analysis_column('os_version',10)
data['os_version'] = [i for i in data['os_version']  if i not in unwanted] 
data['browser'] =  analysis_column('browser',8)
data['browser'] = [i for i in data['browser']  if i not in unwanted]
data['city'] =  analysis_column('city',160)
data['city'] = [i for i in data['city']  if i not in unwanted]
data['region'] =  analysis_column('region',50)
data['region'] = [i for i in data['region']  if i not in unwanted] 
data['ft_campaign'] =  analysis_column('ft_campaign',70)
data['ft_campaign'] = [i for i in data['ft_campaign']  if i not in unwanted] 
data['ft_medium'] =  analysis_column('ft_medium',5)
data['ft_medium'] = [i for i in data['ft_medium']  if i not in unwanted] 
data['ft_source'] =  analysis_column('ft_source',10)
data['ft_source'] = [i for i in data['ft_source']  if i not in unwanted] 
data['utm_term_placement'] =  analysis_column('utm_term_placement',200)
data['utm_term_placement'] = [i for i in data['utm_term_placement']  if i not in unwanted]
data['ad_network'] =  analysis_column("ad_network",4)
data['ad_network'] = [i for i in data['ad_network']  if i not in unwanted]
# data['placement'] =  analysis_column("placement",150)
# data['placement'] = [i for i in data['placement']  if i not in unwanted]
data['device_category_past'] =  analysis_column('device_category_past',4)
data['device_category_past'] = [i for i in data['device_category_past']  if i not in unwanted] 
data['mobile_brand_name_past'] =  analysis_column('mobile_brand_name_past',10)
data['mobile_brand_name_past'] = [i for i in data['mobile_brand_name_past']  if i not in unwanted] 
data['operating_system_past'] =  analysis_column('operating_system_past',5)
data['operating_system_past'] = [i for i in data['operating_system_past']  if i not in unwanted] 
data['os_version_past'] =  analysis_column('os_version_past',10)
data['os_version_past'] = [i for i in data['os_version_past']  if i not in unwanted] 
data['browser_past'] =  analysis_column('browser_past',8)
data['browser_past'] = [i for i in data['browser_past']  if i not in unwanted]
data['city_past'] =  analysis_column('city_past',160)
data['city_past'] = [i for i in data['city_past']  if i not in unwanted]
data['region_past'] =  analysis_column('region_past',50)
data['region_past'] = [i for i in data['region_past']  if i not in unwanted] 
data['ft_campaign_past'] =  analysis_column('ft_campaign',70)
data['ft_campaign_past'] = [i for i in data['ft_campaign_past']  if i not in unwanted]
data['ft_medium_past'] =  analysis_column('ft_medium_past',5)
data['ft_medium_past'] = [i for i in data['ft_medium_past']  if i not in unwanted] 
data['ft_source_past'] =  analysis_column('ft_source_past',10)
data['ft_source_past'] = [i for i in data['ft_source_past']  if i not in unwanted] 
data['utm_term_placement_past'] =  analysis_column('utm_term_placement_past',200)
data['utm_term_placement_past'] = [i for i in data['utm_term_placement_past']  if i not in unwanted]
data['ad_network_past'] =  analysis_column('ad_network_past',4)
data['ad_network_past'] = [i for i in data['ad_network_past']  if i not in unwanted]
# data['placement_past'] =  analysis_column('placement_past',150)
# data['placement_past'] = [i for i in data['placement_past']  if i not in unwanted]

In [33]:
### preprocess data
df['device_category'].loc[df['device_category'].apply(lambda x: (x not in data['device_category']) )] = '(Others)'
df['mobile_brand_name'].loc[df['mobile_brand_name'].apply(lambda x: (x not in data['mobile_brand_name']) )] = '(Others)'
df['operating_system'].loc[df['operating_system'].apply(lambda x: (x not in data['operating_system']) )] = '(Others)'
df['os_version'].loc[df['os_version'].apply(lambda x: (x not in data['os_version']) )] = '(Others)'
df['browser'].loc[df['browser'].apply(lambda x: (x not in data['browser']) )] = '(Others)'
df['city'].loc[df['city'].apply(lambda x: (x not in data['city']) )] = '(Others)'
df['region'].loc[df['region'].apply(lambda x: (x not in data['region']) )] = '(Others)'
df['ft_campaign'].loc[df['ft_campaign'].apply(lambda x: (x not in data['ft_campaign']) )] = '(Others)'
df['ft_medium'].loc[df['ft_medium'].apply(lambda x: (x not in data['ft_medium']) )] = '(Others)'
df['ft_source'].loc[df['ft_source'].apply(lambda x: (x not in data['ft_source']) )] = '(Others)'
df['utm_term_placement'].loc[df['utm_term_placement'].apply(lambda x: (x not in data['utm_term_placement']) )] = '(Others)'
df['ad_network'].loc[df['ad_network'].apply(lambda x: (x not in data['ad_network']) )] = '(Others)'
# df['placement'].loc[df['placement'].apply(lambda x: (x not in data['placement']) )] = '(Others)'
df['device_category_past'].loc[df['device_category_past'].apply(lambda x: (x not in data['device_category_past']) )] = '(Others)'
df['mobile_brand_name_past'].loc[df['mobile_brand_name_past'].apply(lambda x: (x not in data['mobile_brand_name_past']) )] = '(Others)'
df['operating_system_past'].loc[df['operating_system_past'].apply(lambda x: (x not in data['operating_system_past']) )] = '(Others)'
df['os_version_past'].loc[df['os_version_past'].apply(lambda x: (x not in data['os_version_past']) )] = '(Others)'
df['browser_past'].loc[df['browser_past'].apply(lambda x: (x not in data['browser_past']) )] = '(Others)'
df['city_past'].loc[df['city_past'].apply(lambda x: (x not in data['city_past']) )] = '(Others)'
df['region_past'].loc[df['region_past'].apply(lambda x: (x not in data['region_past']) )] = '(Others)'
df['ft_campaign_past'].loc[df['ft_campaign_past'].apply(lambda x: (x not in data['ft_campaign_past']) )] = '(Others)'
df['ft_medium_past'].loc[df['ft_medium_past'].apply(lambda x: (x not in data['ft_medium_past']) )] = '(Others)'
df['ft_source_past'].loc[df['ft_source_past'].apply(lambda x: (x not in data['ft_source_past']) )] = '(Others)'
df['utm_term_placement_past'].loc[df['utm_term_placement_past'].apply(lambda x: (x not in data['utm_term_placement_past']) )] = '(Others)'
df['ad_network_past'].loc[df['ad_network_past'].apply(lambda x: (x not in data['ad_network_past']) )] = '(Others)'
# df['placement_past'].loc[df['placement_past'].apply(lambda x: (x not in data['placement_past']) )] = '(Others)'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [11]:
dict_filename = 'model/cat_dict_esd_RF.pkl'
pickle.dump(data, open(dict_filename, 'wb'))

In [34]:
### replace the string int and float value
for i in cat_cols:
    df[i].fillna("Value_Not_Found", inplace = True)
for i in int_cols:
    df[i].fillna(-99999, inplace = True) 
for i in float_cols:
    df[i].fillna(-99999999.000, inplace = True) 

In [35]:
drop_cols = ['ProspectId', 'user_pseudo_id', 'visitStartTime', 'event_date','ProspectID_Past', 'user_pseudo_id_past', 'visitStartTime_past','date_past','user_psudo_id_conv', 'ProspectId_conv', 'visitStartTime_conv',
       'date_conv', 'day_diff','ad_network','ad_network_past','ad_group','ad_group_past','device_category_past','device_category']
df = df.drop(drop_cols, axis=1)
df = df.drop_duplicates(keep='first')

In [36]:
df['label'].value_counts()

0    46424
1     2896
Name: label, dtype: Int64

In [37]:
cat_cols = list(df.select_dtypes(include=['O']).columns)
cat_cols

['mobile_brand_name',
 'operating_system',
 'os_version',
 'browser',
 'region',
 'city',
 'ft_campaign',
 'ft_medium',
 'ft_source',
 'utm_term_placement',
 'mobile_brand_name_past',
 'operating_system_past',
 'os_version_past',
 'browser_past',
 'region_past',
 'city_past',
 'ft_campaign_past',
 'ft_medium_past',
 'ft_source_past',
 'utm_term_placement_past']

In [38]:
#categorical value handling
le = LabelEncoder()
for  i in cat_cols:
    df[i] = le.fit_transform(df[i])
    pickle.dump(le, open("label_encode/{}-esd-RF.pkl".format(i), 'wb'))

In [39]:
X = df.drop(['label'], axis=1)
y = df.label

In [40]:
y=np.floor(pd.to_numeric(y, errors='coerce')).astype('Int64')
import numpy as np
y = y.fillna(0).astype(np.int64, errors='ignore')
X = X.fillna(0).astype(np.int64, errors='ignore')

In [41]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [42]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=.25, stratify=y, random_state=101)

In [43]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [22]:
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 40, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


exception calling callback for <Future at 0x7f1434789250 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/joblib/externals/loky/_base.py", line 26, in _invoke_callbacks
    callback(self)
  File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 385, in __call__
    self.parallel.dispatch_next()
  File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 834, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 556, in apply_async
    future = self._workers.submit(SafeFunction(func))
  File "/opt/conda/lib/p

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
rf_random.best_params_

In [None]:
rf_random.best_estimator_

In [44]:
clf1 = RandomForestClassifier(bootstrap=False, max_depth=70, max_features='sqrt',
                       min_samples_leaf=2, min_samples_split=10,
                       n_estimators=1200)
clf1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, max_depth=70, max_features='sqrt',
                       min_samples_leaf=2, min_samples_split=10,
                       n_estimators=1200)

In [45]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, f1_score

y_pred = clf1.predict(X_train.values)
#y_pred = [1 if val >= 0.50 else 0 for val in y_pred_prob[:,1]]
f1 = f1_score(y_train, y_pred)
print("accuracy : ", accuracy_score(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))
print("roc auc : ", roc_auc_score(y_train, y_pred))
print("recall : ", recall_score(y_train, y_pred))
print("precision : ", precision_score(y_train, y_pred))
print("f1 score : ",f1 )

print ("----------------------")


y_pred = clf1.predict(X_test.values)
#y_pred = [1 if val >= 0.50 else 0 for val in y_pred_prob[:,1]]
f1 = f1_score(y_test, y_pred)
print("accuracy : ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("roc auc : ", roc_auc_score(y_test, y_pred))
print("recall : ", recall_score(y_test, y_pred))
print("precision : ", precision_score(y_test, y_pred))
print("f1 score : ",f1)

  "X does not have valid feature names, but"


accuracy :  0.9742949049342294
[[33418  1400]
 [  390 34428]]
roc auc :  0.9742949049342294
recall :  0.9887988971221782
precision :  0.960924416657363
f1 score :  0.9746624012682954
----------------------


  "X does not have valid feature names, but"


accuracy :  0.955583318972945
[[10936   670]
 [  361 11245]]
roc auc :  0.9555833189729451
recall :  0.9688953989315872
precision :  0.9437683592110785
f1 score :  0.9561668296415968


In [None]:
clf1


In [30]:
filename = 'model/RF_Dropped_v1.pkl'
pickle.dump(clf1, open(filename, 'wb'))

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid_1 = {
    'bootstrap': [False],
    'max_depth': [15, 25, 30, 50],
    'max_features': [2, 3,7],
    'min_samples_leaf': [1,3, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid_1, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
grid_search.fit(X_train, y_train)
grid_search.best_params_

In [33]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid,X_test, y_test)

In [34]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=False, max_depth=15, max_features=2,
                       min_samples_split=8, n_estimators=300)

In [51]:
clf2 = RandomForestClassifier(n_estimators=300, max_depth=15,min_samples_split =8,max_features=2,bootstrap=False)
clf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, max_depth=15, max_features=2,
                       min_samples_split=8, n_estimators=300)

In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, f1_score

y_pred = clf2.predict(X_train.values)
#y_pred = [1 if val >= 0.50 else 0 for val in y_pred_prob[:,1]]
f1 = f1_score(y_train, y_pred)
print("accuracy : ", accuracy_score(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))
print("roc auc : ", roc_auc_score(y_train, y_pred))
print("recall : ", recall_score(y_train, y_pred))
print("precision : ", precision_score(y_train, y_pred))
print("f1 score : ",f1 )

print ("----------------------")


y_pred = clf2.predict(X_test.values)
#y_pred = [1 if val >= 0.50 else 0 for val in y_pred_prob[:,1]]
f1 = f1_score(y_test, y_pred)
print("accuracy : ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("roc auc : ", roc_auc_score(y_test, y_pred))
print("recall : ", recall_score(y_test, y_pred))
print("precision : ", precision_score(y_test, y_pred))
print("f1 score : ",f1)

  "X does not have valid feature names, but"


accuracy :  0.9241053478086048
[[31204  3614]
 [ 1671 33147]]
roc auc :  0.9241053478086048
recall :  0.952007582285025
precision :  0.9016892902804603
f1 score :  0.9261654954665475
----------------------


  "X does not have valid feature names, but"


accuracy :  0.9162502154058245
[[10323  1283]
 [  661 10945]]
roc auc :  0.9162502154058245
recall :  0.9430466999827676
precision :  0.8950768727510632
f1 score :  0.9184358479483091


In [39]:
filename = 'model/RF_hyperparameter_1.pkl'

pickle.dump(clf1, open(filename, 'wb'))

In [53]:
!pip install lazypredict
from lazypredict.Supervised import LazyClassifier



In [None]:
clf3 = LazyClassifier(verbose=0,ignore_warnings=True)
models, predictions = clf3.fit(X_train, X_test, y_train, y_test)
models

 59%|█████▊    | 17/29 [01:16<00:32,  2.69s/it]

In [24]:
!pip install lightgbm



In [25]:
from lightgbm import LGBMClassifier
clf4 = LGBMClassifier()
# model.fit(x_train, y_train)
# clf = RandomForestClassifier(n_estimators=300, max_depth=15,min_samples_split =8,max_features=2,bootstrap=False)
clf4.fit(X_train, y_train)

LGBMClassifier()

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, f1_score

y_pred = clf4.predict(X_train.values)
#y_pred = [1 if val >= 0.50 else 0 for val in y_pred_prob[:,1]]
f1 = f1_score(y_train, y_pred)
print("accuracy : ", accuracy_score(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))
print("roc auc : ", roc_auc_score(y_train, y_pred))
print("recall : ", recall_score(y_train, y_pred))
print("precision : ", precision_score(y_train, y_pred))
print("f1 score : ",f1 )

print ("----------------------")


y_pred = clf4.predict(X_test.values)
#y_pred = [1 if val >= 0.50 else 0 for val in y_pred_prob[:,1]]
f1 = f1_score(y_test, y_pred)
print("accuracy : ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("roc auc : ", roc_auc_score(y_test, y_pred))
print("recall : ", recall_score(y_test, y_pred))
print("precision : ", precision_score(y_test, y_pred))
print("f1 score : ",f1)

accuracy :  0.9172554425871675
[[30774  4044]
 [ 1718 33100]]
roc auc :  0.9172554425871675
recall :  0.9506577057843644
precision :  0.891126426879173
f1 score :  0.9199299630360468
----------------------
accuracy :  0.9119420989143546
[[10197  1409]
 [  635 10971]]
roc auc :  0.9119420989143546
recall :  0.9452869205583319
precision :  0.8861873990306947
f1 score :  0.9147836237805387


In [53]:
filename = 'model/LGBM.pkl'
pickle.dump(clf1, open(filename, 'wb'))