In [1]:
import pandas as pd
import numpy as np
import os
import glob
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.model_selection import train_test_split,GridSearchCV
import pickle
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import accuracy_score, roc_curve, precision_score, recall_score, confusion_matrix, roc_auc_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import json
# from sklearn.ensemble import CatBoostClassifier

In [2]:
client = bigquery.Client()

print("Client creating using default project: {}".format(client.project))
query = """
 SELECT * FROM `jm-ebg.EDA_MODEL.label_data_jan_to_march_0905`
 """
query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    # location="US",
)  # API request - starts the query
df = query_job.to_dataframe()
df.shape

Client creating using default project: jm-ebg


(102571, 23)

In [3]:

client = bigquery.Client()

print("Client creating using default project: {}".format(client.project))
query = """
 SELECT * FROM `jm-ebg.EDA_MODEL.april_label_back_test_data`
 """
query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    # location="US",
)  # API request - starts the query
df1 = query_job.to_dataframe()
df1.shape

Client creating using default project: jm-ebg


(35661, 44)

In [4]:
df['label'].value_counts()

0    91004
1    11567
Name: label, dtype: Int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102571 entries, 0 to 102570
Data columns (total 23 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ProspectId               102571 non-null  object 
 1   user_pseudo_id           102571 non-null  object 
 2   visitStartTime           102571 non-null  Int64  
 3   event_date               102571 non-null  object 
 4   timeOnSite_past          102571 non-null  float64
 5   day_of_week_past         102571 non-null  Int64  
 6   device_category_past     102571 non-null  object 
 7   mobile_brand_name_past   102119 non-null  object 
 8   operating_system_past    102567 non-null  object 
 9   os_version_past          102571 non-null  object 
 10  browser_past             102567 non-null  object 
 11  region_past              102570 non-null  object 
 12  city_past                102564 non-null  object 
 13  ft_campaign_past         102171 non-null  object 
 14  ft_m

In [6]:
int_cols = list(df.select_dtypes(include=[int]).columns)
int_cols.remove('label')

In [7]:
cat_cols = list(df.select_dtypes(include=['O']).columns)
float_cols = list(df.select_dtypes(include=[float]).columns)

In [8]:
data={}
unwanted = {'(none)','(direct)','(not set)','(Other)'}  

In [9]:
def analysis_count(column_name,number):
    return df.groupby([column_name])[column_name].size().reset_index(name='count') \
                     .sort_values(['count'], ascending=False) \
                     .head(number).reset_index(drop=True)
def analysis_column(column_name,number):
    return df.groupby([column_name])[column_name].size().reset_index(name='count') \
                     .sort_values(['count'], ascending=False) \
                     .head(number)[column_name].tolist()

In [19]:
##data -> dictionary 
# data['placement'] =  analysis_column("placement",150)
# data['placement'] = [i for i in data['placement']  if i not in unwanted]
data['device_category_past'] =  analysis_column('device_category_past',4)
data['device_category_past'] = [i for i in data['device_category_past']  if i not in unwanted] 
data['mobile_brand_name_past'] =  analysis_column('mobile_brand_name_past',10)
data['mobile_brand_name_past'] = [i for i in data['mobile_brand_name_past']  if i not in unwanted] 
data['operating_system_past'] =  analysis_column('operating_system_past',5)
data['operating_system_past'] = [i for i in data['operating_system_past']  if i not in unwanted] 
data['os_version_past'] =  analysis_column('os_version_past',10)
data['os_version_past'] = [i for i in data['os_version_past']  if i not in unwanted] 
data['browser_past'] =  analysis_column('browser_past',8)
data['browser_past'] = [i for i in data['browser_past']  if i not in unwanted]
data['city_past'] =  analysis_column('city_past',160)
data['city_past'] = [i for i in data['city_past']  if i not in unwanted]
data['region_past'] =  analysis_column('region_past',50)
data['region_past'] = [i for i in data['region_past']  if i not in unwanted] 
data['ft_campaign_past'] =  analysis_column('ft_campaign_past',70)
data['ft_campaign_past'] = [i for i in data['ft_campaign_past']  if i not in unwanted]
data['ft_medium_past'] =  analysis_column('ft_medium_past',5)
data['ft_medium_past'] = [i for i in data['ft_medium_past']  if i not in unwanted] 
data['ft_source_past'] =  analysis_column('ft_source_past',10)
data['ft_source_past'] = [i for i in data['ft_source_past']  if i not in unwanted] 
data['utm_term_placement_past'] =  analysis_column('utm_term_placement_past',200)
data['utm_term_placement_past'] = [i for i in data['utm_term_placement_past']  if i not in unwanted]
# data['placement_past'] = [i for i in data['placement_past']  if i not in unwanted]

In [20]:
### preprocess data
# df['placement'].loc[df['placement'].apply(lambda x: (x not in data['placement']) )] = '(Others)'
df['device_category_past'].loc[df['device_category_past'].apply(lambda x: (x not in data['device_category_past']) )] = '(Others)'
df['mobile_brand_name_past'].loc[df['mobile_brand_name_past'].apply(lambda x: (x not in data['mobile_brand_name_past']) )] = '(Others)'
df['operating_system_past'].loc[df['operating_system_past'].apply(lambda x: (x not in data['operating_system_past']) )] = '(Others)'
df['os_version_past'].loc[df['os_version_past'].apply(lambda x: (x not in data['os_version_past']) )] = '(Others)'
df['browser_past'].loc[df['browser_past'].apply(lambda x: (x not in data['browser_past']) )] = '(Others)'
df['city_past'].loc[df['city_past'].apply(lambda x: (x not in data['city_past']) )] = '(Others)'
df['region_past'].loc[df['region_past'].apply(lambda x: (x not in data['region_past']) )] = '(Others)'
df['ft_campaign_past'].loc[df['ft_campaign_past'].apply(lambda x: (x not in data['ft_campaign_past']) )] = '(Others)'
df['ft_medium_past'].loc[df['ft_medium_past'].apply(lambda x: (x not in data['ft_medium_past']) )] = '(Others)'
df['ft_source_past'].loc[df['ft_source_past'].apply(lambda x: (x not in data['ft_source_past']) )] = '(Others)'
df['utm_term_placement_past'].loc[df['utm_term_placement_past'].apply(lambda x: (x not in data['utm_term_placement_past']) )] = '(Others)'
# df['placement_past'].loc[df['placement_past'].apply(lambda x: (x not in data['placement_past']) )] = '(Others)'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [11]:
dict_filename = 'model/cat_dict_esd_RF.pkl'
pickle.dump(data, open(dict_filename, 'wb'))

In [21]:
### replace the string int and float value
for i in cat_cols:
    df[i].fillna("Value_Not_Found", inplace = True)
for i in int_cols:
    df[i].fillna(-99999, inplace = True) 
for i in float_cols:
    df[i].fillna(-99999999.000, inplace = True) 

In [24]:
drop_cols = ['event_date','user_psudo_id_conv', 'ProspectId_conv', 'visitStartTime_conv',
       'date_conv', 'day_diff','device_category_past']
df = df.drop(drop_cols, axis=1)
df = df.drop_duplicates(keep='first')

In [25]:
df['label'].value_counts()

0    91004
1    11567
Name: label, dtype: Int64

In [26]:
cat_cols = list(df.select_dtypes(include=['O']).columns)
cat_cols

['ProspectId',
 'user_pseudo_id',
 'mobile_brand_name_past',
 'operating_system_past',
 'os_version_past',
 'browser_past',
 'region_past',
 'city_past',
 'ft_campaign_past',
 'ft_medium_past',
 'ft_source_past',
 'utm_term_placement_past']

In [27]:
#categorical value handling
le = LabelEncoder()
for  i in cat_cols:
    df[i] = le.fit_transform(df[i])
    pickle.dump(le, open("label_encode/{}-esd-RF.pkl".format(i), 'wb'))

In [28]:
seed = 101
min_class_df = df[df['label']==1]
max_class_df = df[df['label']==0]
n_samples = int(min_class_df.shape[0])
minority_df_sampled = resample(min_class_df, n_samples=n_samples, replace=False, random_state=seed )
majority_df_sampled = resample(max_class_df, n_samples=n_samples, replace=False, random_state=seed)
sampled_df = pd.concat([minority_df_sampled, majority_df_sampled]).sample(frac=1, random_state=seed)

In [29]:
sampled_df['label'].value_counts()

0    11567
1    11567
Name: label, dtype: Int64

In [30]:
X = sampled_df.drop(['label'], axis=1)
y = sampled_df.label

In [31]:
y = y.astype('int')

In [32]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=.25, stratify=y, random_state=seed)

In [33]:
from sklearn.preprocessing import StandardScaler

ss_train = StandardScaler()
X_train = ss_train.fit_transform(X_train)

ss_test = StandardScaler()
X_test = ss_test.fit_transform(X_test)

In [34]:
#Fitting Logistic Regression to the training set  
from sklearn.linear_model import LogisticRegression  
classifier= LogisticRegression(random_state=0,max_iter=1000000)  
classifier.fit(X_train, y_train)  


LogisticRegression(max_iter=1000000, random_state=0)

In [35]:
y_pred= classifier.predict(X_test)

In [36]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_pred, y_test)

TN, FP, FN, TP = confusion_matrix(y_pred, y_test).ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

True Positive(TP)  =  2389
False Positive(FP) =  503
True Negative(TN)  =  2568
False Negative(FN) =  324


In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, f1_score



print ("----------------------")


print("accuracy : ", accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print("roc auc : ", roc_auc_score(y_pred, y_test))
print("recall : ", recall_score(y_pred, y_test))
print("precision : ", precision_score(y_pred, y_test))

----------------------
accuracy :  0.8570193637621023
[[2568  503]
 [ 324 2389]]
roc auc :  0.8583923564472373
recall :  0.8805750092148913
precision :  0.8260719225449515


In [38]:
#Predicting the test set result  
y_pred= classifier.predict(X_test) 
y_pred1= classifier.predict_proba(X_test) 


In [39]:
y_pred=y_test

In [40]:
y_pred1=y_pred1[:,1]

In [41]:
y_pred=list(y_pred)

In [42]:
y_pred1
for i in range(y_pred1.size):
    if y_pred1[i]>0.755:
        y_pred1[i]=1
    elif y_pred1[i]<0.03:
        y_pred1[i]=0
    else:
        y_pred1[i]=-1

In [43]:
y_pred1

array([ 1., -1., -1., ..., -1., -1.,  0.])

In [44]:
a=[]
b=[]
for i in range(y_pred1.size):
    if y_pred1[i]==1:
        a.append(y_pred1[i])
        b.append(y_pred[i])
    elif y_pred1[i]==0:
        a.append(y_pred1[i])
        b.append(y_pred[i])
        

In [45]:
len(a)

3282

In [46]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(a, b)

TN, FP, FN, TP = confusion_matrix(a, b).ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

True Positive(TP)  =  2191
False Positive(FP) =  11
True Negative(TN)  =  966
False Negative(FN) =  114


In [47]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, f1_score



print ("----------------------")


print("accuracy : ", accuracy_score(a, b))
print(confusion_matrix(a, b))
print("roc auc : ", roc_auc_score(a, b))
print("recall : ", recall_score(a, b))
print("precision : ", precision_score(a, b))

----------------------
accuracy :  0.961913467397928
[[ 966   11]
 [ 114 2191]]
roc auc :  0.9696416716807617
recall :  0.9505422993492407
precision :  0.9950045413260672


In [37]:
X.shape

(5792, 24)

In [38]:
y_train.shape

(4344,)

In [39]:
y_test.shape

(1448,)

In [40]:
---------

SyntaxError: invalid syntax (567458762.py, line 1)

In [48]:
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.calibration import CalibratedClassifierCV
logreg = LinearSVC(max_iter=1000000)
model = CalibratedClassifierCV(logreg) 
model.fit(X_train, y_train) 
predictions = model.predict(X_test)

In [49]:
y_proba = model.predict_proba(X_test)

In [50]:
predictions=y_test

In [51]:
y_proba=y_proba[:,1]

In [52]:
predictions=list(predictions)

In [53]:
for i in range(y_pred1.size):
    if y_proba[i]>0.755:
        y_proba[i]=1
    elif y_proba[i]<0.03:
        y_proba[i]=0
    else:
        y_proba[i]=-1

In [54]:
y_proba

array([ 1., -1., -1., ..., -1., -1.,  0.])

In [55]:
c=[]
d=[]
for i in range(y_proba.size):
    if y_proba[i]==1:
        c.append(y_proba[i])
        d.append(predictions[i])
    elif y_proba[i]==0:
        c.append(y_proba[i])
        d.append(predictions[i])
        

In [56]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(c, d)

TN, FP, FN, TP = confusion_matrix(c, d).ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

True Positive(TP)  =  2177
False Positive(FP) =  13
True Negative(TN)  =  849
False Negative(FN) =  123


In [57]:
accuracy =  (TP + TN) / (TP + FP + TN + FN)

print('Accuracy of the binary classifier = {:0.3f}'.format(accuracy))

Accuracy of the binary classifier = 0.957


In [58]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, f1_score



print ("----------------------")


print("accuracy : ", accuracy_score(c, d))
print(confusion_matrix(c, d))
print("roc auc : ", roc_auc_score(c, d))
print("recall : ", recall_score(c, d))
print("precision : ", precision_score(c, d))

----------------------
accuracy :  0.956989247311828
[[ 849   13]
 [ 123 2177]]
roc auc :  0.9657202663169576
recall :  0.9465217391304348
precision :  0.9940639269406393


In [52]:
pip install scikit-optimize

Note: you may need to restart the kernel to use updated packages.


In [53]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Define the hyperparameter search space
search_space = {
    'C': Real(0.001, 100, prior='log-uniform'),
    'penalty': Categorical(['l1', 'l2']),
    'solver': Categorical(['liblinear', 'saga'])
}

# Define the logistic regression model
logreg = LogisticRegression(max_iter=100000)

# Define the Bayesian optimization search using 5-fold cross-validation
bayes_search = BayesSearchCV(
    logreg,
    search_space,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Perform the hyperparameter search
bayes_search.fit(X_train, y_train)

# Evaluate the performance of the model on the test set
y_pred = bayes_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Test accuracy:', accuracy)

# Print the best hyperparameters found by Bayesian optimization
print('Best hyperparameters:', bayes_search.best_params_)

Test accuracy: 0.794889502762431
Best hyperparameters: OrderedDict([('C', 0.020172796349721606), ('penalty', 'l1'), ('solver', 'saga')])


In [None]:
-----------

In [59]:
df1['label'].value_counts()

0    33378
1     2283
Name: label, dtype: Int64

In [60]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35661 entries, 0 to 35660
Data columns (total 44 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ProspectId               35661 non-null  object 
 1   user_pseudo_id           35661 non-null  object 
 2   visitStartTime           35661 non-null  Int64  
 3   event_date               35661 non-null  object 
 4   day_of_week              35661 non-null  Int64  
 5   timeOnSite               35661 non-null  float64
 6   device_category          35661 non-null  object 
 7   mobile_brand_name        35270 non-null  object 
 8   operating_system         35661 non-null  object 
 9   os_version               35661 non-null  object 
 10  browser                  35661 non-null  object 
 11  region                   35661 non-null  object 
 12  city                     35661 non-null  object 
 13  ft_campaign              35619 non-null  object 
 14  ft_medium             

In [61]:
int_cols = list(df1.select_dtypes(include=[int]).columns)
int_cols.remove('label')

In [62]:
cat_cols = list(df1.select_dtypes(include=['O']).columns)
float_cols = list(df1.select_dtypes(include=[float]).columns)

In [63]:
data={}
unwanted = {'(none)','(direct)','(not set)','(Other)'}  

In [66]:
def analysis_count(column_name,number):
    return df1.groupby([column_name])[column_name].size().reset_index(name='count') \
                     .sort_values(['count'], ascending=False) \
                     .head(number).reset_index(drop=True)
def analysis_column(column_name,number):
    return df1.groupby([column_name])[column_name].size().reset_index(name='count') \
                     .sort_values(['count'], ascending=False) \
                     .head(number)[column_name].tolist()

In [69]:
##data -> dictionary 
data['device_category'] =  analysis_column('device_category',5)
data['device_category'] = [i for i in data['device_category']  if i not in unwanted] 
data['mobile_brand_name'] =  analysis_column('mobile_brand_name',10)
data['mobile_brand_name'] = [i for i in data['mobile_brand_name']  if i not in unwanted] 
data['operating_system'] =  analysis_column('operating_system',5)
data['operating_system'] = [i for i in data['operating_system']  if i not in unwanted] 
data['os_version'] =  analysis_column('os_version',10)
data['os_version'] = [i for i in data['os_version']  if i not in unwanted] 
data['browser'] =  analysis_column('browser',8)
data['browser'] = [i for i in data['browser']  if i not in unwanted]
data['city'] =  analysis_column('city',160)
data['city'] = [i for i in data['city']  if i not in unwanted]
data['region'] =  analysis_column('region',50)
data['region'] = [i for i in data['region']  if i not in unwanted] 
data['ft_campaign'] =  analysis_column('ft_campaign',70)
data['ft_campaign'] = [i for i in data['ft_campaign']  if i not in unwanted] 
data['ft_medium'] =  analysis_column('ft_medium',5)
data['ft_medium'] = [i for i in data['ft_medium']  if i not in unwanted] 
data['ft_source'] =  analysis_column('ft_source',10)
data['ft_source'] = [i for i in data['ft_source']  if i not in unwanted] 
data['utm_term_placement'] =  analysis_column('utm_term_placement',200)
data['utm_term_placement'] = [i for i in data['utm_term_placement']  if i not in unwanted]
data['ad_network'] =  analysis_column("ad_network",4)
data['ad_network'] = [i for i in data['ad_network']  if i not in unwanted]
# data['placement'] =  analysis_column("placement",150)
# data['placement'] = [i for i in data['placement']  if i not in unwanted]
data['device_category_past'] =  analysis_column('device_category_past',4)
data['device_category_past'] = [i for i in data['device_category_past']  if i not in unwanted] 
data['mobile_brand_name_past'] =  analysis_column('mobile_brand_name_past',10)
data['mobile_brand_name_past'] = [i for i in data['mobile_brand_name_past']  if i not in unwanted] 
data['operating_system_past'] =  analysis_column('operating_system_past',5)
data['operating_system_past'] = [i for i in data['operating_system_past']  if i not in unwanted] 
data['os_version_past'] =  analysis_column('os_version_past',10)
data['os_version_past'] = [i for i in data['os_version_past']  if i not in unwanted] 
data['browser_past'] =  analysis_column('browser_past',8)
data['browser_past'] = [i for i in data['browser_past']  if i not in unwanted]
data['city_past'] =  analysis_column('city_past',160)
data['city_past'] = [i for i in data['city_past']  if i not in unwanted]
data['region_past'] =  analysis_column('region_past',50)
data['region_past'] = [i for i in data['region_past']  if i not in unwanted] 
data['ft_campaign_past'] =  analysis_column('ft_campaign',70)
data['ft_campaign_past'] = [i for i in data['ft_campaign_past']  if i not in unwanted]
data['ft_medium_past'] =  analysis_column('ft_medium_past',5)
data['ft_medium_past'] = [i for i in data['ft_medium_past']  if i not in unwanted] 
data['ft_source_past'] =  analysis_column('ft_source_past',10)
data['ft_source_past'] = [i for i in data['ft_source_past']  if i not in unwanted] 
data['utm_term_placement_past'] =  analysis_column('utm_term_placement_past',200)
data['utm_term_placement_past'] = [i for i in data['utm_term_placement_past']  if i not in unwanted]
data['ad_network_past'] =  analysis_column('ad_network_past',4)
data['ad_network_past'] = [i for i in data['ad_network_past']  if i not in unwanted]
# data['placement_past'] =  analysis_column('placement_past',150)
# data['placement_past'] = [i for i in data['placement_past']  if i not in unwanted]

In [71]:
### preprocess data
df1['device_category'].loc[df1['device_category'].apply(lambda x: (x not in data['device_category']) )] = '(Others)'
df1['mobile_brand_name'].loc[df1['mobile_brand_name'].apply(lambda x: (x not in data['mobile_brand_name']) )] = '(Others)'
df1['operating_system'].loc[df1['operating_system'].apply(lambda x: (x not in data['operating_system']) )] = '(Others)'
df1['os_version'].loc[df1['os_version'].apply(lambda x: (x not in data['os_version']) )] = '(Others)'
df1['browser'].loc[df1['browser'].apply(lambda x: (x not in data['browser']) )] = '(Others)'
df1['city'].loc[df1['city'].apply(lambda x: (x not in data['city']) )] = '(Others)'
df1['region'].loc[df1['region'].apply(lambda x: (x not in data['region']) )] = '(Others)'
df1['ft_campaign'].loc[df1['ft_campaign'].apply(lambda x: (x not in data['ft_campaign']) )] = '(Others)'
df1['ft_medium'].loc[df1['ft_medium'].apply(lambda x: (x not in data['ft_medium']) )] = '(Others)'
df1['ft_source'].loc[df1['ft_source'].apply(lambda x: (x not in data['ft_source']) )] = '(Others)'
df1['utm_term_placement'].loc[df1['utm_term_placement'].apply(lambda x: (x not in data['utm_term_placement']) )] = '(Others)'
df1['ad_network'].loc[df1['ad_network'].apply(lambda x: (x not in data['ad_network']) )] = '(Others)'
# df['placement'].loc[df['placement'].apply(lambda x: (x not in data['placement']) )] = '(Others)'
df1['device_category_past'].loc[df1['device_category_past'].apply(lambda x: (x not in data['device_category_past']) )] = '(Others)'
df1['mobile_brand_name_past'].loc[df1['mobile_brand_name_past'].apply(lambda x: (x not in data['mobile_brand_name_past']) )] = '(Others)'
df1['operating_system_past'].loc[df1['operating_system_past'].apply(lambda x: (x not in data['operating_system_past']) )] = '(Others)'
df1['os_version_past'].loc[df1['os_version_past'].apply(lambda x: (x not in data['os_version_past']) )] = '(Others)'
df1['browser_past'].loc[df1['browser_past'].apply(lambda x: (x not in data['browser_past']) )] = '(Others)'
df1['city_past'].loc[df1['city_past'].apply(lambda x: (x not in data['city_past']) )] = '(Others)'
df1['region_past'].loc[df1['region_past'].apply(lambda x: (x not in data['region_past']) )] = '(Others)'
df1['ft_campaign_past'].loc[df1['ft_campaign_past'].apply(lambda x: (x not in data['ft_campaign_past']) )] = '(Others)'
df1['ft_medium_past'].loc[df1['ft_medium_past'].apply(lambda x: (x not in data['ft_medium_past']) )] = '(Others)'
df1['ft_source_past'].loc[df1['ft_source_past'].apply(lambda x: (x not in data['ft_source_past']) )] = '(Others)'
df1['utm_term_placement_past'].loc[df1['utm_term_placement_past'].apply(lambda x: (x not in data['utm_term_placement_past']) )] = '(Others)'
df1['ad_network_past'].loc[df1['ad_network_past'].apply(lambda x: (x not in data['ad_network_past']) )] = '(Others)'
# df['placement_past'].loc[df['placement_past'].apply(lambda x: (x not in data['placement_past']) )] = '(Others)'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [72]:
### replace the string int and float value
for i in cat_cols:
    df1[i].fillna("Value_Not_Found", inplace = True)
for i in int_cols:
    df1[i].fillna(-99999, inplace = True) 
for i in float_cols:
    df1[i].fillna(-99999999.000, inplace = True) 

In [73]:
drop_cols = ['ProspectId', 'user_pseudo_id', 'visitStartTime', 'event_date','ProspectID_Past', 'user_pseudo_id_past', 'visitStartTime_past','date_past','user_psudo_id_conv', 'ProspectId_conv', 'visitStartTime_conv',
       'date_conv', 'day_diff','ad_network','ad_network_past','ad_group','ad_group_past','device_category_past','device_category']
df1 = df1.drop(drop_cols, axis=1)
df1 = df1.drop_duplicates(keep='first')

In [74]:
df1['label'].value_counts()

0    25186
1     1737
Name: label, dtype: Int64

In [75]:
cat_cols = list(df1.select_dtypes(include=['O']).columns)
cat_cols

['mobile_brand_name',
 'operating_system',
 'os_version',
 'browser',
 'region',
 'city',
 'ft_campaign',
 'ft_medium',
 'ft_source',
 'utm_term_placement',
 'mobile_brand_name_past',
 'operating_system_past',
 'os_version_past',
 'browser_past',
 'region_past',
 'city_past',
 'ft_campaign_past',
 'ft_medium_past',
 'ft_source_past',
 'utm_term_placement_past']

In [77]:
#categorical value handling
le = LabelEncoder()
for  i in cat_cols:
    df1[i] = le.fit_transform(df1[i])
    pickle.dump(le, open("label_encode/{}-esd-RF.pkl".format(i), 'wb'))

In [78]:
seed = 101
min_class_df = df1[df1['label']==1]
max_class_df = df1[df1['label']==0]
n_samples = int(min_class_df.shape[0])
minority_df_sampled = resample(min_class_df, n_samples=n_samples, replace=False, random_state=seed )
majority_df_sampled = resample(max_class_df, n_samples=n_samples, replace=False, random_state=seed)
sampled_df = pd.concat([minority_df_sampled, majority_df_sampled]).sample(frac=1, random_state=seed)

In [79]:
sampled_df['label'].value_counts()

1    1737
0    1737
Name: label, dtype: Int64

In [80]:
X = sampled_df.drop(['label'], axis=1)
y = sampled_df.label

In [81]:
y_pre=classifier.predict(X)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


ValueError: X has 24 features, but LogisticRegression is expecting 15 features as input.

In [82]:
X.head()

Unnamed: 0,day_of_week,timeOnSite,mobile_brand_name,operating_system,os_version,browser,region,city,ft_campaign,ft_medium,...,mobile_brand_name_past,operating_system_past,os_version_past,browser_past,region_past,city_past,ft_campaign_past,ft_medium_past,ft_source_past,utm_term_placement_past
5401,2,0.062667,9,1,8,2,27,123,19,1,...,9,1,8,2,27,119,19,1,5,0
4421,2,0.0,9,1,8,2,27,123,19,1,...,9,1,8,2,27,119,19,1,5,0
35358,5,0.314915,9,1,5,1,2,158,18,1,...,9,1,5,1,2,158,18,1,5,0
12546,4,0.254657,6,1,8,2,27,95,68,2,...,6,1,8,2,27,91,68,2,3,0
24158,6,0.113424,5,1,8,2,50,78,0,0,...,5,1,8,2,50,71,0,0,0,0


In [83]:
df.head()

Unnamed: 0,ProspectId,user_pseudo_id,visitStartTime,timeOnSite_past,day_of_week_past,mobile_brand_name_past,operating_system_past,os_version_past,browser_past,region_past,city_past,ft_campaign_past,ft_medium_past,ft_source_past,utm_term_placement_past,label
0,72,16309,1675325695,0.682301,5,0,4,0,5,46,3,48,1,6,71,0
1,110,10502,1675582360,35.725837,1,4,1,5,2,26,102,0,0,0,0,0
2,145,11419,1675254861,0.462754,4,6,1,3,2,42,32,39,1,6,142,0
3,170,5231,1677211898,24.002012,6,9,1,3,2,43,61,65,1,6,0,0
4,209,3902,1678425248,4.844952,6,9,1,6,2,26,102,0,0,0,0,0
