In [94]:
import dill
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
import time
import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import json
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

## funcs

In [2]:
def remove_outliers(df: pd.DataFrame) -> pd.DataFrame:
    def calculate_outliers(data):
        q25 = data.quantile(0.25)
        q75 = data.quantile(0.75)
        iqr = q75 - q25
        bounds = (q25 - 1.5 * iqr, q75 + 1.5 * iqr)
        return bounds

    df = df.copy()
    boundaries = calculate_outliers(df['year'])
    df.loc[df['year'] < boundaries[0], 'year'] = round(boundaries[0])
    df.loc[df['year'] > boundaries[1], 'year'] = round(boundaries[1])
    return df

In [3]:
def cols_to_drop_list_obvious(df3):
    print('% of empties')
    cols_to_drop = []
    for col in df3.columns:
        print(col, ' - ', round(len(df3[col][df3[col].isna() == True]) / len(df3[col]), 2))
        if len(df3[col][df3[col].isna() == True]) / len(df3[col]) > 0.9:
            cols_to_drop.append(col)
    return cols_to_drop

def show_empties_percentage(df3):
    print('% of empties')
    #cols_to_drop = []
    for col in df3.columns:
        if round(len(df3[col][df3[col].isna() == True]) / len(df3[col]), 3) != 0:
            print(col, ' - ', round(len(df3[col][df3[col].isna() == True]) / len(df3[col]), 3))
    b = 0
    return b


In [4]:
def show_what_else_to_do(df3):
    counter = 0
    for feature in df3.columns:
        if df3[feature].dtype != 'O':
            #print(feature, ' - ', df3[feature].dtype)
            counter += 1
        else:
            print(feature)
    print(counter == len(df3.columns))
    return counter

In [5]:
def show_all_about_column(df3, column):
    print('number of empty rows: ',  len(df3[df3[column].isna() == True]))
    print('number of unique entries: ',  len(df3[column].unique()))
    pass 

!pip install fastparquet


!pip install pyarrow

## preparartions

In [6]:
df = pd.read_csv('data/ga_hits.csv')

In [7]:
df2 = pd.read_csv('data/ga_sessions.csv')

  df2 = pd.read_csv('data/ga_sessions.csv')


In [109]:
df3 = pd.merge(df, df2, on='session_id')

In [110]:
target_action = ['sub_car_claim_click', 'sub_car_claim_submit_click',
'sub_open_dialog_click', 'sub_custom_question_submit_click',
'sub_call_number_click', 'sub_callback_submit_click', 'sub_submit_success',
'sub_car_request_submit_click']

In [111]:
cols_to_drop = cols_to_drop_list_obvious(df3)

% of empties
session_id  -  0.0
hit_date  -  0.0
hit_time  -  0.58
hit_number  -  0.0
hit_type  -  0.0
hit_referer  -  0.4
hit_page_path  -  0.0
event_category  -  0.0
event_action  -  0.0
event_label  -  0.24
event_value  -  1.0
client_id  -  0.0
visit_date  -  0.0
visit_time  -  0.0
visit_number  -  0.0
utm_source  -  0.0
utm_medium  -  0.0
utm_campaign  -  0.14
utm_adcontent  -  0.18
utm_keyword  -  0.59
device_category  -  0.0
device_os  -  0.58
device_brand  -  0.25
device_model  -  0.99
device_screen_resolution  -  0.0
device_browser  -  0.0
geo_country  -  0.0
geo_city  -  0.0


In [112]:
df3.columns

Index(['session_id', 'hit_date', 'hit_time', 'hit_number', 'hit_type',
       'hit_referer', 'hit_page_path', 'event_category', 'event_action',
       'event_label', 'event_value', 'client_id', 'visit_date', 'visit_time',
       'visit_number', 'utm_source', 'utm_medium', 'utm_campaign',
       'utm_adcontent', 'utm_keyword', 'device_category', 'device_os',
       'device_brand', 'device_model', 'device_screen_resolution',
       'device_browser', 'geo_country', 'geo_city'],
      dtype='object')

In [113]:
df3['target_action'] = df3['event_action'].apply(lambda x: 1 if x in target_action else 0)

In [114]:
cols_to_drop

['event_value', 'device_model']

In [115]:
cols_to_drop = []
for col in df.columns:
    cols_to_drop.append(str(col))

In [116]:
df3 = df3.drop(cols_to_drop, axis=1)

In [117]:
50000 / len(df3)

0.0031877144973238816

In [118]:
df3_pos = df3[df3['target_action'] == 1].sample(50000)

In [119]:
df3_neg = df3[df3['target_action'] == 0].sample(50000)

In [120]:
df3_pos = df3_pos.reset_index()

In [121]:
df3_neg = df3_neg.reset_index()

In [122]:
df3_pos = df3_pos.drop('index', axis=1)
df3_neg = df3_neg.drop('index', axis=1)

In [123]:
df3 = pd.concat([df3_pos, df3_neg])

In [124]:
#df3 = df3.sample(frac= 100000 / len(df3))

In [125]:
df3.loc[df3.utm_campaign.isna(), 'utm_campaign'] = 'other'

In [126]:

#utm_c_frec_dict = {}
#counter = 1
#for pos in df3.utm_campaign.unique():
#    if len(df3[(df3.utm_campaign == pos) & (df3.target_action == 1)]) == 0:
#        utm_c_frec_dict[str(pos)] = 0
#    else:
#        utm_c_frec_dict[str(pos)] = round(len(df3[(df3.utm_campaign == pos) & (df3.target_action == 1)]) / len(df3[df3.utm_campaign == pos]), 5)
#   print(counter)
#   counter = counter  + 1

with open('data/utm_c_frec_dict.json', 'w') as f:
    json.dump(utm_c_frec_dict, f)

In [127]:
with open('data/utm_c_frec_dict.json', 'r') as f:
    utm_c_frec_dict = json.load(f)



In [128]:
df3['camp_succ_rate'] = df3.utm_campaign.apply(lambda x: utm_c_frec_dict[str(x)])

In [129]:
show_empties_percentage(df3)

% of empties
utm_adcontent  -  0.185
utm_keyword  -  0.625
device_os  -  0.622
device_brand  -  0.249
device_model  -  0.993


0

In [130]:
df3['new_date'] = pd.to_datetime(df3['visit_date'])
df3['day_of_week'] = df3.new_date.dt.dayofweek

In [132]:
cols_to_drop = ['client_id','new_date', 'visit_date', 'utm_keyword', 'device_os', 'device_model', 'visit_time']


In [133]:
df3 = df3.drop(cols_to_drop,axis=1)

df3 = df3.drop('visit_date',axis=1)

df3 = df3.drop('utm_keyword',axis=1)

df3 = df3.drop('device_os',axis=1)

df3 = df3.drop('device_model',axis=1)

df3 = df3.drop('visit_time',axis=1)

In [134]:
show_what_else_to_do(df3)

utm_source
utm_medium
utm_campaign
utm_adcontent
device_category
device_brand
device_screen_resolution
device_browser
geo_country
geo_city
False


4

In [135]:
df3.loc[df3.utm_source.isna() == True, 'utm_source'] = 'other'

In [136]:
cols_to_encode = ['utm_source', 
                  'utm_medium', 
                  'utm_adcontent', 
                  'device_brand', 
                  'device_category', 
                  'device_screen_resolution', 
                  'device_browser',
                  'utm_campaign',
                  'geo_country',
                  'geo_city'
                 ]

In [137]:
show_all_about_column(df3, 'utm_adcontent')

number of empty rows:  18532
number of unique entries:  146


In [138]:
df3.loc[df3.utm_adcontent.isna() == True, 'utm_adcontent'] = 'Other'

In [139]:
show_all_about_column(df3,'device_category')

number of empty rows:  0
number of unique entries:  3


In [140]:
df3.loc[df3.device_brand.isna() == True, 'device_brand'] = 'other'

In [141]:
counter = 0
for feature in df3.columns:
    if df3[feature].dtype != 'O':
        #print(feature, ' - ', df3[feature].dtype)
        counter += 1
    else:
        print(feature)
print(counter == len(df3.columns))

utm_source
utm_medium
utm_campaign
utm_adcontent
device_category
device_brand
device_screen_resolution
device_browser
geo_country
geo_city
False


df3['resolution'] = df3.device_screen_resolution.apply(lambda x:eval(x.replace('x','*')))

In [143]:

bounds = []

for device in df3.device_category.unique():

    q25 = df3[df3.device_category == device].resolution.quantile(0.25)
    q75 = df3[df3.device_category == device].resolution.quantile(0.75)
    iqr = q75 - q25

    bounds.append((device, q25 - 1.5 * iqr, q75 + 1.5 * iqr))
    
        
resolution = []


tst_l = []
counter = 1
for i in range(len(df3.device_category)):
    tst_l.append((df3.device_category[i], eval(df3.device_screen_resolution[i].replace('x','*'))))
    if counter == 1000000:
        print(counter)
    counter +=1


In [144]:
test_list = list(df3.device_screen_resolution)


In [145]:
test_list2 = list(df3.device_category)
len(test_list2)

100000

In [146]:
for i in range(len(test_list)):
    test_list[i] = eval(test_list[i].replace('x','*'))

In [147]:
tst_l = list(zip(test_list2, test_list))

In [148]:
for i in range(len(tst_l)):
    if tst_l[i][0] == bounds[0][0]:
        resolution.append(bounds[0][0]+'_high' if tst_l[i][1] >= bounds[0][2] * 0.7 else (bounds[0][0]+'_medium' if bounds[0][2] * 0.7 > tst_l[i][1] >= bounds[0][2] * 0.1 else bounds[0][0]+'_low'))
    elif tst_l[i][0] == bounds[1][0]:
        resolution.append(bounds[1][0]+'_high' if tst_l[i][1] >= bounds[1][2] * 0.7 else (bounds[1][0]+'_medium' if bounds[1][2] * 0.7 > tst_l[i][1] >= bounds[1][2] * 0.1 else bounds[1][0]+'_low'))
    elif tst_l[i][0] == bounds[2][0]:
        resolution.append(bounds[2][0]+'_high' if tst_l[i][1] >= bounds[2][2] * 0.7 else (bounds[2][0]+'_medium' if bounds[2][2] * 0.7 > tst_l[i][1] >= bounds[2][2] * 0.1 else bounds[2][0]+'_low'))
df3['device_screen_resolution'] = resolution


df3 = df3.drop('resolution', axis=1)

show_what_else_to_do(df3)

show_all_about_column(df3, 'device_browser')

In [152]:
country_list = list(df3.geo_country.unique())

In [153]:
for item in country_list:
    print(item, ' - ', len(df3[df3.geo_country == item]))

Russia  -  97751
Ukraine  -  401
Egypt  -  12
France  -  56
Netherlands  -  76
South Korea  -  4
Germany  -  162
Armenia  -  46
Uzbekistan  -  67
Sweden  -  41
Spain  -  35
Cyprus  -  52
Finland  -  35
Belarus  -  123
Croatia  -  10
Georgia  -  51
Canada  -  16
United Arab Emirates  -  28
Morocco  -  20
Kazakhstan  -  88
Italy  -  24
Norway  -  9
United States  -  177
Turkey  -  103
Kyrgyzstan  -  44
Cuba  -  2
Israel  -  25
Estonia  -  21
United Kingdom  -  84
Bulgaria  -  16
Switzerland  -  18
(not set)  -  49
Slovenia  -  4
Azerbaijan  -  19
Iraq  -  14
Albania  -  1
Philippines  -  2
Dominican Republic  -  10
Montenegro  -  13
Saudi Arabia  -  6
Brazil  -  9
Poland  -  26
Greece  -  18
China  -  5
Latvia  -  16
Serbia  -  7
Indonesia  -  4
Afghanistan  -  4
Pakistan  -  4
Maldives  -  8
Lithuania  -  12
Nigeria  -  1
Tajikistan  -  20
Czechia  -  11
Romania  -  11
Austria  -  11
Hungary  -  5
Moldova  -  12
Iran  -  5
South Africa  -  2
Seychelles  -  4
Tunisia  -  1
Belgium  -  2


In [154]:
len(df3)*0.004

400.0

In [155]:
trsh_5_pct = 0.01#len(df3)*0.005
df3_len = len(df3) 
for item in country_list:
    if len(df3[df3.geo_country == item]) / df3_len >= trsh_5_pct:
        continue
    else:
        df3.loc[df3.geo_country == item, 'geo_country'] = 'some_unimportant_country'


city_list = list(df3.geo_city.unique())
for item in range(len(city_list)):
    city_list[item] =  (city_list[item], len(df3[df3.geo_city == city_list[item]]))

In [156]:
df3.geo_country.unique()

array(['Russia', 'some_unimportant_country'], dtype=object)

In [157]:
trsh_06_pct = 100000 / len(df3)

In [158]:
trsh_06_pct

1.0

counter = 1
for i in city_list_tst:
    if i[0] > 100000:
        #print(counter)
        counter += 1

In [None]:
with open('data/city_list.txt', 'w') as f:
    for t in city_list_tst:
        f.write(str(t) +'\n')

In [159]:
with open('data/city_list.txt', 'r') as f:
    city_list = []
    for line in f:
        # remove newline character and parentheses
        line = line.rstrip('\n').replace('(', '').replace(')', '').replace("'", '')
        # split on comma and convert each element to correct type
        tuple_elements = [int(e.strip()) if e.strip().isdigit() else e.strip() for e in line.split(',')]
        # create tuple and add to list
        my_tuple = tuple(tuple_elements)
        city_list.append(my_tuple)

In [160]:
df3.loc[((df3.geo_city != city_list[0][1]) & 
    (df3.geo_city != city_list[1][1]) & 
    (df3.geo_city != city_list[2][1]) &
    (df3.geo_city != city_list[3][1]) & 
    (df3.geo_city != city_list[4][1]) & 
    (df3.geo_city != city_list[5][1])  )
    #(df3.geo_city != city_list[6][1]) & 
    #(df3.geo_city != city_list[7][1]) & 
    #(df3.geo_city != city_list[8][1]) & 
    #(df3.geo_city != city_list[9][1]) & 
    #(df3.geo_city != city_list[10][1]) & 
    #(df3.geo_city != city_list[11][1]) & 
    #(df3.geo_city != city_list[12][1]) & 
    #(df3.geo_city != city_list[13][1]) & 
    #(df3.geo_city != city_list[14][1]) & 
    #(df3.geo_city != city_list[15][1]) & 
    #(df3.geo_city != city_list[16][1]) & 
    #(df3.geo_city != city_list[17][1]) & 
    #(df3.geo_city != city_list[18][1])) 
    ,'geo_city'  ] = 'some_unimportant_city'

In [161]:
df3.geo_city.unique()

array(['Moscow', 'Kazan', 'some_unimportant_city', 'Saint Petersburg',
       'Krasnodar', 'Yekaterinburg'], dtype=object)

## encodin' shit!

In [163]:
df3 = df3.reset_index()

In [164]:
cols_to_encode

['utm_source',
 'utm_medium',
 'utm_adcontent',
 'device_brand',
 'device_category',
 'device_screen_resolution',
 'device_browser',
 'utm_campaign',
 'geo_country',
 'geo_city']

In [165]:
encoded_features = pd.DataFrame()

for col in cols_to_encode:

    pre_encoded_df = df3[[col]]
    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse=False)
    encoded_array = encoder.fit_transform(pre_encoded_df)
    #feature_names = [f'{col}_{name}' for name in encoder.get_feature_names_out()]
    feature_names = encoder.get_feature_names_out()
    encoded_df = pd.DataFrame(encoded_array, columns=feature_names)
    
    if len(encoded_features) == 0:
        encoded_features = encoded_df.copy()
    else:
        encoded_features[feature_names] = encoded_df.values



  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_n

  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_n

  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_n

  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_n

  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_n

  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_n

  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_names] = encoded_df.values
  encoded_features[feature_n

In [166]:
#df3 = df_backup.copy()

In [167]:
#df_backup = df3.copy()

In [168]:
#df3 = df_backup

In [169]:
df3 = df3.join(encoded_features, how='right')

In [170]:
df3.columns

Index(['index', 'visit_number', 'utm_source', 'utm_medium', 'utm_campaign',
       'utm_adcontent', 'device_category', 'device_brand',
       'device_screen_resolution', 'device_browser',
       ...
       'utm_campaign_zmnpxOKDENholtspXiGy',
       'utm_campaign_zxoiLxhuSIFrCeTLQVWZ', 'geo_country_Russia',
       'geo_country_some_unimportant_country', 'geo_city_Kazan',
       'geo_city_Krasnodar', 'geo_city_Moscow', 'geo_city_Saint Petersburg',
       'geo_city_Yekaterinburg', 'geo_city_some_unimportant_city'],
      dtype='object', length=705)

In [171]:
show_empties_percentage(df3)
#lts = list(encoded_features.columns)
#df3 = df3.drop(list(encoded_features.columns), axis=1)

% of empties


0

In [172]:
df3 = df3.drop(cols_to_encode, axis=1)

## scalin' shit!

In [173]:
cols_to_scale = ['visit_number']

cols_to_scale.pop(0)

In [174]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df3.loc[:,cols_to_scale])
scaled_feature_names = [f'{name}_scaled' for name in scaler.get_feature_names_out()]
#scaler.get_feature_names_out()

scaled_df = pd.DataFrame(scaled_features, columns=scaled_feature_names)

df3 = df3.join(scaled_df)


In [175]:
df3 = df3.drop('visit_number', axis=1)

In [176]:
counter = 0
for feature in df3.columns:
    if df3[feature].dtype != 'O':
        #print(feature, ' - ', df3[feature].dtype)
        counter += 1
    else:
        print(feature)
print(counter == len(df3.columns))

True


# prediction time, bitch!

df3.to_parquet('data/df_prepped_100k_parq_50_50.parquet', index=False)

In [307]:
df3 = pd.read_parquet('data/df_prepped_100k_parq_50_50.parquet')

In [308]:
df3 = df3.drop('index', axis=1)


In [272]:
df3_pos = df3[df3['target_action'] == 1].sample(5000)
df3_neg = df3[df3['target_action'] == 0].sample(5000)
df3_pos = df3_pos.reset_index()
df3_neg = df3_neg.reset_index()
df3_pos = df3_pos.drop('index', axis=1)
df3_neg = df3_neg.drop('index', axis=1)
df3 = pd.concat([df3_pos, df3_neg])
#df3 = df3.drop('level_0', axis=1)

In [309]:
df3[df3[str(feature)].isna() == True]

Unnamed: 0,target_action,camp_succ_rate,day_of_week,utm_source_ArbfvYgWhqxkzywKqpQf,utm_source_BHcvLfOaCWvWTykYqHVe,utm_source_BKeImrJuRDZcHiSSTdzm,utm_source_CgsxHpdTmXHvrHqEKRxp,utm_source_CqeIpFwJscTsZoYXdHsP,utm_source_DfoBrvtzFbohFKcUrmMV,utm_source_DlnuGwaJBHGNEKdWfOpe,...,utm_campaign_zxoiLxhuSIFrCeTLQVWZ,geo_country_Russia,geo_country_some_unimportant_country,geo_city_Kazan,geo_city_Krasnodar,geo_city_Moscow,geo_city_Saint Petersburg,geo_city_Yekaterinburg,geo_city_some_unimportant_city,visit_number_scaled


In [310]:
counter = 0
for feature in df3.columns:
    if len(df3[df3[str(feature)].isna() == True]) != 0:
        print(feature, ' - ', len(df3[df3[str(feature)].isna() == True]))
        counter += 1
    
        
    
        
if counter == 0:
            print('vse zaebis", pustukh fi4ei net')    

vse zaebis", pustukh fi4ei net


df3 = df3.drop("day_of_week", axis=1)

In [311]:
y = df3['target_action']
df3 = df3.drop('target_action', axis=1)


scores = cross_val_score(rf, x, y, cv=4)

scores

len(x.columns)

In [312]:
x_train, x_test, y_train, y_test = train_test_split(df3,y, test_size=0.3)

with open('models/rf_model.pkl', 'rb') as file:
    rf = dill.load(file)

In [291]:
with open('data/best_params_rf_10k_fit.json', 'r') as f:
    best_params = json.load(f)

In [297]:
best_params

{'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 400}

In [298]:
rf = RandomForestClassifier(n_estimators=400, min_samples_leaf=2, max_features='sqrt')

In [299]:
rf.fit(x_train, y_train)

In [313]:
predicted_train = rf.predict(x_train)

In [314]:
predicted_test = rf.predict(x_test)

In [315]:
accuracy_score(y_train, predicted_train)

0.6057714285714285

In [316]:
accuracy_score(y_test, predicted_test)

0.6083333333333333

In [317]:
roc_auc_score(y_train, rf.predict_proba(x_train)[:,1])

0.6563583668102934

In [318]:
roc_auc_score(y_test, rf.predict_proba(x_test)[:,1])

0.6595781200878102

In [319]:
df3.shape

(100000, 693)

models = [
    LogisticRegression(solver='liblinear'),
    RandomForestClassifier()
    #, SVC()
]

best_score = .0
best_pipe = None
for model in models:

    #pipe = Pipeline([
    #    ('preprocessor', preprocessor),
    #    ('classifier', model)
    #])

    score = cross_val_score(model, df3, y, cv=4, scoring='accuracy')
    print(score, ' - ', model)
    #logging.info(f'model: {type(model).__name__}, acc_mean: {score.mean():.4f}, acc_std: {score.std():.4f}')
    #if score.mean() > best_score:
    #    best_score = score.mean()
    #    best_pipe = pipe

In [285]:
param_grid = {
   'n_estimators': list(range(100, 501, 100)),
   'max_features': ['sqrt', 'log2'],
   'min_samples_leaf': list(range(1, 10))
}

In [286]:
grid_search_rf = GridSearchCV(
   estimator=rf,
   param_grid=param_grid,
   scoring='accuracy',
   verbose=1,
   n_jobs=-1
)

In [287]:
grid_search_rf.fit(x_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


In [288]:
best_params = grid_search_rf.best_params_
best_params

{'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 400}

with open('data/best_params_rf_10k_fit.json', 'w') as f: json.dump(best_params, f)

In [320]:
with open('models/rf_model.pkl', 'wb') as file:
    dill.dump(rf, file)