In [182]:
import dill
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
import time
import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import json
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [183]:
def event_action(df3):
    print('event_action start')
    target_action = ['sub_car_claim_click', 
                 'sub_car_claim_submit_click',
                 'sub_open_dialog_click', 
                 'sub_custom_question_submit_click', 
                 'sub_call_number_click', 
                 'sub_callback_submit_click', 
                 'sub_submit_success', 
                 'sub_car_request_submit_click'
                ]

    df3['event_action'] = df3['event_action'].apply(lambda x: 1 if x in target_action else 0)
    
    print( 'event_action end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [184]:
def sample_df3(df3, total_rows =  200000, neg_percent = 50, pos_percent = 50):
    print( 'sample_df3 start')
    df3_pos = df3[df3['event_action'] == 1].sample(int(total_rows / 100 * pos_percent))
    df3_neg = df3[df3['event_action'] == 0].sample(int(total_rows / 100 * neg_percent))
    df3_pos = df3_pos.reset_index()
    df3_neg = df3_neg.reset_index()
    df3_pos = df3_pos.drop('index', axis=1)
    df3_neg = df3_neg.drop('index', axis=1)
    df3 = pd.concat([df3_pos, df3_neg])
    
    print( 'sample_df3 end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

df = pd.read_csv('data/ga_hits.csv')

df2 = pd.read_csv('data/ga_sessions.csv')

df3 = df3 = pd.merge(df, df2, on='session_id')

df3 = event_action(df3)

In [185]:
def ad_campaign(df3):
    print( 'ad_campaign start')
    try:
        with open('data/utm_c_frec_dict2.json', 'r') as f:
            utm_c_frec_dict = json.load(f)
    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")

        utm_c_frec_dict = {}
        counter = 1
        for pos in df3.utm_campaign.unique():
            if len(df3[(df3.utm_campaign == pos) & (df3.event_action == 1)]) == 0:
                utm_c_frec_dict[str(pos)] = 0
            else:
                utm_c_frec_dict[str(pos)] = round(len(df3[(df3.utm_campaign == pos) & (df3.event_action == 1)]) / len(df3[df3.utm_campaign == pos]), 5)

            #print(counter)
            counter = counter  + 1
        with open('data/utm_c_frec_dict2.json', 'w') as f: 
            json.dump(utm_c_frec_dict, f)

    finally:
        df3['camp_succ_rate'] = df3.utm_campaign.apply(lambda x: utm_c_frec_dict[str(x)])
    
    #print(utm_c_frec_dict)
    print( 'ad_campaign end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [186]:
def ad_campaign_v_2(df3):
    
    print( 'ad_campaign v2 start')
    
    
    
    try:
        with open('data/utm_c_frec_dict3.json', 'r') as f:
            utm_c_frec_dict = json.load(f)
    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")
        df5 = df3[['utm_campaign', 'event_action']]
        succ_camps = df5[df5.event_action == 1].utm_campaign.value_counts(dropna=False)
        all_camps = df5.utm_campaign.value_counts(dropna=False)

        utm_c_frec_dict = {}
        
        for pos in all_camps.keys():
            if pos in succ_camps.keys():
                utm_c_frec_dict[str(pos)] = round(succ_camps[pos] / all_camps[pos], 5)
            else:
                utm_c_frec_dict[str(pos)] = 0
        
        
        with open('data/utm_c_frec_dict3.json', 'w') as f: 
            json.dump(utm_c_frec_dict, f)

    finally:
        df3['camp_succ_rate'] = df3.utm_campaign.apply(lambda x: utm_c_frec_dict[str(x)])
    
    #print(utm_c_frec_dict)
    print( 'ad_campaign v2 end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [187]:
def day_of_week(df3):
    print( 'day_of_week start')
    df3['new_date'] = pd.to_datetime(df3['visit_date'])
    df3['day_of_week'] = df3.new_date.dt.dayofweek
    
    df3 = df3.drop('new_date', axis=1)
    print('day_of_week end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [188]:
def empties(df3):
    print( 'empties end')
    df3.loc[df3.utm_source.isna() == True, 'utm_source'] = 'other'
    df3.loc[df3.utm_adcontent.isna() == True, 'utm_adcontent'] = 'Other'
    df3.loc[df3.device_brand.isna() == True, 'device_brand'] = 'other'
    
    print( 'empties end')
    print('-')      
    #print('-')      
    #print('-')  
    
    return df3

In [189]:
def resolution_func(df3):
    print('resolution_func start')
    #resolution
    bounds = []
    df3['resolution'] = df3.device_screen_resolution.apply(lambda x:eval(x.replace('x','*')))
    for device in df3.device_category.unique():
        q25 = df3[df3.device_category == device].resolution.quantile(0.25)
        q75 = df3[df3.device_category == device].resolution.quantile(0.75)
        iqr = q75 - q25
        bounds.append((device, q25 - 1.5 * iqr, q75 + 1.5 * iqr))




    test_list = list(df3.device_screen_resolution)
    test_list2 = list(df3.device_category)

    for i in range(len(test_list)):
        test_list[i] = eval(test_list[i].replace('x','*'))

    tst_l = list(zip(test_list2, test_list))

    resolution = []

    for i in range(len(tst_l)):
        if tst_l[i][0] == bounds[0][0]:
            resolution.append(bounds[0][0]+'_high' if tst_l[i][1] >= bounds[0][2] * 0.7 else (bounds[0][0]+'_medium' if bounds[0][2] * 0.7 > tst_l[i][1] >= bounds[0][2] * 0.1 else bounds[0][0]+'_low'))
        elif tst_l[i][0] == bounds[1][0]:
            resolution.append(bounds[1][0]+'_high' if tst_l[i][1] >= bounds[1][2] * 0.7 else (bounds[1][0]+'_medium' if bounds[1][2] * 0.7 > tst_l[i][1] >= bounds[1][2] * 0.1 else bounds[1][0]+'_low'))
        elif tst_l[i][0] == bounds[2][0]:
            resolution.append(bounds[2][0]+'_high' if tst_l[i][1] >= bounds[2][2] * 0.7 else (bounds[2][0]+'_medium' if bounds[2][2] * 0.7 > tst_l[i][1] >= bounds[2][2] * 0.1 else bounds[2][0]+'_low'))

    df3['device_screen_resolution'] = resolution
    df3 = df3.drop('resolution', axis=1)
    
    print('resolution_func end')
    print('-')      
    #print('-')      
    #print('-') 
    
    
    return df3
    

In [190]:
def resolution_func_v_2(df3):
    print('resolution_func v2 start')
    #resolution
    bounds = []
    df3['device_screen_resolution'] = df3.device_screen_resolution.apply(lambda x:eval(x.replace('x','*')))

    
    print('resolution_func v2 end')
    print('-')      
    #print('-')      
    #print('-') 
    
    
    return df3
    

In [191]:
def country(df3, trsh = 0.001):
    print('country start')
    #geo_country
    country_list = list(df3.geo_country.unique())
    for i in range(len(country_list)):
        country_list[i] = ( len(df3[df3.geo_country == country_list[i]]), country_list[i])
    country_list = sorted(country_list, reverse=True)

#    trsh = 0.0005
    df3_len = len(df3) 
    for item in country_list:
        if item[0] / df3_len >= trsh:
            continue
        else:
            df3.loc[df3.geo_country == item[1], 'geo_country'] = 'some_unimportant_country'
    
    print( 'country end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [192]:
def country_v_2(df3):
    print('country v2  start')
    #geo_country
    counter = 0
    country_list_new = dict()
    
    
    try:
        with open('data/country_list_new.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                country_list_new[my_tuple[0]] = my_tuple[1]


    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")
        
        succ_total = len(df3[df3.event_action == 1])
        country_list_success = df3[df3.event_action == 1].geo_country.value_counts().sort_values(ascending=False)
        country_list_new = []
        for country in country_list_success.keys():
            country_list_new.append(f'{country}*{str(round(country_list_success[country] / succ_total, 4))}%')
            counter += 1
            if counter == 23:
                break

        with open('data/country_list_new.txt', 'w') as f:
            for t in country_list_new:
                f.write(str(t) +'\n')
                
        country_list_new = dict()        
        with open('data/country_list_new.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                country_list_new[my_tuple[0]] = my_tuple[1]                


    finally:
        df3['geo_country'] = df3['geo_country'].apply(lambda x: country_list_new[x] if x in country_list_new else 0.0001)
        
    

    #print(sum(df4.isnull().sum().values))
    #print(df4.isnull().sum())
    print('country v2 end')    
    print('-')      
    #print('-')      
    #print('-')     
    
    return df3

In [193]:
def city(df3, trsh = 0.001):
    print('city start')
    #geo_city
    city_list = []
    df3_len = len(df3)
    try:
        with open('data/city_list1.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace('(', '').replace(')', '').replace("'", '')
                # split on comma and convert each element to correct type
                tuple_elements = [int(e.strip()) if e.strip().isdigit() else e.strip() for e in line.split(',')]
                # create tuple and add to list
                my_tuple = tuple(tuple_elements)
                city_list.append(my_tuple)


    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")

        city_list = list(zip(df3.geo_city.value_counts().values, df3.geo_city.value_counts().keys() ))
        city_list = sorted(city_list, reverse=True)

        with open('data/city_list1.txt', 'w') as f:
            for t in city_list:
                f.write(str(t) +'\n')



    finally:
#        trsh = 0.0005
        city_list_valid = []
        
        for item in city_list:
            #print(item[1], ' - ', round(item[0] / df3_len, 4),'%' )
            if round(item[0] / 15000000, 4) >= trsh:   #df3_len, 4) >= trsh:
                city_list_valid.append(item[1])
                #print('trsh == 2000 - ', item[0], item[1], round(item[0] / df3_len, 4) >= trsh, ' - appended')

        df3.loc[(~df3['geo_city'].isin(city_list_valid)), 'geo_city'] = 'some_unimportant_city'
    
    

    #print(sum(df4.isnull().sum().values))
    #print(df4.isnull().sum())
    print('city end')    
    print('-')      
    #print('-')      
    #print('-')      
    
    return df3

In [194]:
def city_v_2(df3):
    print('city v2  start')
    #geo_city
    counter = 0
    city_list_new = dict()
    
    
    try:
        with open('data/city_list_new.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                city_list_new[my_tuple[0]] = my_tuple[1]


    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")
        
        succ_total = len(df3[df3.event_action == 1])
        city_list_success = df3[df3.event_action == 1].geo_city.value_counts().sort_values(ascending=False)
        city_list_new = []
        for city in city_list_success.keys():
            city_list_new.append(f'{city}*{str(round(city_list_success[city] / succ_total, 4))}%')
            counter += 1
            if counter == 26:
                break

        with open('data/city_list_new.txt', 'w') as f:
            for t in city_list_new:
                f.write(str(t) +'\n')
                
        with open('data/city_list_new.txt', 'r') as f:
            city_list_new = dict()
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                city_list_new[my_tuple[0]] = my_tuple[1]                


    finally:
        
        df3['geo_city'] = df3['geo_city'].apply(lambda x: city_list_new[x] if x in city_list_new else 0.0001)
        
    

    #print(sum(df4.isnull().sum().values))
    #print(df4.isnull().sum())
    print('city v2 end')    
    print('-')      
    #print('-')      
    #print('-')      
    
    return df3

In [165]:
df3['dev_brand_copy'] = df3.device_brand

In [166]:
device_brand(df3)

device_brand start
oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)
device_brand end
-


Unnamed: 0,session_id,hit_date,hit_time,hit_number,hit_type,hit_referer,hit_page_path,event_category,event_action,event_label,...,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,dev_brand_copy
0,5639623078712724064.1640254056.1640254056,2021-12-23,597864.0,30,event,,sberauto.com/cars?utm_source_initial=google&ut...,quiz,0,,...,wvtWRwiRmvPIsSQuXnvd,mobile,Android,0.0893,,360x780,Chrome,Russia,Saint Petersburg,Huawei
1,5639623078712724064.1640254056.1640254056,2021-12-23,290095.0,12,event,,sberauto.com/cars/all/kia/seltos/20f30855?utm_...,card_web,0,,...,wvtWRwiRmvPIsSQuXnvd,mobile,Android,0.0893,,360x780,Chrome,Russia,Saint Petersburg,Huawei
2,5639623078712724064.1640254056.1640254056,2021-12-23,294857.0,18,event,,sberauto.com/cars/all/volkswagen/tiguan/0208cd...,card_web,0,,...,wvtWRwiRmvPIsSQuXnvd,mobile,Android,0.0893,,360x780,Chrome,Russia,Saint Petersburg,Huawei
3,5639623078712724064.1640254056.1640254056,2021-12-23,295591.0,20,event,,sberauto.com/cars/all/volkswagen/tiguan/0208cd...,card_web,0,,...,wvtWRwiRmvPIsSQuXnvd,mobile,Android,0.0893,,360x780,Chrome,Russia,Saint Petersburg,Huawei
4,5639623078712724064.1640254056.1640254056,2021-12-23,290039.0,8,event,,sberauto.com/cars/all/kia/seltos/20f30855?utm_...,card_web,0,,...,wvtWRwiRmvPIsSQuXnvd,mobile,Android,0.0893,,360x780,Chrome,Russia,Saint Petersburg,Huawei
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15685214,1706097853564481669.1640267190.1640267190,2021-12-23,0.0,1,event,,sberauto.com/moskva/cars?datefrom=2021&rental_...,quiz,0,,...,puhZPIYqKXeFPaUviSjo,mobile,iOS,0.2802,,428x926,Safari,Russia,Moscow,Apple
15685215,8262758806963127884.1640272536.1640272536,2021-12-23,0.0,1,event,,sberauto.com/cars/all/renault/logan/8c3c73f2?u...,quiz,0,,...,puhZPIYqKXeFPaUviSjo,mobile,Android,0.0084,,360x800,Chrome,Russia,Ulyanovsk,Realme
15685216,3349670872968620291.1640264206.1640264206,2021-12-23,0.0,1,event,,sberauto.com/cars/all/kia/rio/fee33fe6?rental_...,quiz,0,,...,,mobile,iOS,0.2802,,375x667,Safari (in-app),Russia,Samara,Apple
15685217,1009509786569589790.1640244938.1640244938,2021-12-23,600274.0,3,event,,sberauto.com/cars?utm_source_initial=yandex&ut...,quiz,0,,...,tVtbIKrPSOvrXLCznVVe,desktop,Windows,0.0001,,1366x768,Edge,Russia,Orenburg,


In [171]:
df3['device_brand'] = df3.dev_brand_copy

In [195]:
def device_brand(df3):
    print('device_brand start')
    #device_brand
    counter = 0
    device_brand_list_new = {}
    
    
    try:
        with open('data/device_brand_list_new1.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                device_brand_list_new[my_tuple[0]] = my_tuple[1]


    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")
        succ_total = len(df3[df3.event_action == 1])
        device_brand_list_success = df3[df3.event_action == 1].device_brand.value_counts().sort_values(ascending=False)
        device_brand_list_new = []
        for device_brand in device_brand_list_success.keys():
            device_brand_list_new.append(f'{device_brand}*{str(round(device_brand_list_success[device_brand] / succ_total, 4))}%')
            counter += 1
            if counter == 23:
                break

        with open('data/device_brand_list_new1.txt', 'w') as f:
            for t in device_brand_list_new:
                f.write(str(t) +'\n')
                
        device_brand_list_new = {}
        with open('data/device_brand_list_new1.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                device_brand_list_new[my_tuple[0]] = my_tuple[1]                


    finally:
        df3['device_brand'] = df3['device_brand'].apply(lambda x: device_brand_list_new[x] if x in device_brand_list_new else 0.0001)
        
    

    #print(sum(df4.isnull().sum().values))
    #print(df4.isnull().sum())
    print('device_brand end')    
    print('-')      
    #print('-')      
    #print('-')      
    
    return df3

In [196]:
def device_brand_v_2(df3, trsh = 0.0012):
    print('device_brand v 2 start')
    #device_brand
    brand_list = []
    df3_len = len(df3)
    
    try:
        with open('data/brand_list1.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace('(', '').replace(')', '').replace("'", '')
                # split on comma and convert each element to correct type
                tuple_elements = [int(e.strip()) if e.strip().isdigit() else e.strip() for e in line.split(',')]
                # create tuple and add to list
                my_tuple = tuple(tuple_elements)
                brand_list.append(my_tuple)


    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")

        brand_list = list(zip(df3.device_brand.value_counts().values, df3.device_brand.value_counts().keys() ))
        brand_list = sorted(brand_list, reverse=True)

        with open('data/brand_list1.txt', 'w') as f:
            for t in brand_list:
                f.write(str(t) +'\n')



    finally:
#        trsh = 0.0005
        brand_list_valid = []
        
        for item in brand_list:
            #print(item[0], ' ', item[0] / df3_len,'>=', trsh, ' ', round(item[0] / df3_len, 4) >= trsh )
            if item[0] / df3_len >= trsh:
                brand_list_valid.append(item[1])
                #print(len(brand_list_valid), ' ', item[0],' ',item[1] )

        df3.loc[(~df3['device_brand'].isin(brand_list_valid)), 'device_brand'] = 'some_unimportant_brand'
    
    
    print('device_brand v 2 end')    
    print('-')      
    #print('-')      
    #print('-')      
    
    return df3

In [197]:
def encode_stuff(df3):
    print('encode_stuff start')
    cols_to_encode = ['utm_source', 
                      'utm_medium', 
                      'utm_adcontent', 
                      #'device_brand', 
                      'device_category', 
                      'device_screen_resolution', 
                      'device_browser',
                      'utm_campaign'
                      #,'geo_country',
                      #'geo_city'
                     ]
    
    #encoding
    encoded_features = pd.DataFrame()

    for col in cols_to_encode:

        pre_encoded_df3 = df3[[col]]
        encoder = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse=False)
        encoded_array = encoder.fit_transform(pre_encoded_df3)
        #feature_names = [f'{col}_{name}' for name in encoder.get_feature_names_out()]
        feature_names = encoder.get_feature_names_out()
        encoded_df3 = pd.DataFrame(encoded_array, columns=feature_names)

        #if len(encoded_features) == 0:
        #    encoded_features = encoded_df3.copy()
        #else:
        #    encoded_features[feature_names] = encoded_df3.values
        
        df3[feature_names] = encoded_df3.values
    #print(encoded_features.isnull().sum())

    #df3 = df3.join(encoded_features)
    #print(df3.isnull().sum())
    df3 = df3.drop(cols_to_encode, axis=1)
    print( 'encode_stuff end')
    print('-')      
    #print('-')      
    #print('-')  
    
    return df3

In [198]:
def scale_stuff(df3):
    #scaling
    print('scale_stuff start')
    cols_to_scale = ['visit_number',
                     #'day_of_week'
                    # ,'device_screen_resolution'
                    ]

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(df3.loc[:,cols_to_scale])
    scaled_feature_names = [f'{name}_scaled' for name in scaler.get_feature_names_out()]
    #scaler.get_feature_names_out()

    #scaled_df = pd.DataFrame(scaled_features, columns=scaled_feature_names)
    df3[scaled_feature_names] = scaled_features
    #print(scaled_df.shape, scaled_df.columns)
    #print(scaled_df.isnull().sum())

    #df3['scaled_feature_names'] = scaled_df
    #print(df3.shape, df3.columns)
    df3 = df3.drop(cols_to_scale, axis=1)
    for column in df3.columns:
        print(column)
    print(len(df3.columns))
    #print(df3.isnull().sum())
    #print(len(df3.columns), df3.columns)
    print('scale_stuff end')
    print('-')      
    #print('-')      
    #print('-')  
    
    return df3

In [199]:
def filter_stuff(df3):
    #pre-existing list of columns
    print('filter_stuff start')
    cols_to_drop = [
        'session_id',
        'hit_date',
        'hit_time',
        'hit_number',
        'hit_type',
        'hit_referer',
        'hit_page_path',
        'event_category',
        'event_label',
        'event_value',
        'client_id',
        #'new_date',
        'visit_date',
        'visit_number',
        'utm_keyword',
        'device_os',
        'device_model',
        'visit_time'
    ]
    
    cols_to_encode = [
        'utm_source',
        'utm_medium', 
        'utm_adcontent',
        'device_brand', 
        'device_category', 
        'device_screen_resolution',
        'device_browser',
        'utm_campaign',
        'geo_country',
        'geo_city'
    ]
    #dropping
    #cols_to_drop = []
    #for col in df_columns:
    #    cols_to_drop.append(str(col))
    #cols_to_drop = cols_to_drop + ['client_id','new_date', 'visit_date', 'utm_keyword', 'device_os', 'device_model', 'visit_time']    
    
    df3 = df3.drop(cols_to_drop, axis=1)
    #df3 = df3.drop(cols_to_encode, axis=1)
    
    try:
        df3 = df3.drop('Unnamed: 0', axis=1)
    except KeyError:
        pass
    try:
        df3 = df3.drop('Unnamed: 0.1', axis=1)
    except KeyError:
        pass
    try:
        df3 = df3.drop('Unnamed: 0.2', axis=1)
    except KeyError:
        pass
    
    print('filter_stuff end')
    #print(sum(df3.isnull().sum().values))
    #print(df3.isnull().sum())
    print(df3.columns)
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [200]:
def check_stuff(df3):
    #checking
    print('check_stuff start')
    counter = 0
    for feature in df3.columns:
        if df3[feature].dtype != 'O':
            #print(feature, ' - ', df3[feature].dtype)
            counter += 1
        else:
            print(feature)
    print(counter == len(df3.columns))


    #checking 2
    counter = 0
    for feature in df3.columns:
        if len(df3[df3[str(feature)].isna() == True]) != 0:
            print(feature, ' - ', len(df3[df3[str(feature)].isna() == True]))
            counter += 1




    if counter == 0:
        print('vse zaebis", pustukh fi4ei net')    
    
    
    print('check_stuff end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [201]:
def check_stuff_2(df3):
    #checking
    print('check_stuff_2 start')

    counter = 0
    for feature in df3.columns:
        if df3[feature].dtype != 'O':
            #print(feature, ' - ', df3[feature].dtype)
            counter += 1
        else:
            print(feature)
    print(counter == len(df3.columns))


    #checking 2
    empty_features = False
    if sum(df3.isnull().sum()) != 0:
        print(df3.isnull().sum())
        empty_features = True




    if empty_features == False:
        print('vse zaebis", pustukh fi4ei net') 
    #print(len(df3.isnull().sum()))
    print(df3.shape,  'check_stuff_2 end') #df3.shape,
    print('-')      
    #print('-')      
    #print('-')     
    
    
    return df3

In [202]:
def check_stuff_3(df3):
    print(len(df3.columns), df3.columns)
    pass

In [203]:
def predict_stuff(df3):
    y = df3['event_action']
    
    df3 = df3.drop('event_action', axis=1)
    print(df3.columns)
    x_train, x_test, y_train, y_test = train_test_split(df3,y, test_size=0.3)
    
    rf = RandomForestClassifier(n_estimators=400, min_samples_leaf=2, max_features='sqrt')
    rf.fit(x_train, y_train)
    
    predicted_train = rf.predict(x_train)
    predicted_test = rf.predict(x_test)
    
    #print(df3.shape, ' - shape', ' function - ')
    
    
    print('train acc score - ',accuracy_score(y_train, predicted_train))
    print('test acc score - ', accuracy_score(y_test, predicted_test))

    print('train roc score - ',roc_auc_score(y_train, rf.predict_proba(x_train)[:,1]))
    print('test roc score - ',roc_auc_score(y_test, rf.predict_proba(x_test)[:,1]))
    pass


with open('models/rf_model.pkl', 'wb') as file:
    dill.dump(rf, file)

## function declarations end here. its wildlands after that...


df = pd.read_csv('data/ga_hits.csv')

df2 = pd.read_csv('data/ga_sessions.csv')

df3 = pd.merge(df, df2, on='session_id')

df3 = event_action(df3)

In [204]:
cv_scores = [[0.6375, 'ad_camp V, resol v1 V, ctry v1 V, ct v1 V, brand v1 V'],
 [0.6375, 'ad_camp X, resol v1 V, ctry v1 V, ct v1 V, brand v1 V'],
 [0.6375, ', ad_camp X, resol v1 V, ctry v1 V, ct v1 V, brand v1 V'],
 [0.6482, ', ad_camp V, resol v1 X, ctry v1 V, ct v1 V, brand v1 V'],
 [0.6504, ', ad_camp X, resol v1 X, ctry v1 X, ct v1 X, brand v1 X'],
 [0.6409, ', ad_camp X, resol v1 V, ctry v1 X, ct v1 X, brand v1 X'],
 [0.6396, ', ad_camp X, resol v2 V, ctry v1 X, ct v1 X, brand v1 X'],
 [0.6502, ', ad_camp X, resol v2 X, ctry v1 V, ct v1 X, brand v1 X'],
 [0.6503, ', ad_camp X, resol v2 X, ctry v2 V, ct v1 X, brand v1 X'],
 [0.6504, ', ad_camp X, resol v2 X, ctry v2 X, ct v1 X, brand v1 X'],
 [0.6498, ', ad_camp X, resol v2 X, ctry v2 X, ct v1 V, brand v1 X'],
 [0.6389, ', ad_camp X, resol v2 X, ctry v2 X, ct v2 V, brand v1 X'],
 [0.649, ', ad_camp X, resol v2 X, ctry v2 X, ct v2 X, brand v1 V'],
 [0.6503, ', ad_camp X, resol v2 X, ctry v2 X, ct v2 X, brand v2 V'],
 [0.6504, ', ad_camp X, resol v2 X, ctry v2 X, ct v2 X, brand v2 V'],
 [0.6504, ', ad_camp X, resol v2 X, ctry v2 X, ct v2 X, brand v2 V'],
 [0.6603, 'ad_camp V, resol V, cntry V, ct V, brand V, 200k 50/50'],
 [0.6576, 'ad_camp V, resol V, cntry V, ct V, brand V, 200k 70/30'],
 [0.6456, 'ad_camp V, resol V, cntry V, ct V, brand V, 140k 30/70']]

In [205]:
scores_single_fit = []

In [206]:
df4 = pd.read_csv('data/df3_100k_50n_50p.csv')


categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


f_transformer = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, make_column_selector(dtype_include='object'))
])

preprocessor = Pipeline(steps=[
    #('event', FunctionTransformer(event_action)),
    #('sampling', FunctionTransformer(sample_df3)),
    ('ad_campaign_feature_creating', FunctionTransformer(ad_campaign)),
    #('day_of_week', FunctionTransformer(day_of_week)),
    ('empties', FunctionTransformer(empties)),
    ('resolution_func', FunctionTransformer(resolution_func)),
    ('country', FunctionTransformer(country)),
    ('city', FunctionTransformer(city)),
    ('device brand', FunctionTransformer(device_brand)),
    ('filter_stuff', FunctionTransformer(filter_stuff)),
    #('encode_stuff', FunctionTransformer(encode_stuff)),
    #('scale_stuff(visit_num)', FunctionTransformer(scale_stuff)),
    ('check_stuff_3', FunctionTransformer(check_stuff_3)),
    ('f_transformer', f_transformer),
    #('filter_stuff', FunctionTransformer(filter_stuff)),
    #('check_stuff_3', FunctionTransformer(check_stuff_3))
])

models = [
    #RandomForestClassifier(n_estimators=300, max_depth= 10, max_features='sqrt', min_samples_split=2),
    #SVC(C=10, gamma=0.01, kernel='rbf'),
    #DecisionTreeClassifier(criterion='gini', max_depth=7, min_samples_split=10),
    #LogisticRegression( C=1.0, penalty='l2', solver='saga'),
    MLPClassifier(hidden_layer_sizes=(100, ), solver='adam', activation='tanh')
    ]

for model in models:
    

    pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor), 
        ('classifier', model)  
    ])

    y = df4['event_action']
    x = df4.drop('event_action', axis=1)

    #x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)
    #pipeline.fit(x_train, y_train)
    
    #predictions = pipeline.predict(x_test)
    #probs = pipeline.predict_proba(x_test)
    #scores_single_fit.append([type(model).__name__,' test roc score - ', roc_auc_score(y_test, probs[:,1])])
    #scores_single_fit.append([type(model).__name__,' test acc score - ', accuracy_score(y_test, predictions)])
    #print(type(model).__name__,' test roc score - ', roc_auc_score(y_test, probs[:,1]))
    #print(type(model).__name__,' test acc score - ', accuracy_score(y_test, predictions))
    #interm_scores.append((str(model), 'test roc score - ', roc_auc_score(y_test, pipeline.predict_proba(x_test)[:,1])))
    #interm_scores.append((str(model), 'test acc score - ', accuracy_score(y_test, predictions)))
    
    log = 'ad_camp v2 V, resol V, cntry V, ct V, brand V'
    score = cross_val_score(pipeline, x, y, cv=4, scoring='roc_auc')
    #score = cross_val_score(pipeline, x, y, cv=4, scoring='accuracy')
    cv_scores.append([ round(score.mean(), 4), log]) #type(model).__name__,

ad_campaign start
ad_campaign end
-
empties end
empties end
-
resolution_func start
resolution_func end
-
country start
country end
-
city start
city end
-
device_brand start
device_brand end
-
filter_stuff start
filter_stuff end
Index(['utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent',
       'device_category', 'device_brand', 'device_screen_resolution',
       'device_browser', 'geo_country', 'geo_city', 'camp_succ_rate'],
      dtype='object')
-
11 Index(['utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent',
       'device_category', 'device_brand', 'device_screen_resolution',
       'device_browser', 'geo_country', 'geo_city', 'camp_succ_rate'],
      dtype='object')
ad_campaign start
ad_campaign end
-
empties end
empties end
-
resolution_func start
resolution_func end
-
country start
country end
-
city start
city end
-
device_brand start
device_brand end
-
filter_stuff start
filter_stuff end
Index(['utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent',
     

4 fits failed out of a total of 4.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\advok\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\advok\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\advok\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\advok\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "C

In [181]:
cv_scores

[[0.6375, 'ad_camp V, resol v1 V, ctry v1 V, ct v1 V, brand v1 V'],
 [0.6375, 'ad_camp X, resol v1 V, ctry v1 V, ct v1 V, brand v1 V'],
 [0.6375, ', ad_camp X, resol v1 V, ctry v1 V, ct v1 V, brand v1 V'],
 [0.6482, ', ad_camp V, resol v1 X, ctry v1 V, ct v1 V, brand v1 V'],
 [0.6504, ', ad_camp X, resol v1 X, ctry v1 X, ct v1 X, brand v1 X'],
 [0.6409, ', ad_camp X, resol v1 V, ctry v1 X, ct v1 X, brand v1 X'],
 [0.6396, ', ad_camp X, resol v2 V, ctry v1 X, ct v1 X, brand v1 X'],
 [0.6502, ', ad_camp X, resol v2 X, ctry v1 V, ct v1 X, brand v1 X'],
 [0.6503, ', ad_camp X, resol v2 X, ctry v2 V, ct v1 X, brand v1 X'],
 [0.6504, ', ad_camp X, resol v2 X, ctry v2 X, ct v1 X, brand v1 X'],
 [0.6498, ', ad_camp X, resol v2 X, ctry v2 X, ct v1 V, brand v1 X'],
 [0.6389, ', ad_camp X, resol v2 X, ctry v2 X, ct v2 V, brand v1 X'],
 [0.649, ', ad_camp X, resol v2 X, ctry v2 X, ct v2 X, brand v1 V'],
 [0.6503, ', ad_camp X, resol v2 X, ctry v2 X, ct v2 X, brand v2 V'],
 [0.6504, ', ad_camp X, r

In [20]:

#f_transformer = ColumnTransformer(transformers=[
 #   ('numerical', numerical_transformer, make_column_selector(dtype_include=['int64', 'float64'])),
  #  ('categorical', categorical_transformer, make_column_selector(dtype_include='object'))
#])
df4 = pd.read_csv('data/df3_10k_50n_50p.csv')

#df4 = event_action(df4)

#df4 = sample_df3(df4)


y = df4['event_action']
X = df4.drop('event_action', axis=1)

preprocessor = Pipeline(steps=[
    ('target', FunctionTransformer(event_action)),
    ('sampling', FunctionTransformer(sample_df3)),
    ('ad_campaign_feature_creating', FunctionTransformer(ad_campaign)),
    ('day_of_week', FunctionTransformer(day_of_week)),
    ('empties', FunctionTransformer(empties)),
    ('resolution_func', FunctionTransformer(resolution_func)),
    ('country', FunctionTransformer(country)),
    ('city', FunctionTransformer(city)),
    ('encode_stuff', FunctionTransformer(encode_stuff)),
    ('scale_stuff', FunctionTransformer(scale_stuff)),
    ('filter_stuff', FunctionTransformer(filter_stuff)),
    ('check_stuff', FunctionTransformer(check_stuff))
    ])

rf = RandomForestClassifier(n_estimators=400, min_samples_leaf=2, max_features='sqrt')


high_score = 0


pipe = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', rf)
])
score = cross_val_score(pipe, X, y, cv=5, scoring='a''accuracy')#, error_score='raise')
print(f'model: {type(model).__name__}, acc_mean: {score.mean():.4f}, acc_std: {score.std():.4f}')
if high_score < score.mean():
    high_score = score.mean()
    cars_pipe = pipe

else:
    continue


#cars_pipe.fit(X, y)


#with open('data/cars_pipe.pkl', 'wb') as file:
#    dill.dump({
#        'model': cars_pipe,
#        'metadata': {
#            'name': 'car price prediction',
#           'author': 'collaborative ffs by this point can i really write myself here yet?)',
#           'version': 0.00000000000000000001,
#            'date': datetime.now(),
#            'type': type(cars_pipe.named_steps["classifier"]).__name__,
#           'accuracy': high_score
#       }
#   }, file)

event_action start
event_action start
event_action start
event_action start
event_action start


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'event_action'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\pipeline.py", line 437, in fit_transform
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\utils\_set_output.py", line 142, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\base.py", line 862, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\utils\_set_output.py", line 142, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 236, in transform
    return self._transform(X, func=self.func, kw_args=self.kw_args)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 307, in _transform
    return func(X, **(kw_args if kw_args else {}))
  File "C:\Users\user\AppData\Local\Temp\ipykernel_1844\1663308378.py", line 13, in event_action
    df3['event_action'] = df3['event_action'].apply(lambda x: 1 if x in target_action else 0)
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\frame.py", line 3807, in __getitem__
    indexer = self.columns.get_loc(key)
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 3804, in get_loc
    raise KeyError(key) from err
KeyError: 'event_action'


In [None]:
accuracy_score(y_test, predictions)

In [186]:
with open('data/best_params_lr_10k_50_50.json', 'w') as f: json.dump(best_params, f)

In [50]:
#from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV



df4 = pd.read_csv('data/df3_10k_50n_50p.csv')


df4 = ad_campaign(df4)
df4 = day_of_week(df4)
df4 = empties(df4)
df4 = resolution_func(df4)
df4 = country_v_2(df4)
df4 = city_v_2(df4)
df4 = device_brand(df4)
df4 = encode_stuff(df4)
df4 = scale_stuff(df4)
df4 = filter_stuff(df4)

y = df4['event_action']
x = df4.drop('event_action', axis=1)

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)




# Define parameters and their possible values in a dictionary
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

# Create Logistic Regression classifier object
classifier = DecisionTreeClassifier()

# Perform Grid Search with cross-validation (e.g., using k-fold CV)
grid_search = GridSearchCV(classifier,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=5)

# Fit the model on training data and find optimal parameters based on performance metric (default is accuracy)
grid_search.fit(x_train, y_train)

# Get best parameters found during grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best estimator/model for predictions on test data 
y_pred = grid_search.predict(x_test)
print("Best Parameters: ", grid_search.best_params_)
print("Best Score (ROC AUC): ", grid_search.best_score_)

ad_campaign start
ad_campaign end
-
day_of_week start
day_of_week end
-
empties end
empties end
-
resolution_func start
resolution_func end
-
country v2  start
country v2 end
-
city v2  start
city v2 end
-
device_brand start
device_brand end
-
encode_stuff start


  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3

encode_stuff end
-
scale_stuff start
scale_stuff end
-
filter_stuff start
filter_stuff end
Index(['event_action', 'device_brand', 'geo_country', 'geo_city',
       'camp_succ_rate', 'utm_source_BHcvLfOaCWvWTykYqHVe',
       'utm_source_BKeImrJuRDZcHiSSTdzm', 'utm_source_DnEUulZAecfGPvdtZBYS',
       'utm_source_EvhrtRzIJnQYHziPiLzV', 'utm_source_FTAuYVNoYYxgvKMpKSLW',
       ...
       'utm_campaign_ydXTgkwKyFWEAJoahduP',
       'utm_campaign_yxtFdhyijaALzWWYtzHE',
       'utm_campaign_zDGMDYOBPSeVFZNNwoxT',
       'utm_campaign_zPJpddwzkFqLMSYgtDqy',
       'utm_campaign_zfwIehuEfWYdYrEZgRLo',
       'utm_campaign_zmnpxOKDENholtspXiGy',
       'utm_campaign_zxoiLxhuSIFrCeTLQVWZ', 'utm_campaign_nan',
       'visit_number_scaled', 'day_of_week_scaled'],
      dtype='object', length=376)
-
Best Parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 10}
Best Parameters:  {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 10}
Best Score (ROC AUC):  0.600507264020655

In [51]:
with open('data/best_params_des_tree_10k_50_50.json', 'w') as f: json.dump(best_params, f)

In [185]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


df4 = pd.read_csv('data/df3_10k_50n_50p.csv')


df4 = ad_campaign(df4)
df4 = day_of_week(df4)
df4 = empties(df4)
df4 = resolution_func(df4)
df4 = country_v_2(df4)
df4 = city_v_2(df4)
df4 = device_brand(df4)
df4 = encode_stuff(df4)
df4 = scale_stuff(df4)
df4 = filter_stuff(df4)

y = df4['event_action']
x = df4.drop('event_action', axis=1)

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)


# Define the parameter grid
param_grid = {
    'penalty': ['l1', 'l2'],                # Regularization penalty ('l1' or 'l2')
    'C': [0.01, 0.1, 1.0],                   # Inverse regularization strength (smaller values specify stronger regularization)
    'solver': ['liblinear', 'saga']          # Algorithm to use in optimization problem
}

# Create Logistic Regression classifier object
lr_model = LogisticRegression()

# Perform Grid Search with cross-validation (e.g., using k-fold CV)
grid_search = GridSearchCV(estimator=lr_model, param_grid=param_grid)

# Fit the model on training data and find optimal parameters based on performance metric (default is accuracy)
grid_search.fit(x_train, y_train)

# Get best parameters found during grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best estimator/model for predictions on test data 
y_pred = grid_search.predict(x_test)
print(accuracy_score(y_test, y_pred))

ad_campaign start
ad_campaign end
-
-
-
day_of_week start
day_of_week end
-
-
-
empties end
empties end
-
-
-
resolution_func start
resolution_func end
-
-
-
country v2  start
country v2 end
-
-
-
city v2  start
city v2 end
-
-
-
device_brand start
device_brand end
-
-
-
encode_stuff start


  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3

encode_stuff end
-
-
-
scale_stuff start
scale_stuff end
-
-
-
filter_stuff start
filter_stuff end
-
-
-




Best Parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}
0.5863333333333334




In [184]:
with open('data/best_params_rf_10k_50_50.json', 'w') as f: json.dump(best_params, f)

In [183]:
df4 = pd.read_csv('data/df3_10k_50n_50p.csv')


df4 = ad_campaign(df4)
df4 = day_of_week(df4)
df4 = empties(df4)
df4 = resolution_func(df4)
df4 = country_v_2(df4)
df4 = city_v_2(df4)
df4 = device_brand(df4)
df4 = encode_stuff(df4)
df4 = scale_stuff(df4)
df4 = filter_stuff(df4)

y = df4['event_action']
x = df4.drop('event_action', axis=1)

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400],    # Number of trees in the forest
    'max_depth': [None, 5, 10],          # Maximum depth of each tree
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10], # Minimum number of samples required to split an internal node
}

# Create Random Forest classifier object
rf_model = RandomForestClassifier()

# Perform Grid Search with cross-validation (e.g., using k-fold CV)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid)

# Fit the model on training data and find optimal parameters based on performance metric (default is accuracy)
grid_search.fit(x_train, y_train)

# Get best parameters found during grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best estimator/model for predictions on test data 
y_pred = grid_search.predict(x_test)
print(accuracy_score(y_test, y_pred))


ad_campaign start
ad_campaign end
-
-
-
day_of_week start
day_of_week end
-
-
-
empties end
empties end
-
-
-
resolution_func start
resolution_func end
-
-
-
country v2  start
country v2 end
-
-
-
city v2  start
city v2 end
-
-
-
device_brand start
device_brand end
-
-
-
encode_stuff start


  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3.values
  df3[feature_names] = encoded_df3

encode_stuff end
-
-
-
scale_stuff start
scale_stuff end
-
-
-
filter_stuff start
filter_stuff end
-
-
-
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}
0.5943333333333334


In [86]:
max(scores)

'200k, 70n\x18p'