In [1]:
import dill
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
import time
import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import json
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
def pre_filter(df3):
    # оставляет только "разрешенные" колонки
    cols_to_proon = []
    for col in df3.columns:
        if 'device' in col:
            continue
        elif 'geo'in col:
            continue
        elif 'utm' in col:
            continue
        elif 'event_action' in col:
            continue
        else:
            cols_to_proon.append(col)
    df3 = df3.drop(cols_to_proon, axis=1)
    return df3

In [3]:
def event_action(df3):
    print('event_action start')
    target_action = ['sub_car_claim_click', 
                 'sub_car_claim_submit_click',
                 'sub_open_dialog_click', 
                 'sub_custom_question_submit_click', 
                 'sub_call_number_click', 
                 'sub_callback_submit_click', 
                 'sub_submit_success', 
                 'sub_car_request_submit_click'
                ]

    df3['event_action'] = df3['event_action'].apply(lambda x: 1 if x in target_action else 0)
    
    print( 'event_action end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [4]:
def sample_df3(df3, total_rows =  200000, neg_percent = 50, pos_percent = 50):
    print( 'sample_df3 start')
    df3_pos = df3[df3['event_action'] == 1].sample(int(total_rows / 100 * pos_percent))
    df3_neg = df3[df3['event_action'] == 0].sample(int(total_rows / 100 * neg_percent))
    df3_pos = df3_pos.reset_index()
    df3_neg = df3_neg.reset_index()
    df3_pos = df3_pos.drop('index', axis=1)
    df3_neg = df3_neg.drop('index', axis=1)
    df3 = pd.concat([df3_pos, df3_neg])
    
    print( 'sample_df3 end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

df = pd.read_csv('data/ga_hits.csv')

df2 = pd.read_csv('data/ga_sessions.csv')

df3 = pd.merge(df, df2, on='session_id')

df3 = event_action(df3)

In [5]:
def ad_campaign(df3):
    print( 'ad_campaign start')
    try:
        with open('data/utm_c_frec_dict2.json', 'r') as f:
            utm_c_frec_dict = json.load(f)
    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")

        utm_c_frec_dict = {}
        counter = 1
        for pos in df3.utm_campaign.unique():
            if len(df3[(df3.utm_campaign == pos) & (df3.event_action == 1)]) == 0:
                utm_c_frec_dict[str(pos)] = 0
            else:
                utm_c_frec_dict[str(pos)] = round(len(df3[(df3.utm_campaign == pos) & (df3.event_action == 1)]) / len(df3[df3.utm_campaign == pos]), 5)

            #print(counter)
            counter = counter  + 1
        with open('data/utm_c_frec_dict2.json', 'w') as f: 
            json.dump(utm_c_frec_dict, f)

    finally:
        df3['camp_succ_rate'] = df3.utm_campaign.apply(lambda x: utm_c_frec_dict[str(x)])
    
    #print(utm_c_frec_dict)
    print( 'ad_campaign end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [7]:
def ad_campaign_v_2(df3):
    import json
    print( 'ad_campaign v2 start')
    
    
    
    try:
        with open('data/utm_c_frec_dict3.json', 'r') as f:
            utm_c_frec_dict = json.load(f)
    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")
        df5 = df3[['utm_campaign', 'event_action']]
        succ_camps = df5[df5.event_action == 1].utm_campaign.value_counts(dropna=False)
        all_camps = df5.utm_campaign.value_counts(dropna=False)

        utm_c_frec_dict = {}
        
        for pos in all_camps.keys():
            if pos in succ_camps.keys():
                utm_c_frec_dict[str(pos)] = round(succ_camps[pos] / all_camps[pos], 5)
            else:
                utm_c_frec_dict[str(pos)] = 0
        
        
        with open('data/utm_c_frec_dict3.json', 'w') as f: 
            json.dump(utm_c_frec_dict, f)

    finally:
        df3['camp_succ_rate'] = df3.utm_campaign.apply(lambda x: utm_c_frec_dict[str(x)])
    
    #print(utm_c_frec_dict)
    print( 'ad_campaign v2 end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [8]:
def day_of_week(df3):
    print( 'day_of_week start')
    import pandas as pd
    df3['new_date'] = pd.to_datetime(df3['visit_date'])
    df3['day_of_week'] = df3.new_date.dt.dayofweek
    
    df3 = df3.drop('new_date', axis=1)
    print('day_of_week end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [9]:
def empties(df3):
    print( 'empties end')
    import pandas as pd
    df3.loc[df3.utm_source.isna() == True, 'utm_source'] = 'other'
    df3.loc[df3.utm_adcontent.isna() == True, 'utm_adcontent'] = 'Other'
    df3.loc[df3.device_brand.isna() == True, 'device_brand'] = 'other'
    
    print( 'empties end')
    print('-')      
    #print('-')      
    #print('-')  
    
    return df3

In [44]:
def resolution_func(df3):
    print('resolution_func start')
    import pandas as pd
    #resolution
    bounds = []
    
    try:
        with open('data/resolution_bounds.txt', 'r') as file:
            lines = file.readlines()
            for line in lines:
                values = line.strip().split(', ')
                bounds.append((values[0], int(values[1]), int(values[2])))
        
    except FileNotFoundError:
        df3['resolution'] = df3.device_screen_resolution.apply(lambda x:eval(x.replace('x','*')))
        for device in df3.device_category.unique():
            q25 = df3[df3.device_category == device].resolution.quantile(0.25)
            q75 = df3[df3.device_category == device].resolution.quantile(0.75)
            iqr = q75 - q25
            bounds.append((device, q25 - 1.5 * iqr, q75 + 1.5 * iqr))

        with open('data/resolution_bounds.txt', 'w') as file:
            for device in bounds:
                line = f"{device[0]}, {device[1]}, {device[2]}\n"
                file.write(line)
        df3 = df3.drop('resolution', axis=1)

    finally:
        
        test_list = list(df3.device_screen_resolution)
        test_list2 = list(df3.device_category)

        for i in range(len(test_list)):
            test_list[i] = eval(test_list[i].replace('x','*'))

        tst_l = list(zip(test_list2, test_list))

        resolution = []

        for i in range(len(tst_l)):
            if tst_l[i][0] == bounds[0][0]:
                resolution.append(bounds[0][0]+'_high' if tst_l[i][1] >= bounds[0][2] * 0.7 else (bounds[0][0]+'_medium' if bounds[0][2] * 0.7 > tst_l[i][1] >= bounds[0][2] * 0.1 else bounds[0][0]+'_low'))
            elif tst_l[i][0] == bounds[1][0]:
                resolution.append(bounds[1][0]+'_high' if tst_l[i][1] >= bounds[1][2] * 0.7 else (bounds[1][0]+'_medium' if bounds[1][2] * 0.7 > tst_l[i][1] >= bounds[1][2] * 0.1 else bounds[1][0]+'_low'))
            elif tst_l[i][0] == bounds[2][0]:
                resolution.append(bounds[2][0]+'_high' if tst_l[i][1] >= bounds[2][2] * 0.7 else (bounds[2][0]+'_medium' if bounds[2][2] * 0.7 > tst_l[i][1] >= bounds[2][2] * 0.1 else bounds[2][0]+'_low'))

        df3['device_screen_resolution_engeneered'] = resolution
        #df3['device_screen_resolution'] = resolution
        
    
    print('resolution_func end')
    print('-')      
    #print('-')      
    #print('-') 
    
    
    return df3
    

In [11]:
def resolution_func_v_2(df3):
    import pandas as pd
    print('resolution_func v2 start')
    #resolution
    bounds = []
    df3['device_screen_resolution'] = df3.device_screen_resolution.apply(lambda x:eval(x.replace('x','*')))

    
    print('resolution_func v2 end')
    print('-')      
    #print('-')      
    #print('-') 
    
    
    return df3
    

In [12]:
def country(df3, trsh = 0.001):
    import pandas as pd
    print('country start')
    #geo_country
    country_list = list(df3.geo_country.unique())
    for i in range(len(country_list)):
        country_list[i] = ( len(df3[df3.geo_country == country_list[i]]), country_list[i])
    country_list = sorted(country_list, reverse=True)

#    trsh = 0.0005
    df3_len = len(df3) 
    for item in country_list:
        if item[0] / df3_len >= trsh:
            continue
        else:
            df3.loc[df3.geo_country == item[1], 'geo_country'] = 'some_unimportant_country'
    
    print( 'country end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [13]:
def country_v_2(df3):
    import pandas as pd
    print('country v2  start')
    #geo_country
    counter = 0
    country_list_new = dict()
    
    
    try:
        with open('data/country_list_new.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                country_list_new[my_tuple[0]] = my_tuple[1]


    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")
        
        succ_total = len(df3[df3.event_action == 1])
        country_list_success = df3[df3.event_action == 1].geo_country.value_counts().sort_values(ascending=False)
        country_list_new = []
        for country in country_list_success.keys():
            country_list_new.append(f'{country}*{str(round(country_list_success[country] / succ_total, 4))}%')
            counter += 1
            if counter == 23:
                break

        with open('data/country_list_new.txt', 'w') as f:
            for t in country_list_new:
                f.write(str(t) +'\n')
                
        country_list_new = dict()        
        with open('data/country_list_new.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                country_list_new[my_tuple[0]] = my_tuple[1]                


    finally:
        df3['geo_country_succ_perc'] = df3['geo_country'].apply(lambda x: country_list_new[x] if x in country_list_new else 0.0001)
        
    

    #print(sum(df4.isnull().sum().values))
    #print(df4.isnull().sum())
    print('country v2 end')    
    print('-')      
    #print('-')      
    #print('-')     
    
    return df3

In [14]:
def country_v_3(df3, trsh=0.001):
    import pandas as pd
    # При первом запуске:
        # 1) создает список .value_counts по странам
        # Первый запуск рекоммендуется производить на полном датасете
        # 2) Записывает список в файл
        # 3)Оставляет только страны, колечество появлений которых составляет больше 0.001 от длины полного датасета
    # При последующих запусках берет готовый файл со значениями, далее п.3

    # print('country_v_3 start')
    country_list = []
    df3_len = len(df3)

    try:
        with open('data/country_list1.txt', 'r') as f:
            for line in f:
                line = line.rstrip('\n').replace('(', '').replace(')', '').replace("'", '')
                tuple_elements = [int(e.strip()) if e.strip().isdigit() else e.strip() for e in line.split(',')]
                my_tuple = tuple(tuple_elements)
                country_list.append(my_tuple)

    except FileNotFoundError:
        # print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")

        country_list = list(zip(df3.geo_country.value_counts().values, df3.geo_country.value_counts().keys()))
        country_list = sorted(country_list, reverse=True)

        with open('data/country_list1.txt', 'w') as f:
            for t in country_list:
                f.write(str(t) + '\n')

    finally:
        country_list_valid = []

        for item in country_list:
            if item[0] / 15000000 >= trsh:
                country_list_valid.append(item[1])

        df3.loc[(~df3['geo_country'].isin(country_list_valid)), 'geo_country'] = 'some_unimportant_country'

    return df3



In [15]:
def city(df3, trsh = 0.001):
    import pandas as pd
    print('city start')
    #geo_city
    city_list = []
    df3_len = len(df3)
    try:
        with open('data/city_list1.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace('(', '').replace(')', '').replace("'", '')
                # split on comma and convert each element to correct type
                tuple_elements = [int(e.strip()) if e.strip().isdigit() else e.strip() for e in line.split(',')]
                # create tuple and add to list
                my_tuple = tuple(tuple_elements)
                city_list.append(my_tuple)


    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")

        city_list = list(zip(df3.geo_city.value_counts().values, df3.geo_city.value_counts().keys() ))
        city_list = sorted(city_list, reverse=True)

        with open('data/city_list1.txt', 'w') as f:
            for t in city_list:
                f.write(str(t) +'\n')



    finally:
#        trsh = 0.0005
        city_list_valid = []
        
        for item in city_list:
            #print(item[1], ' - ', round(item[0] / df3_len, 4),'%' )
            if round(item[0] / 15000000, 4) >= trsh:   #df3_len, 4) >= trsh:
                city_list_valid.append(item[1])
                #print('trsh == 2000 - ', item[0], item[1], round(item[0] / df3_len, 4) >= trsh, ' - appended')

        df3.loc[(~df3['geo_city'].isin(city_list_valid)), 'geo_city'] = 'some_unimportant_city'
    
    

    #print(sum(df4.isnull().sum().values))
    #print(df4.isnull().sum())
    print('city end')    
    print('-')      
    #print('-')      
    #print('-')      
    
    return df3

In [16]:
def city_v_2(df3):
    import pandas as pd
    print('city v2  start')
    #geo_city
    counter = 0
    city_list_new = dict()
    
    
    try:
        with open('data/city_list_new.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                city_list_new[my_tuple[0]] = my_tuple[1]


    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")
        
        succ_total = len(df3[df3.event_action == 1])
        city_list_success = df3[df3.event_action == 1].geo_city.value_counts().sort_values(ascending=False)
        city_list_new = []
        for city in city_list_success.keys():
            city_list_new.append(f'{city}*{str(round(city_list_success[city] / succ_total, 4))}%')
            counter += 1
            if counter == 26:
                break

        with open('data/city_list_new.txt', 'w') as f:
            for t in city_list_new:
                f.write(str(t) +'\n')
                
        with open('data/city_list_new.txt', 'r') as f:
            city_list_new = dict()
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                city_list_new[my_tuple[0]] = my_tuple[1]                


    finally:
        
        df3['geo_city_succ_perc'] = df3['geo_city'].apply(lambda x: city_list_new[x] if x in city_list_new else 0.0001)
        
    

    #print(sum(df4.isnull().sum().values))
    #print(df4.isnull().sum())
    print('city v2 end')    
    print('-')      
    #print('-')      
    #print('-')      
    
    return df3

In [17]:
def device_brand_v_2(df3):
    import pandas as pd
    print('device_brand v2 start')
    #device_brand
    counter = 0
    device_brand_list_new = dict()
    
    
    try:
        with open('data/device_brand_list_new1.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                device_brand_list_new[my_tuple[0]] = my_tuple[1]


    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")
        succ_total = len(df3[df3.event_action == 1])
        device_brand_list_success = df3[df3.event_action == 1].device_brand.value_counts().sort_values(ascending=False)
        device_brand_list_new = []
        for device_brand in device_brand_list_success.keys():
            device_brand_list_new.append(f'{device_brand}*{str(round(device_brand_list_success[device_brand] / succ_total, 4))}%')
            counter += 1
            if counter == 23:
                break

        with open('data/device_brand_list_new1.txt', 'w') as f:
            for t in device_brand_list_new:
                f.write(str(t) +'\n')
                
        device_brand_list_new = dict()
        with open('data/device_brand_list_new1.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace("%", '')
                tuple_elements = line.split('*')
                my_tuple = (tuple_elements[0], eval(tuple_elements[1]))
                device_brand_list_new[my_tuple[0]] = my_tuple[1]                


    finally:
        df3['device_brand_succ_perc'] = df3['device_brand'].apply(lambda x: device_brand_list_new[x] if x in device_brand_list_new else 0.0001)
        
    

    #print(sum(df4.isnull().sum().values))
    #print(df4.isnull().sum())
    print('device_brand v2 end')    
    print('-')      
    #print('-')      
    #print('-')      
    
    return df3

df3 = device_brand_v_2(df3)

df4 = pd.read_csv('data/df3_10k_50n_50p.csv')

df4 = device_brand_v_2(df4)

df4.device_brand.value_counts()

In [18]:
def device_brand(df3, trsh = 0.0012):
    import pandas as pd
    print('device_brand start')
    #device_brand
    brand_list = []
    df3_len = len(df3)
    
    try:
        with open('data/brand_list1.txt', 'r') as f:
            for line in f:
                # remove newline character and parentheses
                line = line.rstrip('\n').replace('(', '').replace(')', '').replace("'", '')
                # split on comma and convert each element to correct type
                tuple_elements = [int(e.strip()) if e.strip().isdigit() else e.strip() for e in line.split(',')]
                # create tuple and add to list
                my_tuple = tuple(tuple_elements)
                brand_list.append(my_tuple)


    except FileNotFoundError:
        print("oh.., looks like its the first time you run it - lil' bit longer then, m8. pls hold:)")

        brand_list = list(zip(df3.device_brand.value_counts().values, df3.device_brand.value_counts().keys() ))
        brand_list = sorted(brand_list, reverse=True)

        with open('data/brand_list1.txt', 'w') as f:
            for t in brand_list:
                f.write(str(t) +'\n')



    finally:
#        trsh = 0.0005
        brand_list_valid = []
        
        for item in brand_list:
            #print(item[0], ' ', item[0] / df3_len,'>=', trsh, ' ', round(item[0] / df3_len, 4) >= trsh )
            if item[0] / df3_len >= trsh:
                brand_list_valid.append(item[1])
                #print(len(brand_list_valid), ' ', item[0],' ',item[1] )

        df3.loc[(~df3['device_brand'].isin(brand_list_valid)), 'device_brand'] = 'some_unimportant_brand'
    
    
    print('device_brand end')    
    print('-')      
    #print('-')      
    #print('-')      
    
    return df3

In [19]:
def encode_stuff(df3):
    import pandas as pd
    print('encode_stuff start')
    cols_to_encode = ['utm_source', 
                      'utm_medium', 
                      'utm_adcontent', 
                      #'device_brand', 
                      'device_category', 
                      'device_screen_resolution', 
                      'device_browser',
                      'utm_campaign'
                      #,'geo_country',
                      #'geo_city'
                     ]
    
    #encoding
    encoded_features = pd.DataFrame()

    for col in cols_to_encode:

        pre_encoded_df3 = df3[[col]]
        encoder = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse=False)
        encoded_array = encoder.fit_transform(pre_encoded_df3)
        #feature_names = [f'{col}_{name}' for name in encoder.get_feature_names_out()]
        feature_names = encoder.get_feature_names_out()
        encoded_df3 = pd.DataFrame(encoded_array, columns=feature_names)

        #if len(encoded_features) == 0:
        #    encoded_features = encoded_df3.copy()
        #else:
        #    encoded_features[feature_names] = encoded_df3.values
        
        df3[feature_names] = encoded_df3.values
    #print(encoded_features.isnull().sum())

    #df3 = df3.join(encoded_features)
    #print(df3.isnull().sum())
    df3 = df3.drop(cols_to_encode, axis=1)
    print( 'encode_stuff end')
    print('-')      
    #print('-')      
    #print('-')  
    
    return df3

In [107]:
def scale_stuff(df3):
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    #scaling
    print('scale_stuff start')
    cols_to_scale = [
        'camp_succ_rate',
        'geo_country_succ_perc',
        'geo_city_succ_perc',
        'device_brand_succ_perc',
        'device_screen_resolution'
        #'visit_number',
                     #'day_of_week',
                     #'device_screen_resolution'
                               
                    ]

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(df3.loc[:,cols_to_scale])
    scaled_feature_names = [f'{name}_scaled' for name in scaler.get_feature_names_out()]
    #scaler.get_feature_names_out()

    #scaled_df = pd.DataFrame(scaled_features, columns=scaled_feature_names)
    df3[scaled_feature_names] = scaled_features
    #print(scaled_df.shape, scaled_df.columns)
    #print(scaled_df.isnull().sum())

    #df3['scaled_feature_names'] = scaled_df
    #print(df3.shape, df3.columns)
    df3 = df3.drop(cols_to_scale, axis=1)
    #for column in df3.columns:
    #    print(column)
    #print(len(df3.columns))
    #print(df3.isnull().sum())
    #print(len(df3.columns), df3.columns)
    print('scale_stuff end')
    print(len(df3.columns))
    features_list_testing = []
    potential_col_number = 0
    for column in df3.columns:
        if df3[column].dtype != 'int64' or df3[column].dtype != 'float64':
            potential_col_number += len(df3[column].unique())
        
    for i in df3.columns:
        features_list_testing.append(i)
    with open('data/features_list_testing.txt', 'w') as f:
        for t in features_list_testing:
            f.write(str(t) +'\n')

    print(potential_col_number)      
    #print('-')  
    
    return df3

In [104]:
df4.event_action.unique()

array([1., 0.])

In [89]:
def check_stuff_4(df3):
    features_list_testing = []
    #print(df3)
    for i in df3.columns:
        features_list_testing.append(i)
    with open('data/features_list_testing.txt', 'w') as f:
        for t in features_list_testing:
            f.write(str(t) +'\n')
    return df3

In [21]:
def filter_stuff(df3):
    import pandas as pd
    #pre-existing list of columns
    print('filter_stuff start')
    cols_to_drop = [
        'session_id',
        'hit_date',
        'hit_time',
        'hit_number',
        'hit_type',
        'hit_referer',
        'hit_page_path',
        'event_category',
        'event_label',
        'event_value',
        'client_id',
        #'new_date',
        'visit_date',
        'visit_number',
        'utm_keyword',
        'device_os',
        'device_model',
        'visit_time'
    ]
    
    #cols_to_encode = [
     #   'utm_source',
      #  'utm_medium', 
       ## 'utm_adcontent',
        #'device_brand', 
        #'device_category', 
        #'device_screen_resolution',
        #'device_browser',
        #'utm_campaign',
        #'geo_country',
        #'geo_city'
    #]
    #dropping
    #cols_to_drop = []
    #for col in df_columns:
    #    cols_to_drop.append(str(col))
    #cols_to_drop = cols_to_drop + ['client_id','new_date', 'visit_date', 'utm_keyword', 'device_os', 'device_model', 'visit_time']    
    
    df3 = df3.drop(cols_to_drop, axis=1)
    #df3 = df3.drop(cols_to_encode, axis=1)
    
    try:
        df3 = df3.drop('Unnamed: 0', axis=1)
    except KeyError:
        pass
    try:
        df3 = df3.drop('Unnamed: 0.1', axis=1)
    except KeyError:
        pass
    try:
        df3 = df3.drop('Unnamed: 0.2', axis=1)
    except KeyError:
        pass
    
    print('filter_stuff end')
    #print(sum(df3.isnull().sum().values))
    #print(df3.isnull().sum())
    print(df3.columns)
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [22]:
def filter_stuff_2(df3):
    import pandas as pd
    #pre-existing list of columns
    print('filter_stuff_2 start')
    cols_to_drop = [
        # 'session_id',
        # 'hit_date',
        # 'hit_time',
        # 'hit_number',
        # 'hit_type',
        # 'hit_referer',
        # 'hit_page_path',
        # 'event_category',
        # 'event_label',
        # 'event_value',
        # 'client_id',
        # 'new_date',
        # 'visit_date',
        # 'visit_number',
        'utm_keyword',
        'device_os',
        'device_model',
        # 'visit_time'
    ]
    
    #cols_to_encode = [
     #   'utm_source',
      #  'utm_medium', 
       ## 'utm_adcontent',
        #'device_brand', 
        #'device_category', 
        #'device_screen_resolution',
        #'device_browser',
        #'utm_campaign',
        #'geo_country',
        #'geo_city'
    #]
    #dropping
    #cols_to_drop = []
    #for col in df_columns:
    #    cols_to_drop.append(str(col))
    #cols_to_drop = cols_to_drop + ['client_id','new_date', 'visit_date', 'utm_keyword', 'device_os', 'device_model', 'visit_time']    
    
    df3 = df3.drop(cols_to_drop, axis=1)
    #df3 = df3.drop(cols_to_encode, axis=1)
    
    try:
        df3 = df3.drop('Unnamed: 0', axis=1)
    except KeyError:
        pass
    try:
        df3 = df3.drop('Unnamed: 0.1', axis=1)
    except KeyError:
        pass
    try:
        df3 = df3.drop('Unnamed: 0.2', axis=1)
    except KeyError:
        pass
    
    print('filter_stuff_2 end')
    #print(sum(df3.isnull().sum().values))
    #print(df3.isnull().sum())
    print(df3.columns)
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [23]:
def check_stuff(df3):
    import pandas as pd
    #checking
    print('check_stuff start')
    counter = 0
    for feature in df3.columns:
        if df3[feature].dtype != 'O':
            #print(feature, ' - ', df3[feature].dtype)
            counter += 1
        else:
            print(feature)
    print(counter == len(df3.columns))


    #checking 2
    counter = 0
    for feature in df3.columns:
        if len(df3[df3[str(feature)].isna() == True]) != 0:
            print(feature, ' - ', len(df3[df3[str(feature)].isna() == True]))
            counter += 1




    if counter == 0:
        print('vse zaebis", pustukh fi4ei net')    
    
    
    print('check_stuff end')
    print('-')      
    #print('-')      
    #print('-') 
    
    return df3

In [24]:
def check_stuff_2(df3):
    import pandas as pd
    #checking
    print('check_stuff_2 start')

    counter = 0
    for feature in df3.columns:
        if df3[feature].dtype != 'O':
            #print(feature, ' - ', df3[feature].dtype)
            counter += 1
        else:
            print(feature)
    print(counter == len(df3.columns))


    #checking 2
    empty_features = False
    if sum(df3.isnull().sum()) != 0:
        print(df3.isnull().sum())
        empty_features = True




    if empty_features == False:
        print('vse zaebis", pustukh fi4ei net') 
    #print(len(df3.isnull().sum()))
    print(df3.shape,  'check_stuff_2 end') #df3.shape,
    print('-')      
    #print('-')      
    #print('-')     
    
    
    return df3

In [25]:
def check_stuff_3(df3):
    import pandas as pd
    for column in df3.columns:
        print(column)
        print(df3[column].value_counts())
    print(len(df3.columns))
    print(' - ')
    
    return df3

In [26]:
def predict_stuff(df3):
    import pandas as pd
    y = df3['event_action']
    
    df3 = df3.drop('event_action', axis=1)
    print(df3.columns)
    x_train, x_test, y_train, y_test = train_test_split(df3,y, test_size=0.3)
    
    rf = RandomForestClassifier(n_estimators=400, min_samples_leaf=2, max_features='sqrt')
    rf.fit(x_train, y_train)
    
    predicted_train = rf.predict(x_train)
    predicted_test = rf.predict(x_test)
    
    #print(df3.shape, ' - shape', ' function - ')
    
    
    print('train acc score - ',accuracy_score(y_train, predicted_train))
    print('test acc score - ', accuracy_score(y_test, predicted_test))

    print('train roc score - ',roc_auc_score(y_train, rf.predict_proba(x_train)[:,1]))
    print('test roc score - ',roc_auc_score(y_test, rf.predict_proba(x_test)[:,1]))
    pass


with open('models/rf_model.pkl', 'wb') as file:
    dill.dump(rf, file)

## function declarations end here. its wildlands after that...


In [39]:
df = pd.read_csv('data/ga_hits.csv')

In [40]:
df2 = pd.read_csv('data/ga_sessions.csv')

  df2 = pd.read_csv('data/ga_sessions.csv')


In [41]:
df3 = pd.merge(df, df2, on='session_id')

In [42]:
df3 = event_action(df3)

event_action start
event_action end
-


In [26]:
cv_scores = [
    [0.6603, 'ad_camp V, resol V, cntry V, ct V, brand V, 200k 50/50'],
    [0.6576, 'ad_camp V, resol V, cntry V, ct V, brand V, 200k 70/30'],
    [0.6456, 'ad_camp V, resol V, cntry V, ct V, brand V, 140k 30/70'],
    [0.6469, 'ad_camp V, resol V, cntry V, ct V, brand V, 100k 50/50'],
    [0.6428, 'ad_camp V, resol V, cntry V, ct V, brand V, 100k 70/30'],
    [0.6447, 'ad_camp V, resol V, cntry V, ct V, brand V, 100k 30/70'],
    [0.6438, 'ad_camp v2 V, resol V, cntry V, ct V, brand V, 100k 30/70'],
    [0.6445, 'ad_camp v2 V, resol V, cntry V, ct V, brand v2 V, 100k 30/70'],
    [0.6459, 'ad_camp V, resol V, cntry V, ct V, brand v2 V, 100k 30/70'],
    [0.6312, 'ad_camp V, resol V, cntry V, ct v2 V, brand v2 V, 100k 30/70'],
    [0.6463, 'ad_camp V, resol V, cntry v 2V, ct V, brand v2 V, 100k 30/70'],
    [0.6461, 'ad_camp V, resol v2 V, cntry v2 V, ct V, brand v2 V, 100k 30/70'],
    [0.6482, 'ad_camp V, resol v2 V, cntry v2 V, ct V, brand v2 V, 100k 50/50'],
    [0.6486, 'ad_camp V, resol v2 V, cntry v2 V, ct V, brand v2 V, 100k 50/50'],
    [0.6463, 'ad_camp X, resol v2 V, cntry v2 V, ct V, brand v2 V, 100k 50/50'],
    [0.6489, 'ad_camp V, resol v2 V, cntry v2 V, cntry V, ct v2 V, ct V, brand v2 V, 100k 50/50'],
    [0.6605, 'ad_camp V, resol v2 V, cntry v2 V, cntry V, ct v2 V, ct V, brand v2 V, 200k 50/50']
]

In [24]:
scores_single_fit = []

In [90]:
df4 = pd.read_csv('data/df3_2k_50n_50p_maxi.csv')

categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


f_transformer = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, make_column_selector(dtype_include='object'))
])

preprocessor = Pipeline(steps=[
    #('event', FunctionTransformer(event_action)),
    #('sampling', FunctionTransformer(sample_df3)),
    ('ad_campaign_feature_creating v2', FunctionTransformer(ad_campaign_v_2)),
    ('day_of_week', FunctionTransformer(day_of_week)),
    ('empties', FunctionTransformer(empties)),
    #('resolution_func', FunctionTransformer(resolution_func)),
    ('resolution_func v2', FunctionTransformer(resolution_func_v_2)),
    ('country v2', FunctionTransformer(country_v_2)),
    ('country v3', FunctionTransformer(country_v_3)),
    ('city v2', FunctionTransformer(city_v_2)),
    ('city', FunctionTransformer(city)),
    ('device brand v2', FunctionTransformer(device_brand_v_2)),
    ('device brand', FunctionTransformer(device_brand)),
    ('filter_stuff', FunctionTransformer(filter_stuff)),
    #('encode_stuff', FunctionTransformer(encode_stuff)),
    ('scale_stuff(resol, )', FunctionTransformer(scale_stuff)),
    #('check_stuff_3', FunctionTransformer(check_stuff_3)),
    ('f_transformer', f_transformer),
    #('filter_stuff', FunctionTransformer(filter_stuff)),
    #('check_stuff_3', FunctionTransformer(check_stuff_3))
])

models = [
    #RandomForestClassifier(n_estimators=300, max_depth= 10, max_features='sqrt', min_samples_split=2),
    #SVC(C=10, gamma=0.01, kernel='rbf'),
    #DecisionTreeClassifier(criterion='gini', max_depth=7, min_samples_split=10),
    #LogisticRegression( C=1.0, penalty='l2', solver='saga'),
    MLPClassifier(hidden_layer_sizes=(100, ), solver='adam', activation='tanh')
    ]

for model in models:
    

    pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor), 
        ('classifier', model)  
    ])

    y = df4['event_action']
    x = df4.drop('event_action', axis=1)

    #x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)
    
    
    #predictions = pipeline.predict(x_test)
    #probs = pipeline.predict_proba(x_test)
    #scores_single_fit.append([type(model).__name__,' test roc score - ', roc_auc_score(y_test, probs[:,1])])
    #scores_single_fit.append([type(model).__name__,' test acc score - ', accuracy_score(y_test, predictions)])
    #print(type(model).__name__,' test roc score - ', roc_auc_score(y_test, probs[:,1]))
    #print(type(model).__name__,' test acc score - ', accuracy_score(y_test, predictions))
    #interm_scores.append((str(model), 'test roc score - ', roc_auc_score(y_test, pipeline.predict_proba(x_test)[:,1])))
    #interm_scores.append((str(model), 'test acc score - ', accuracy_score(y_test, predictions)))
    
    log = 'ad_camp V, resol V, cntry v2 V, cntry V, ct v2 V, ct V, brand v2 V, 200k 50/50'
    score = cross_val_score(pipeline, x, y, cv=4, scoring='roc_auc')
    #score = cross_val_score(pipeline, x, y, cv=4, scoring='accuracy')
    cv_scores.append([ round(score.mean(), 4), log]) #type(model).__name__,
    pipeline.fit(x, y)
    
with open('models/pipeline_3_tst_maxi_old_pipe_3.pkl', 'wb') as file:
    dill.dump({
        'model': pipeline,
        'metadata': {
            'name': 'sber_auto_sub_model_1',
            'author': 'well... me, i guess:)',
            'version': 0.00000000000000001,
            'type': type(pipeline.named_steps["classifier"]).__name__,
            'roc-auc': cv_scores[-1][0]
        }
    }, file)

ad_campaign v2 start
ad_campaign v2 end
-
day_of_week start
day_of_week end
-
empties end
empties end
-
resolution_func v2 start
resolution_func v2 end
-
country v2  start
country v2 end
-
city v2  start
city v2 end
-
city start
city end
-
device_brand v2 start
device_brand v2 end
-
device_brand start
device_brand end
-
filter_stuff start
filter_stuff end
Index(['utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent',
       'device_category', 'device_brand', 'device_screen_resolution',
       'device_browser', 'geo_country', 'geo_city', 'camp_succ_rate',
       'day_of_week', 'geo_country_succ_perc', 'geo_city_succ_perc',
       'device_brand_succ_perc'],
      dtype='object')
-
scale_stuff start
utm_source
utm_medium
utm_campaign
utm_adcontent
device_category
device_brand
device_browser
geo_country
geo_city
day_of_week
camp_succ_rate_scaled
geo_country_succ_perc_scaled
geo_city_succ_perc_scaled
device_brand_succ_perc_scaled
device_screen_resolution_scaled
15
scale_stuff end
-




ad_campaign v2 start
ad_campaign v2 end
-
day_of_week start
day_of_week end
-
empties end
empties end
-
resolution_func v2 start
resolution_func v2 end
-
country v2  start
country v2 end
-
city v2  start
city v2 end
-
city start
city end
-
device_brand v2 start
device_brand v2 end
-
device_brand start
device_brand end
-
filter_stuff start
filter_stuff end
Index(['utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent',
       'device_category', 'device_brand', 'device_screen_resolution',
       'device_browser', 'geo_country', 'geo_city', 'camp_succ_rate',
       'day_of_week', 'geo_country_succ_perc', 'geo_city_succ_perc',
       'device_brand_succ_perc'],
      dtype='object')
-
scale_stuff start
utm_source
utm_medium
utm_campaign
utm_adcontent
device_category
device_brand
device_browser
geo_country
geo_city
day_of_week
camp_succ_rate_scaled
geo_country_succ_perc_scaled
geo_city_succ_perc_scaled
device_brand_succ_perc_scaled
device_screen_resolution_scaled
15
scale_stuff end
-
ad_



ad_campaign v2 start
ad_campaign v2 end
-
day_of_week start
day_of_week end
-
empties end
empties end
-
resolution_func v2 start
resolution_func v2 end
-
country v2  start
country v2 end
-
city v2  start
city v2 end
-
city start
city end
-
device_brand v2 start
device_brand v2 end
-
device_brand start
device_brand end
-
filter_stuff start
filter_stuff end
Index(['utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent',
       'device_category', 'device_brand', 'device_screen_resolution',
       'device_browser', 'geo_country', 'geo_city', 'camp_succ_rate',
       'day_of_week', 'geo_country_succ_perc', 'geo_city_succ_perc',
       'device_brand_succ_perc'],
      dtype='object')
-
scale_stuff start
utm_source
utm_medium
utm_campaign
utm_adcontent
device_category
device_brand
device_browser
geo_country
geo_city
day_of_week
camp_succ_rate_scaled
geo_country_succ_perc_scaled
geo_city_succ_perc_scaled
device_brand_succ_perc_scaled
device_screen_resolution_scaled
15
scale_stuff end
-
ad_



ad_campaign v2 start
ad_campaign v2 end
-
day_of_week start
day_of_week end
-
empties end
empties end
-
resolution_func v2 start
resolution_func v2 end
-
country v2  start
country v2 end
-
city v2  start
city v2 end
-
city start
city end
-
device_brand v2 start
device_brand v2 end
-
device_brand start
device_brand end
-
filter_stuff start
filter_stuff end
Index(['utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent',
       'device_category', 'device_brand', 'device_screen_resolution',
       'device_browser', 'geo_country', 'geo_city', 'camp_succ_rate',
       'day_of_week', 'geo_country_succ_perc', 'geo_city_succ_perc',
       'device_brand_succ_perc'],
      dtype='object')
-
scale_stuff start
utm_source
utm_medium
utm_campaign
utm_adcontent
device_category
device_brand
device_browser
geo_country
geo_city
day_of_week
camp_succ_rate_scaled
geo_country_succ_perc_scaled
geo_city_succ_perc_scaled
device_brand_succ_perc_scaled
device_screen_resolution_scaled
15
scale_stuff end
-
ad_



In [84]:
cv_scores

[[0.6603, 'ad_camp V, resol V, cntry V, ct V, brand V, 200k 50/50'],
 [0.6576, 'ad_camp V, resol V, cntry V, ct V, brand V, 200k 70/30'],
 [0.6456, 'ad_camp V, resol V, cntry V, ct V, brand V, 140k 30/70'],
 [0.6469, 'ad_camp V, resol V, cntry V, ct V, brand V, 100k 50/50'],
 [0.6428, 'ad_camp V, resol V, cntry V, ct V, brand V, 100k 70/30'],
 [0.6447, 'ad_camp V, resol V, cntry V, ct V, brand V, 100k 30/70'],
 [0.6438, 'ad_camp v2 V, resol V, cntry V, ct V, brand V, 100k 30/70'],
 [0.6445, 'ad_camp v2 V, resol V, cntry V, ct V, brand v2 V, 100k 30/70'],
 [0.6459, 'ad_camp V, resol V, cntry V, ct V, brand v2 V, 100k 30/70'],
 [0.6312, 'ad_camp V, resol V, cntry V, ct v2 V, brand v2 V, 100k 30/70'],
 [0.6463, 'ad_camp V, resol V, cntry v 2V, ct V, brand v2 V, 100k 30/70'],
 [0.6461, 'ad_camp V, resol v2 V, cntry v2 V, ct V, brand v2 V, 100k 30/70'],
 [0.6482, 'ad_camp V, resol v2 V, cntry v2 V, ct V, brand v2 V, 100k 50/50'],
 [0.6486, 'ad_camp V, resol v2 V, cntry v2 V, ct V, brand v2 

In [108]:
df4 = pd.read_csv('data/df3_200k_50n_50p_2_step_backup.csv')
categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


f_transformer = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, make_column_selector(dtype_include='object'))
])

preprocessor = Pipeline(steps=[
    #('event', FunctionTransformer(event_action)),
    #('sampling', FunctionTransformer(sample_df3)),
    ('ad_campaign_feature_creating v2', FunctionTransformer(ad_campaign_v_2)),
    #('day_of_week', FunctionTransformer(day_of_week)),
    ('empties', FunctionTransformer(empties)),
    #('resolution_func', FunctionTransformer(resolution_func)),
    ('resolution_func v2', FunctionTransformer(resolution_func_v_2)),
    ('country v2', FunctionTransformer(country_v_2)),
    ('country v3', FunctionTransformer(country_v_3)),
    ('city v2', FunctionTransformer(city_v_2)),
    ('city', FunctionTransformer(city)),
    ('device brand v2', FunctionTransformer(device_brand_v_2)),
    ('device brand', FunctionTransformer(device_brand)),
    ('filter_stuff v2', FunctionTransformer(filter_stuff_2)),
    #('encode_stuff', FunctionTransformer(encode_stuff)),
    ('scale_stuff(resol, )', FunctionTransformer(scale_stuff)),
    #('check_stuff_3', FunctionTransformer(check_stuff_3)),
    ('f_transformer', f_transformer),
    #('filter_stuff', FunctionTransformer(filter_stuff)),
    #('check_stuff_4', FunctionTransformer(check_stuff_4))
])

models = [
    #RandomForestClassifier(n_estimators=300, max_depth= 10, max_features='sqrt', min_samples_split=2),
    #SVC(C=10, gamma=0.01, kernel='rbf'),
    #DecisionTreeClassifier(criterion='gini', max_depth=7, min_samples_split=10),
    #LogisticRegression( C=1.0, penalty='l2', solver='saga'),
    MLPClassifier(hidden_layer_sizes=(100, ), solver='adam', activation='tanh')
    ]

for model in models:
    

    pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor), 
        ('classifier', model)  
    ])

    y = df4['event_action']
    x = df4.drop('event_action', axis=1)

    #x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)
    
    
    #predictions = pipeline.predict(x_test)
    #probs = pipeline.predict_proba(x_test)
    #scores_single_fit.append([type(model).__name__,' test roc score - ', roc_auc_score(y_test, probs[:,1])])
    #scores_single_fit.append([type(model).__name__,' test acc score - ', accuracy_score(y_test, predictions)])
    #print(type(model).__name__,' test roc score - ', roc_auc_score(y_test, probs[:,1]))
    #print(type(model).__name__,' test acc score - ', accuracy_score(y_test, predictions))
    #interm_scores.append((str(model), 'test roc score - ', roc_auc_score(y_test, pipeline.predict_proba(x_test)[:,1])))
    #interm_scores.append((str(model), 'test acc score - ', accuracy_score(y_test, predictions)))
    
    #log = 'ad_camp V, resol V, cntry v2 V, cntry V, ct v2 V, ct V, brand v2 V, 200k 50/50'
    #score = cross_val_score(pipeline, x, y, cv=4, scoring='roc_auc')
    #score = cross_val_score(pipeline, x, y, cv=4, scoring='accuracy')
    #cv_scores.append([ round(score.mean(), 4), log]) #type(model).__name__,
    pipeline.fit(x, y)


ad_campaign v2 start
ad_campaign v2 end
-
empties end
empties end
-
resolution_func v2 start
resolution_func v2 end
-
country v2  start
country v2 end
-
city v2  start
city v2 end
-
city start
city end
-
device_brand v2 start
device_brand v2 end
-
device_brand start
device_brand end
-
filter_stuff_2 start
filter_stuff_2 end
Index(['utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent',
       'device_category', 'device_brand', 'device_screen_resolution',
       'device_browser', 'geo_country', 'geo_city', 'camp_succ_rate',
       'geo_country_succ_perc', 'geo_city_succ_perc',
       'device_brand_succ_perc'],
      dtype='object')
-
scale_stuff start
scale_stuff end
14
1995




In [71]:
model = pipeline.named_steps['classifier']

In [None]:
model.nam

In [75]:
coefs = model.coefs_[0]

In [79]:
len(coefs)

815

In [77]:
for idx, coef in enumerate(coefs):
    print(f"Feature {idx}: Importance - {coef}")

Feature 0: Importance - [ 3.20108931e-01 -9.28777740e-03 -1.76539962e-02 -6.64081598e-03
  1.32001570e-02  1.62841216e-02 -1.83442461e-02 -1.52997800e-02
  8.32464878e-03  1.28177681e-01 -2.90916327e-03 -3.27461982e-03
  3.21291290e-03 -5.07812683e-02  9.63777629e-03  1.84591014e-01
 -1.33355425e-01  3.96602271e-04 -8.79523237e-02  8.66984670e-03
  8.88334367e-04 -6.10246782e-03  2.09619325e-02  1.67598271e-01
 -2.13291839e-01  1.26168074e-02 -1.71622657e-03  1.52261399e-01
  9.89440191e-03 -3.13656160e-01 -1.92263131e-02  5.72054362e-02
  1.58955121e-02 -6.74696298e-03 -2.74827016e-03 -3.00772734e-01
 -1.84839779e-02 -1.32659555e-02  1.66024384e-01 -2.03170352e-02
  1.52940987e-02  2.37968237e-02 -1.80103114e-02 -8.92241118e-03
  9.84012124e-04  1.70567941e-02  1.45038991e-02  3.78546040e-03
  6.48121681e-03  2.22334660e-02 -1.60422539e-02 -9.59656015e-03
 -1.73817103e-02 -1.66951497e-02  3.68442363e-03 -5.03007107e-03
  1.86416803e-02 -4.91066193e-03  1.52546121e-02  7.46891294e-03
 

Feature 148: Importance - [ 3.47381107e-02 -3.47852128e-03 -4.27358954e-03 -2.56082625e-02
  3.61599218e-03 -2.05891391e-03 -9.24806353e-03 -8.65650246e-03
 -1.49165467e-02  5.56735798e-01  4.26607376e-03  1.12105230e-02
 -9.80223904e-03  3.93727954e-02 -7.23492822e-03 -3.60678314e-02
  7.21816538e-02 -1.84470497e-03  1.74371579e-01 -2.86507226e-02
 -1.45156897e-01 -1.75363567e-02 -2.87508172e-02  4.41441785e-01
 -1.32232583e-01  4.10537811e-02  1.82668314e-03  5.88051595e-02
  8.12492906e-03  5.05897232e-01 -1.07505691e-02  1.87268808e-01
  1.99235637e-03  2.86008294e-02  1.52428077e-02 -2.63624234e-02
 -6.96398444e-03  9.94303548e-03  1.86116470e-01  2.33525860e-02
 -1.77198809e-02 -1.69346447e-02  1.77151448e-02  2.93433086e-03
 -1.73259298e-02 -1.03165266e-02  1.58405310e-02  3.40721669e-03
  3.18607369e-02 -4.22512197e-03 -7.75689000e-03 -8.37959016e-03
  1.49086934e-02  2.33133767e-03 -2.90572882e-02 -1.13181453e-02
  1.75331801e-02  2.03477872e-02 -3.62466471e-02 -5.67396377e-03

Feature 297: Importance - [-2.21206747e-01 -1.04413939e-12  3.43048701e-08  1.33581169e-06
 -1.21340179e-07  9.70743062e-08 -7.65475402e-12  1.86427882e-14
 -6.05414375e-09 -1.35301733e-01 -2.63706503e-16 -1.31611557e-07
 -1.47321761e-09  1.58035261e-03  2.02345773e-08 -2.97786974e-02
  6.03665081e-03  2.66165004e-12 -1.01544761e-15 -8.81439540e-07
 -2.03816988e-02 -5.87383393e-09  1.59127813e-07 -4.87208391e-02
  1.46009447e-01  4.74323877e-06  1.39623842e-09 -1.64742540e-02
 -2.80247042e-11  4.82733574e-02 -7.27587772e-08 -1.44579889e-01
 -2.78069655e-09  4.38367429e-05 -1.04243465e-08  1.98886927e-02
 -1.89465943e-15  1.10723622e-08 -8.62785317e-03  1.07347412e-08
  1.37060579e-07 -7.36727991e-04  2.24489509e-09 -8.07678373e-16
 -1.36819589e-05 -5.79123540e-05 -3.02423552e-08 -2.78805544e-08
  8.91746238e-16  1.01392408e-08  4.73783785e-16  6.83513486e-06
 -1.24252192e-14 -3.46528286e-13 -1.60958010e-14  2.13806833e-08
 -1.46466701e-14 -5.66974284e-15  1.47382771e-14  9.06646488e-08

Feature 515: Importance - [ 1.87277200e-01 -2.56029858e-03 -5.27496985e-03 -9.87437440e-04
  1.77408129e-03  8.26475823e-03 -7.49036484e-03 -6.04867035e-03
  3.87761923e-03  2.67119241e-01 -1.19818920e-03  1.27672270e-03
  1.28798635e-03 -2.05220130e-01  2.96874883e-03  3.09173053e-01
 -2.06708873e-01  5.24484708e-04 -1.34749652e-01  3.29934896e-03
  3.77000676e-01 -1.82782465e-03  8.78192238e-03  2.03650833e-01
 -3.65932387e-01  3.76375642e-03  7.64294949e-06  2.43307127e-01
  2.94495557e-03 -3.16181740e-01 -7.63201080e-03  3.13343065e-01
  6.50098343e-03  9.35324902e-05  1.18436054e-03 -4.34374585e-01
 -7.72034737e-03 -3.01467332e-03  2.47576838e-01 -9.17186214e-03
  4.41695238e-03  2.27698633e-01 -7.07677057e-03 -2.48341615e-03
  6.71765287e-04  3.23417604e-03  6.42922789e-03  1.55998789e-03
  3.74270638e-03  1.14709976e-02 -7.29326026e-03 -3.21996697e-03
 -5.73855259e-03 -6.24496503e-03 -8.64380422e-04 -1.21442020e-03
  8.27691250e-03  1.18713289e-04  5.29799703e-03  2.72677804e-03

Feature 687: Importance - [ 7.78143754e-01  1.81318843e-02 -6.26672936e-02  2.13257637e-02
  5.41656705e-02 -5.81168687e-02  4.67052153e-02 -4.18082048e-02
  1.71136978e-02  9.27119534e-02 -1.96156997e-01 -4.61525612e-02
  2.15329872e-02  5.55036994e-01  1.11612225e-01 -1.58969454e+00
  2.25767774e-01 -4.85967859e-02 -2.19375526e+00 -1.74580188e-02
  9.84207416e-01  1.39828358e-01  1.45871194e-01  5.41070718e-01
  7.03207225e-01 -9.03844490e-02 -8.11211568e-02  7.68717520e-01
  4.13164092e-03 -8.21520589e-01  8.39792099e-02  4.80166595e-01
 -1.59216091e-01 -1.09210601e-01  1.35650708e-01 -1.09911756e+00
  9.09604508e-02  5.54790208e-02 -1.91190616e+00 -3.11009174e-02
  9.91905538e-02 -8.78552405e-01 -9.66043556e-02 -3.98427284e-02
  8.96088745e-02  1.27962689e-01  7.79263713e-02  9.87965205e-02
  6.38021231e-02 -1.33844339e-02 -1.00947090e-03 -4.86433908e-02
 -9.76801740e-02 -3.66377916e-02  7.60371678e-02 -6.66185906e-02
 -4.00632775e-02 -2.51766045e-02  4.77157502e-02  5.01061190e-02

In [111]:
!pip install geopy


Collecting geopy
  Downloading geopy-2.3.0-py3-none-any.whl (119 kB)
     -------------------------------------- 119.8/119.8 kB 2.3 MB/s eta 0:00:00
Collecting geographiclib<3,>=1.52
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
     -------------------------------------- 40.3/40.3 kB 192.8 kB/s eta 0:00:00
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.3.0


In [124]:
df4.geo_country.value_counts()

Russia             195490
Ukraine               796
United States         346
Germany               306
Belarus               285
                    ...  
North Macedonia         1
Nigeria                 1
Burundi                 1
Slovakia                1
South Sudan             1
Name: geo_country, Length: 104, dtype: int64

In [128]:
df3.shape

(15685219, 28)

In [169]:
from geopy.geocoders import Nominatim

# Create a geocoder object
geolocator = Nominatim(user_agent="whatever")

# Get coordinates for a city

locations = dict()
locations_to_write = []
russia_cities = df4[df4['geo_country'] == 'Russia'].geo_city.unique()
for city in russia_cities:
    location = geolocator.geocode(f"{city}")
    
    if location == '(not set)':
        continue

    elif location:
        locations[str(city)] = {'lat': location.latitude, 'long': location.longitude}
        locations_to_write.append(f'{city, location.latitude, location.longitude}')

    else:
        continue
with open('data/locations_coords.txt', 'w') as file:
        for location in locations_to_write:
            line = f"{location}\n"
            file.write(line)
            


In [170]:
locations_2 = dict()
with open('data/locations_coords.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        locs = line.strip().replace("'", '').replace('(', '').replace(')', '').split(', ')
        locations_2[str(locs[0])] =  {'lat': eval(locs[1]), 'long':eval(locs[2])}

In [172]:

locations_2 == locations

False

In [None]:
x1 = locations['Moscow']['lat']
y1 = locations['Moscow']['long']


In [178]:
distance = ((x1 - x2) **2 + (y1 - y2)**2) ** 0.5



NameError: name 'x1' is not defined