In [118]:
import pandas as pd
import hashlib

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [119]:
data_cheak = pd.read_csv('save_df1', low_memory=False)

In [120]:
data_cheak = pd.DataFrame(data_cheak)
df_ch_copy = data_cheak.copy()

In [121]:
df_filtered = df_ch_copy.loc[df_ch_copy['target']==0]

In [122]:
df_filtered = df_filtered.dropna()

In [123]:
random_rows = df_filtered.sample(frac=0.45, random_state=42)

In [124]:
df_filtered = df_filtered.drop(random_rows.index)

In [125]:
df_filtered_test = df_filtered.sample(frac=0.1, random_state=42)

In [126]:
df_filtered_train = df_filtered.drop(df_filtered_test.index)

In [127]:
df_filtered_target = df_ch_copy.loc[df_ch_copy['target']==1]

In [128]:
imputer = SimpleImputer(strategy='most_frequent')

In [129]:
columns = list(df_filtered_target.columns)

In [130]:
df_filtered_target = imputer.fit_transform(df_filtered_target[columns])

In [131]:
df_filtered_target = pd.DataFrame(df_filtered_target, columns=columns)

In [132]:
df_target_test = df_filtered_target.sample(frac=0.1, random_state=42)

In [133]:
df_target_train = df_filtered_target.drop(df_target_test.index)

In [134]:
df_target_train = pd.concat([df_target_train, df_target_train, df_target_train, df_target_train], ignore_index=True)

In [135]:
train_data = pd.concat([df_target_train, df_filtered_train])

In [136]:
test_data = pd.concat([df_target_test, df_filtered_test])

In [137]:
train_data = train_data.sample(frac=1)

In [138]:
test_data = test_data.sample(frac=1)

In [139]:
train_data = train_data.drop('Unnamed: 0', axis=1)

In [140]:
test_data = test_data.drop('Unnamed: 0', axis=1)

In [141]:
train_data.to_csv('train_data', index=False)

In [142]:
test_data.to_csv('test_data', index=False)

In [143]:
df = pd.read_csv('train_data', low_memory=False)
df = pd.DataFrame(df)

In [144]:
df_copy = df.copy()
df_copy['width'] = pd.to_numeric(df_copy['device_screen_resolution'].str.split('x').str[0])
df_copy['height'] = pd.to_numeric(df_copy['device_screen_resolution'].str.split('x').str[1])

df_copy['datetime'] = pd.to_datetime(df_copy['visit_date'], format='%Y-%m-%d') + \
                 pd.to_timedelta(df_copy['visit_time'])
df_copy['year'] = df_copy['datetime'].dt.year
df_copy['month'] = df_copy['datetime'].dt.month
df_copy['day'] = df_copy['datetime'].dt.day
df_copy['hour'] = df_copy['datetime'].dt.hour
df_copy['minute'] = df_copy['datetime'].dt.minute

df_copy['geo'] = df_copy['geo_country'] + '/' + df_copy['geo_city']

drop_col = ['session_id', 'client_id', 'device_screen_resolution', 'visit_date', 'visit_time', 'datetime', 'geo_country', 'geo_city']

df_copy = df_copy.drop(drop_col, axis=1)

In [145]:
X = df_copy.drop('target', axis=1)
y = df_copy['target']

In [146]:
scal_columns = ['visit_number', 'width', 'height', 'year', 'month', 'day', 'hour', 'minute']
encod_columns = ['utm_medium', 'device_category', 'device_os', 'device_brand', 'device_browser', 'geo']
hash_columns = ['utm_source', 'utm_campaign', 'utm_adcontent', 'utm_keyword', 'device_model']

In [147]:
scaler = StandardScaler()
X[scal_columns] = scaler.fit_transform(X[scal_columns])

In [148]:
X[hash_columns] = X[hash_columns].applymap(lambda x: hashlib.sha256(x.encode('utf-8')).hexdigest())

  X[hash_columns] = X[hash_columns].applymap(lambda x: hashlib.sha256(x.encode('utf-8')).hexdigest())


In [149]:
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X[encod_columns])

In [150]:
X = encoder.transform(X[encod_columns])

In [151]:
log_reg = LogisticRegression(n_jobs=-1)
rfc = RandomForestClassifier(n_jobs=-1)

In [152]:
log_reg_cvs = cross_val_score(log_reg, X, y, cv=4, scoring='roc_auc')

In [153]:
log_reg_cvs.mean()

0.6853214935570648

In [154]:
rfc_cvs = cross_val_score(rfc, X, y, cv=4, scoring='roc_auc')

In [155]:
rfc_cvs.mean()

0.7503522571539856

In [156]:
rfc_model = rfc.fit(X, y)

In [157]:
test_df = pd.read_csv('test_data', low_memory=False)

In [158]:
test_df = test_df.copy()
test_df['width'] = pd.to_numeric(test_df['device_screen_resolution'].str.split('x').str[0])
test_df['height'] = pd.to_numeric(test_df['device_screen_resolution'].str.split('x').str[1])

test_df['datetime'] = pd.to_datetime(test_df['visit_date'], format='%Y-%m-%d') + \
                 pd.to_timedelta(test_df['visit_time'])
test_df['year'] = test_df['datetime'].dt.year
test_df['month'] = test_df['datetime'].dt.month
test_df['day'] = test_df['datetime'].dt.day
test_df['hour'] = test_df['datetime'].dt.hour
test_df['minute'] = test_df['datetime'].dt.minute

test_df['geo'] = test_df['geo_country'] + '/' + test_df['geo_city']

drop_col = ['session_id', 'client_id', 'device_screen_resolution', 'visit_date', 'visit_time', 'datetime', 'geo_country', 'geo_city']

test_df = test_df.drop(drop_col, axis=1)

In [159]:
x_test = test_df.drop('target', axis=1)
y_test = test_df['target']

In [160]:
x_test[scal_columns] = scaler.transform(x_test[scal_columns])

In [161]:
x_test[hash_columns] = x_test[hash_columns].applymap(lambda x: hashlib.sha256(x.encode('utf-8')).hexdigest())

  x_test[hash_columns] = x_test[hash_columns].applymap(lambda x: hashlib.sha256(x.encode('utf-8')).hexdigest())


In [162]:
x_test = encoder.transform(x_test[encod_columns])

In [163]:
x_test

<75542x1948 sparse matrix of type '<class 'numpy.float64'>'
	with 453178 stored elements in Compressed Sparse Row format>

In [164]:
pred = rfc_model.predict(x_test)

In [165]:
roc_auc = roc_auc_score(y_test, pred)

In [166]:
roc_auc

0.5933914133106541