In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline

# Посмотрим на тренировочную выборку и разметим её

In [None]:
# Stream the training file in chunks of `chunksize` rows and keep a 1% random
# sample of every chunk, so the full file never has to be held at once.
chunksize = 10 ** 7
num_of_chunk = 0
sampled_chunks = []

for chunk in pd.read_csv('/kaggle/input/avazu-ctr-prediction/train.gz', chunksize=chunksize):
    num_of_chunk += 1
    # Seed the sampler (a different but fixed seed per chunk) so that
    # Restart & Run All always produces the same training sample.
    sampled_chunks.append(chunk.sample(frac=0.01, random_state=42 + num_of_chunk))
    print(f'NUMBER {num_of_chunk} is done.')

# Concatenate once at the end instead of growing the frame inside the loop
# (avoids repeated copying and concatenation with an empty DataFrame).
train = (
    pd.concat(sampled_chunks, axis=0, ignore_index=True)
    if sampled_chunks
    else pd.DataFrame()
)

In [None]:
# Preview the sampled training frame.
train

In [None]:
# List the column names present in the sample.
train.columns

In [None]:
# Columns removed from the training frame before any further processing.
dropped_columns = ['id', 'site_id', 'app_id', 'device_id', 'device_ip', 'device_model']
train = train.drop(columns=dropped_columns)

In [None]:
# Inspect the frame after the column drop.
train

In [None]:
# Parse `hour` using the format '%y%m%d%H' (two-digit year, month, day, hour)
# so the column holds timestamps.
train['hour'] = pd.to_datetime(train['hour'], format = '%y%m%d%H')

In [None]:
# Check the parsed timestamps.
train

In [None]:
# Pie chart of how the sampled rows split across the values of `click`.
sns.set(rc={'figure.figsize': (10, 10)})
colors = sns.color_palette('pastel')[0:5]

click_column = train['click']
labels = sorted(click_column.unique())
# One count per label, in the same order as `labels`.
dt = np.array([(click_column == value).sum() for value in labels])

plt.pie(dt, labels=labels, colors=colors, autopct='%.0f%%')
plt.title('click')
plt.show()

In [None]:
# Use a wider figure for the time-series plot.
sns.set(rc={'figure.figsize': (15, 10)})

# Plot `click` against the parsed `hour` timestamp.
# NOTE(review): with seaborn's defaults this presumably shows the mean of
# `click` per timestamp with a confidence band — confirm that is the intent.
sns.lineplot(data=train, x='hour', y='click')

In [None]:
# Derive integer calendar features from the parsed `hour` timestamp:
#   hour -> hour of day (overwrites the timestamp column in place)
#   day  -> day of month (appended as a new column)
# The vectorised `.dt` accessor replaces a per-row Python loop that
# re-parsed the string form of every timestamp.
hour_of_day = train['hour'].dt.hour.astype('int64')
day_of_month = train['hour'].dt.day.astype('int64')

# Both series are computed before `hour` is overwritten.
train['hour'] = hour_of_day
train['day'] = day_of_month

In [None]:
# Inspect the frame with the new `hour` / `day` columns.
train

In [None]:
# Count missing values per column.
train.isnull().sum()

In [None]:
# Column dtypes and memory usage.
train.info()

In [None]:
# Replace each of these columns with integer category codes. The codes are
# derived from the values present in `train` itself.
for column in ('site_category', 'app_category', 'app_domain', 'site_domain'):
    train[column] = train[column].astype('category').cat.codes

In [None]:
# Inspect the frame after categorical encoding.
train

In [None]:
# Summary statistics of all columns.
train.describe()

In [None]:
# Confirm the dtypes after encoding.
train.info()

In [None]:
# Large canvas so the annotated cells of the heatmap stay readable.
sns.set(rc={'figure.figsize': (20, 15)})

# Pairwise correlation of every column of `train`, drawn as an annotated heatmap.
corr_matrix = train.corr()
sns.heatmap(data=corr_matrix, annot=True)

In [None]:
import math
from sklearn import preprocessing

# Standardise every feature column (everything except the target `click`).
scaled = preprocessing.StandardScaler()

# NOTE(review): the scaler is fitted on the whole sample before the
# train/validation split made further down, so validation rows contribute
# to the scaling statistics — consider fitting on the training part only.
# NOTE(review): the second argument (the target) is presumably ignored by
# StandardScaler — confirm and drop it if so.
scaled.fit(train.drop(columns=['click']), train['click'])

In [None]:
# Apply the fitted scaling to the same feature columns.
df_scaled = scaled.transform(train.drop(columns=['click']))

In [None]:
# Inspect the scaled feature matrix.
df_scaled

In [None]:
# Rows x features of the scaled matrix.
df_scaled.shape

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Reduce the scaled features to 5 principal components and report the
# share of variance explained by each component.
pca = PCA(n_components=5)
pca.fit(df_scaled)
print(pca.explained_variance_ratio_)

In [None]:
# Component vectors expressed in the scaled feature space.
print(pca.components_)

In [None]:
# Project the scaled training features onto the fitted components.
zzz = pca.transform(df_scaled)

In [None]:
# Inspect the projected matrix.
zzz

In [None]:
# Rows x components of the projected matrix.
zzz.shape

In [None]:
pd.DataFrame(pca.components_, columns=train.columns[1:])

In [None]:
pd.Series(pca.noise_variance_, train.columns[1:])

In [None]:
from sklearn.model_selection import train_test_split
# Target vector.
Y = train['click']

# Hold out 10% of the PCA-projected rows for validation; the fixed
# random_state keeps the split reproducible.
X_train, X_valid, Y_train, Y_valid = train_test_split(zzz, Y, test_size=0.1, random_state=42)

# Обучим модель

In [None]:
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier

In [None]:
sgd = SGDClassifier(
    loss='log_loss',
    penalty='l2',
    n_iter_no_change=80,
    shuffle=True,
    max_iter=10000,
    warm_start=True
)

In [None]:
sgd.partial_fit(X_train, Y_train, classes=np.unique(Y_train))

In [None]:
cat = CatBoostClassifier(
    iterations=5000,
    verbose=1000,
    learning_rate=0.005,
    l2_leaf_reg=0.03,
    depth=7,
    leaf_estimation_method='Newton',
    leaf_estimation_iterations=65,
    use_best_model=True
)

In [None]:
cat.fit(X_train, Y_train, eval_set=(X_valid, Y_valid))

In [None]:
from sklearn.metrics import log_loss


preds_val_sgd = sgd.predict_proba(X_valid)
preds_val_cat = cat.predict_proba(X_valid)

In [None]:
preds_val_sgd = np.array(list(map(lambda x: x[1], preds_val_sgd)))
preds_val_cat = np.array(list(map(lambda x: x[1], preds_val_cat)))

In [None]:
# Inspect the SGD validation probabilities.
preds_val_sgd

In [None]:
# Inspect the CatBoost validation probabilities.
preds_val_cat

In [None]:
# Validation log-loss of the SGD model.
log_loss(y_pred=preds_val_sgd, y_true=Y_valid)

In [None]:
# Validation log-loss of the CatBoost model.
log_loss(y_pred=preds_val_cat, y_true=Y_valid)

In [None]:
pred_val = [(i + j) / 2 for i, j in zip(preds_val_sgd, preds_val_cat)]

In [None]:
log_loss(y_pred=pred_val, y_true=Y_valid)

# То же самое сделаем с тестовой выборкой

In [None]:
# Load the test set for feature building. Only `id` is forced to string;
# every other column keeps the parser's inferred dtype, matching how the
# training file is read, so numeric features reach the scaler as numbers
# rather than as strings.
test_df = pd.read_csv("/kaggle/input/avazu-ctr-prediction/test.gz", header=0, dtype={'id': str})

In [None]:
# Apply the same steps the training frame went through: drop the unused
# columns, parse `hour`, then derive integer hour-of-day and day-of-month.
test_df = test_df.drop(columns=['id', 'site_id', 'app_id', 'device_id', 'device_ip', 'device_model'])

test_df['hour'] = pd.to_datetime(test_df['hour'], format = '%y%m%d%H')

# The vectorised `.dt` accessor replaces a per-row Python loop over the
# string form of every timestamp. Both series are computed before `hour`
# is overwritten.
hour_of_day = test_df['hour'].dt.hour.astype('int64')
day_of_month = test_df['hour'].dt.day.astype('int64')

test_df['hour'] = hour_of_day   # overwrites the timestamp column in place
test_df['day'] = day_of_month   # appended as the last column

In [None]:
# Inspect the test frame with the new `hour` / `day` columns.
test_df

In [None]:
# Count missing values per column.
test_df.isnull().sum()

In [None]:
# Column dtypes and memory usage.
test_df.info()

In [None]:
# Replace each of these columns with integer category codes derived from
# the values present in `test_df`.
# NOTE(review): the codes are computed from the test frame's own set of
# categories, so the same string can receive a different integer than it
# did in `train` — the encoding should reuse the train-time mapping.
for column in ('site_category', 'app_category', 'app_domain', 'site_domain'):
    test_df[column] = test_df[column].astype('category').cat.codes

In [None]:
# Inspect the test frame after categorical encoding.
test_df

# Предскажем тест

In [None]:
# Scale the test features with the scaler fitted on the training sample.
test_df_scaled = scaled.transform(test_df)

In [None]:
# Inspect the scaled test matrix.
test_df_scaled

In [None]:
# Project the scaled test features onto the PCA components fitted on train.
test_df_zzz = pca.transform(test_df_scaled)

In [None]:
# Inspect the projected test matrix.
test_df_zzz

In [None]:
predict_test_sgd = sgd.predict_proba(test_df_zzz)
predict_test_cat = cat.predict_proba(test_df_zzz)

In [None]:
predict_test_sgd = np.array(list(map(lambda x: x[1], predict_test_sgd)))
predict_test_cat = np.array(list(map(lambda x: x[1], predict_test_cat)))

In [None]:
predict_test = np.array([(i + j) / 2 for i, j in zip(predict_test_sgd, predict_test_cat)])

In [None]:
len([i for i in predict_test if i >= 0.5])

In [None]:
len([i for i in predict_test if i <= 0.5])

# Сохраним ответ

In [None]:
# Re-read the raw test file with every column as a string; this frame
# supplies the `id` column for the submission.
submit_df = pd.read_csv("/kaggle/input/avazu-ctr-prediction/test.gz", header=0, dtype=str)

In [None]:
# Inspect the raw submission frame.
submit_df

In [None]:
submit_df.columns

In [None]:
# NOTE(review): this replaces the preprocessed `test_df` with the raw file
# contents; nothing visible below uses it other than the preview in the next
# cell, so the extra read looks redundant — confirm and remove.
test_df = pd.read_csv("/kaggle/input/avazu-ctr-prediction/test.gz", header=0, dtype=str)

In [None]:
test_df

In [None]:
# Attach the blended probabilities and write the two submission columns
# (`id`, `click`) to CSV without the index.
submit_df["click"] = predict_test
submit_df[['id', 'click']].to_csv("submission_difm.csv", index=False)

In [None]:
# Preview what was written.
submit_df[['id', 'click']]