In [1]:
import numpy as np
import pandas as pd
from properties import *
from sklearn.model_selection import train_test_split
# from sklearn.svm import LinearSVR
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
import joblib
import gc
import os

In [None]:
# Load the cached training frame. This cell has no raw-text fallback, so a missing
# cache must fail loudly here instead of leaving `training` as None and crashing
# in a later cell with an unrelated "NoneType is not subscriptable" error.
training_path = PATH_DATA + 'training.pkl'
if not os.path.exists(training_path):
    raise FileNotFoundError('training cache not found: ' + training_path)
# NOTE: unpickling can execute arbitrary code; only load caches produced locally.
training = pd.read_pickle(training_path)

# Optional down-sampling for quicker experiments (disabled):
# training = training.sample(n=50000000, random_state=42, axis=0)

# joblib.dump(training, './data/training_sampling_5000,0000.pkl')

In [None]:
def scoreClickAUC(num_clicks, num_impressions, predicted_ctr, verbose=True):
    """
    Calculates the area under the ROC curve (AUC) for click rates

    Instances are visited in order of descending predicted CTR. Instances that
    share the same predicted CTR are treated as one bucket (a tie) and
    contribute a trapezoid to the area.

    Parameters
    ----------
    num_clicks : a list containing the number of clicks

    num_impressions : a list containing the number of impressions

    predicted_ctr : a list containing the predicted click-through rates

    verbose : if True (the default) print 'Start eval:' followed by a progress
        counter every 10000 instances; pass False to silence the output

    Returns
    -------
    auc : the area under the ROC curve (AUC) for click rates, or NaN when the
        AUC is undefined (no instances, no clicks at all, or no non-clicked
        impressions at all)
    """
    total = len(predicted_ctr)
    if total == 0:
        # Nothing to rank; previously this raised IndexError on i_sorted[0].
        return float('nan')

    i_sorted = sorted(range(total), key=lambda i: predicted_ctr[i],
                      reverse=True)
    auc_temp = 0.0
    click_sum = 0.0
    old_click_sum = 0.0
    no_click = 0.0
    no_click_sum = 0.0

    # treat all instances with the same predicted_ctr as coming from the
    # same bucket; the sentinel differs from the first value so that the
    # first instance always opens a bucket
    last_ctr = predicted_ctr[i_sorted[0]] + 1.0

    if verbose:
        print('Start eval:')
    for rank, idx in enumerate(i_sorted):
        if verbose and rank % 10000 == 0:
            print(rank)
        ctr = predicted_ctr[idx]
        if last_ctr != ctr:
            # close the previous bucket: trapezoid between the click totals
            # before and after it, as wide as its non-clicked impressions
            auc_temp += (click_sum + old_click_sum) * no_click / 2.0
            old_click_sum = click_sum
            no_click = 0.0
            last_ctr = ctr
        misses = num_impressions[idx] - num_clicks[idx]
        no_click += misses
        no_click_sum += misses
        click_sum += num_clicks[idx]
    # close the last bucket
    auc_temp += (click_sum + old_click_sum) * no_click / 2.0

    denominator = click_sum * no_click_sum
    if denominator == 0:
        # Only one class present: the AUC is undefined (previously a
        # ZeroDivisionError for plain Python numbers).
        return float('nan')
    return auc_temp / denominator

In [None]:
# training = joblib.load('./data/training_sampling_2000,0000.pkl')

In [None]:
# 0.25左右是未知用户
len(training[training['userid'] == 0])

In [None]:
len(training[training['userid'] > 0])

In [None]:
pCTR_Ad = training.groupby(['adid']).agg({'click': np.sum, 'impression': np.sum}).reset_index()

pCTR_Ad['pCTR_Ad'] = (pCTR_Ad['click'] + 0.05 * 75) / (pCTR_Ad['impression'] + 75)
pCTR_Ad = pCTR_Ad.drop(['click', 'impression'], axis=1)

training = training.merge(pCTR_Ad, on='adid', how='left')

In [None]:
pCTR_Advertiser = training.groupby(['advertiserid']).agg({'click': np.sum, 'impression': np.sum}).reset_index()

pCTR_Advertiser['pCTR_Advertiser'] = (pCTR_Advertiser['click'] + 0.05 * 75) / (pCTR_Advertiser['impression'] + 75)
pCTR_Advertiser = pCTR_Advertiser.drop(['click', 'impression'], axis=1)

training = training.merge(pCTR_Advertiser, on='advertiserid', how='left')

In [None]:
pCTR_Query = training.groupby(['queryid']).agg({'click': np.sum, 'impression': np.sum}).reset_index()

pCTR_Query['pCTR_Query'] = (pCTR_Query['click'] + 0.05 * 75) / (pCTR_Query['impression'] + 75)
pCTR_Query = pCTR_Query.drop(['click', 'impression'], axis=1)

training = training.merge(pCTR_Query, on='queryid', how='left')

In [None]:
pCTR_Title = training.groupby(['titleid']).agg({'click': np.sum, 'impression': np.sum}).reset_index()

pCTR_Title['pCTR_Title'] = (pCTR_Title['click'] + 0.05 * 75) / (pCTR_Title['impression'] + 75)
pCTR_Title = pCTR_Title.drop(['click', 'impression'], axis=1)

training = training.merge(pCTR_Title, on='titleid', how='left')

In [None]:
pCTR_Description = training.groupby(['descriptionid']).agg({'click': np.sum, 'impression': np.sum}).reset_index()

pCTR_Description['pCTR_Description'] = (pCTR_Description['click'] + 0.05 * 75) / (pCTR_Description['impression'] + 75)
pCTR_Description = pCTR_Description.drop(['click', 'impression'], axis=1)

training = training.merge(pCTR_Description, on='descriptionid', how='left')

In [None]:
pCTR_User = training.groupby(['userid']).agg({'click': np.sum, 'impression': np.sum}).reset_index()

pCTR_User['pCTR_User'] = (pCTR_User['click'] + 0.05 * 75) / (pCTR_User['impression'] + 75)
pCTR_User = pCTR_User.drop(['click', 'impression'], axis=1)

training = training.merge(pCTR_User, on='userid', how='left')

In [None]:
pCTR_Keyword = training.groupby(['keywordid']).agg({'click': np.sum, 'impression': np.sum}).reset_index()

pCTR_Keyword['pCTR_Keyword'] = (pCTR_Keyword['click'] + 0.05 * 75) / (pCTR_Keyword['impression'] + 75)
pCTR_Keyword = pCTR_Keyword.drop(['click', 'impression'], axis=1)

training = training.merge(pCTR_Keyword, on='keywordid', how='left')

In [None]:
pCTR_Url = training.groupby(['displayurl']).agg({'click': np.sum, 'impression': np.sum}).reset_index()

pCTR_Url['pCTR_Url'] = (pCTR_Url['click'] + 0.05 * 75) / (pCTR_Url['impression'] + 75)
pCTR_Url = pCTR_Url.drop(['click', 'impression'], axis=1)

training = training.merge(pCTR_Url, on='displayurl', how='left')

In [None]:
training['ctr'] = (training['click'] + 0.05 * 75) / (training['impression'] + 75)

In [None]:
# ((training['depth'] - training['position']) / training['depth']).value_counts()

In [None]:
training['RPosition'] = (training['depth'] - training['position']) / training['depth']

In [None]:
pCTR_RPosition = training.groupby(['RPosition']).agg({'click': np.sum, 'impression': np.sum}).reset_index()

pCTR_RPosition['pCTR_RPosition'] = (pCTR_RPosition['click'] + 0.05 * 75) / (pCTR_RPosition['impression'] + 75)
pCTR_RPosition = pCTR_RPosition.drop(['click', 'impression'], axis=1)

training = training.merge(pCTR_RPosition, on='RPosition', how='left')

In [None]:
# (training.shape[0] - training.count()) / training.shape[0]

In [None]:
# User profile table (userid, gender, age): read the pickle cache when it exists,
# otherwise parse the raw tab-separated file once and write the cache.
profile_cache = PATH_DATA + 'userid_profile.pkl'
if not os.path.exists(profile_cache):
    userid_profile = pd.read_csv(PATH_ABOVE + '/track2/' + 'userid_profile.txt', header=None, sep='\t', nrows=None)
    userid_profile.columns = ['userid', 'gender', 'age']
    userid_profile.to_pickle(profile_cache)
else:
    userid_profile = pd.read_pickle(profile_cache)

In [None]:
training = training.merge(userid_profile, on='userid', how='left')

In [None]:
# gender: 3对应NAN，age: 0对应NAN
values = {'gender': 3, 'age': 0}
training = training.fillna(value=values)

In [None]:
training_not_zero = training[training['userid'] != 0]

In [None]:
training_zero = training[training['userid'] == 0]

In [None]:
# Smoothed CTR per gender and per age, estimated on the rows with userid != 0 only
# (training_not_zero) and joined back onto those rows as features.
# The aggregation uses the string 'sum': passing np.sum to a groupby aggregation is
# deprecated and emits a FutureWarning on pandas >= 2.1.
pCTR_Gender = training_not_zero.groupby(['gender']).agg({'click': 'sum', 'impression': 'sum'}).reset_index()

# Same additive smoothing as the other pCTR features: prior CTR 0.05 weighted by 75 impressions.
pCTR_Gender['pCTR_Gender'] = (pCTR_Gender['click'] + 0.05 * 75) / (pCTR_Gender['impression'] + 75)
pCTR_Gender = pCTR_Gender.drop(['click', 'impression'], axis=1)

training_not_zero = training_not_zero.merge(pCTR_Gender, on='gender', how='left')


pCTR_Age = training_not_zero.groupby(['age']).agg({'click': 'sum', 'impression': 'sum'}).reset_index()

pCTR_Age['pCTR_Age'] = (pCTR_Age['click'] + 0.05 * 75) / (pCTR_Age['impression'] + 75)
pCTR_Age = pCTR_Age.drop(['click', 'impression'], axis=1)

training_not_zero = training_not_zero.merge(pCTR_Age, on='age', how='left')

In [None]:
# (training_zero.shape[0] - training_zero.count()) / training_zero.shape[0]

In [None]:
# Test instances: read the pickle cache when it exists, otherwise parse the raw
# tab-separated file once, name its ten columns and write the cache.
test_cache = PATH_DATA + 'test.pkl'
if not os.path.exists(test_cache):
    test = pd.read_csv(PATH_ABOVE + 'test.txt', header=None, sep='\t', nrows=None)
    test.columns = [
        'displayurl', 'adid', 'advertiserid', 'depth', 'position',
        'queryid', 'keywordid', 'titleid', 'descriptionid', 'userid',
    ]
    test.to_pickle(test_cache)
else:
    test = pd.read_pickle(test_cache)

In [None]:
# Positional row id, used later to restore the original row order after the frame is
# split by userid. np.arange assigns by position (independent of the frame's index)
# and avoids building a Python list with one int object per test row.
test['i_id'] = np.arange(len(test), dtype='int64')

In [None]:
solution = pd.read_csv('./data/KDD_Track2_solution.csv')
num_clicks = solution['clicks']
num_impressions = solution['impressions']

In [None]:
test['click'] = pd.Series(num_clicks)
test['impression'] = pd.Series(num_impressions)

In [None]:
test['RPosition'] = (test['depth'] - test['position']) / test['depth']

In [None]:
test = test.merge(userid_profile, on='userid', how='left')

In [None]:
values = {'gender': 3, 'age': 0}
test = test.fillna(value=values)

In [None]:
test_not_zero = test[test['userid'] != 0]
test_zero = test[test['userid'] == 0]

In [None]:
# X = training[['pCTR_Ad', 'pCTR_Advertiser', 'pCTR_Query', 'pCTR_Title', 'pCTR_Description', 'pCTR_User', 'pCTR_Keyword', 'pCTR_Url', 'pCTR_RPosition', 'pCTR_Gender', 'pCTR_Age']]
# y = training[['click', 'impression', 'ctr']]
test_not_zero = test_not_zero[['i_id', 'click', 'impression', 'adid', 'advertiserid', 'queryid', 'titleid', 'descriptionid', 'userid', 'keywordid', 'displayurl', 'RPosition', 'gender', 'age']]

In [None]:
test_zero = test_zero[['i_id', 'click', 'impression', 'adid', 'advertiserid', 'queryid', 'titleid', 'descriptionid', 'keywordid', 'displayurl', 'RPosition']]

In [None]:
# Join every training-derived pCTR lookup table onto the test rows with a user id.
# Left joins keep all test rows; keys never seen in training produce NaN features.
for lookup, key in [
    (pCTR_Ad, 'adid'),
    (pCTR_Advertiser, 'advertiserid'),
    (pCTR_Query, 'queryid'),
    (pCTR_Title, 'titleid'),
    (pCTR_Description, 'descriptionid'),
    (pCTR_User, 'userid'),
    (pCTR_Keyword, 'keywordid'),
    (pCTR_Url, 'displayurl'),
    (pCTR_RPosition, 'RPosition'),
    (pCTR_Gender, 'gender'),
    (pCTR_Age, 'age'),
]:
    test_not_zero = test_not_zero.merge(lookup, on=key, how='left')

In [None]:
# Join the pCTR lookup tables onto the test rows with userid == 0.
# pCTR_User, pCTR_Gender and pCTR_Age are not joined here (those merges are disabled
# for these rows, which no longer carry the userid/gender/age columns).
for lookup, key in [
    (pCTR_Ad, 'adid'),
    (pCTR_Advertiser, 'advertiserid'),
    (pCTR_Query, 'queryid'),
    (pCTR_Title, 'titleid'),
    (pCTR_Description, 'descriptionid'),
    (pCTR_Keyword, 'keywordid'),
    (pCTR_Url, 'displayurl'),
    (pCTR_RPosition, 'RPosition'),
]:
    test_zero = test_zero.merge(lookup, on=key, how='left')

In [None]:
test_y = test_not_zero[['i_id', 'click', 'impression']]

In [None]:
test_y_zero = test_zero[['i_id', 'click', 'impression']]

In [None]:
# Replace pCTR values that are missing after the left joins (keys never seen in
# training) with the mean of the corresponding lookup table.
# pCTR_RPosition, pCTR_Gender and pCTR_Age are filled as well: a single NaN left in
# any feature column makes LinearRegression.predict raise on the whole matrix.
values = {
    'pCTR_Ad': pCTR_Ad['pCTR_Ad'].mean(),
    'pCTR_Advertiser': pCTR_Advertiser['pCTR_Advertiser'].mean(),
    'pCTR_Query': pCTR_Query['pCTR_Query'].mean(),
    'pCTR_Title': pCTR_Title['pCTR_Title'].mean(),
    'pCTR_Description': pCTR_Description['pCTR_Description'].mean(),
    'pCTR_User': pCTR_User['pCTR_User'].mean(),
    'pCTR_Keyword': pCTR_Keyword['pCTR_Keyword'].mean(),
    'pCTR_Url': pCTR_Url['pCTR_Url'].mean(),
    'pCTR_RPosition': pCTR_RPosition['pCTR_RPosition'].mean(),
    'pCTR_Gender': pCTR_Gender['pCTR_Gender'].mean(),
    'pCTR_Age': pCTR_Age['pCTR_Age'].mean(),
}
test_fillna = test_not_zero.fillna(value=values)

In [None]:
# Replace pCTR values that are missing after the left joins (keys never seen in
# training) with the mean of the corresponding lookup table.
# pCTR_RPosition is filled as well: a single NaN left in any feature column makes
# LinearRegression.predict raise on the whole matrix.
values = {
    'pCTR_Ad': pCTR_Ad['pCTR_Ad'].mean(),
    'pCTR_Advertiser': pCTR_Advertiser['pCTR_Advertiser'].mean(),
    'pCTR_Query': pCTR_Query['pCTR_Query'].mean(),
    'pCTR_Title': pCTR_Title['pCTR_Title'].mean(),
    'pCTR_Description': pCTR_Description['pCTR_Description'].mean(),
    'pCTR_Keyword': pCTR_Keyword['pCTR_Keyword'].mean(),
    'pCTR_Url': pCTR_Url['pCTR_Url'].mean(),
    'pCTR_RPosition': pCTR_RPosition['pCTR_RPosition'].mean(),
}
test_fillna_zero = test_zero.fillna(value=values)

In [None]:
# Drop the large per-key lookup tables now that they have been joined and averaged,
# then ask the garbage collector to reclaim the memory.
del (
    pCTR_Ad, pCTR_Advertiser, pCTR_Query, pCTR_Title,
    pCTR_Description, pCTR_User, pCTR_Keyword, pCTR_Url,
)
gc.collect()

In [None]:
test_fillna_feat = test_fillna[['pCTR_Ad', 'pCTR_Advertiser', 'pCTR_Query', 'pCTR_Title', 'pCTR_Description', 'pCTR_User', 
                                        'pCTR_Keyword', 'pCTR_Url', 'pCTR_RPosition', 'pCTR_Gender', 'pCTR_Age']]

In [None]:
test_fillna_feat_zero = test_fillna_zero[['pCTR_Ad', 'pCTR_Advertiser', 'pCTR_Query', 'pCTR_Title', 'pCTR_Description', 
                                        'pCTR_Keyword', 'pCTR_Url', 'pCTR_RPosition']]

In [None]:
(test_fillna_feat.shape[0] - test_fillna_feat.count()) / test_fillna_feat.shape[0]

In [None]:
(test_fillna_feat_zero.shape[0] - test_fillna_feat_zero.count()) / test_fillna_feat_zero.shape[0]

In [None]:
X = training_not_zero[['pCTR_Ad', 'pCTR_Advertiser', 'pCTR_Query', 'pCTR_Title', 'pCTR_Description', 'pCTR_User', 'pCTR_Keyword', 'pCTR_Url', 'pCTR_RPosition', 'pCTR_Gender', 'pCTR_Age']]
y = training_not_zero[['click', 'impression', 'ctr']]

In [None]:
X_zero = training_zero[['pCTR_Ad', 'pCTR_Advertiser', 'pCTR_Query', 'pCTR_Title', 'pCTR_Description', 'pCTR_Keyword', 'pCTR_Url', 'pCTR_RPosition']]
y_zero = training_zero[['click', 'impression', 'ctr']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
X_train_zero, X_test_zero, y_train_zero, y_test_zero = train_test_split(X_zero, y_zero, test_size=0.2, random_state=42)

In [None]:
del X
del y
del X_zero
del y_zero
del training
gc.collect()

In [None]:
# len(X_train), len(X_test)

In [None]:
# len(X_train_zero), len(X_test_zero)

In [None]:
clf = LinearRegression(n_jobs=-1)

clf_zero = LinearRegression(n_jobs=-1)

In [None]:
clf.fit(X_train, y_train['ctr'])

In [None]:
clf_zero.fit(X_train_zero, y_train_zero['ctr'])

In [None]:
prob_pre = clf.predict(X_test)

# num_clicks, num_impressions = read_solution_file('./data/KDD_Track2_solution.csv')

num_clicks = list(y_test['click'])
num_impressions = list(y_test['impression'])

auc = scoreClickAUC(num_clicks, num_impressions, prob_pre)
print("AUC  : %f" % auc)
del prob_pre
gc.collect()

In [None]:
prob_pre_zero = clf_zero.predict(X_test_zero)

# num_clicks, num_impressions = read_solution_file('./data/KDD_Track2_solution.csv')

num_clicks_zero = list(y_test_zero['click'])
num_impressions_zero = list(y_test_zero['impression'])

auc_zero = scoreClickAUC(num_clicks_zero, num_impressions_zero, prob_pre_zero)
print("AUC  : %f" % auc_zero)
del prob_pre_zero
gc.collect()

In [None]:
prob_pre_test = clf.predict(test_fillna_feat)

num_clicks = list(test_y['click'])
num_impressions = list(test_y['impression'])

auc = scoreClickAUC(num_clicks, num_impressions, prob_pre_test)
print("AUC  : %f" % auc)

In [None]:
prob_pre_test_zero = clf_zero.predict(test_fillna_feat_zero)

num_clicks_zero = list(test_y_zero['click'])
num_impressions_zero = list(test_y_zero['impression'])

auc_zero = scoreClickAUC(num_clicks_zero, num_impressions_zero, prob_pre_test_zero)
print("AUC  : %f" % auc_zero)

In [None]:
df_pro = pd.DataFrame({'i_id': list(test_y['i_id']), 'prob': list(prob_pre_test)})

In [None]:
df_pro_zero = pd.DataFrame({'i_id': list(test_y_zero['i_id']), 'prob': list(prob_pre_test_zero)})

In [None]:
df_out = pd.concat([df_pro, df_pro_zero])

In [None]:
out = df_out.sort_values(by=['i_id'])

In [None]:
link_df = (pd.concat([test_not_zero[['i_id', 'click', 'impression']], test_zero[['i_id', 'click', 'impression']]])).merge(out, on='i_id')

In [None]:
# (link_df.shape[0] - link_df.count()) / link_df.shape[0]

In [None]:
link_df = link_df.sort_values(by=['i_id'])

In [None]:
# AUC over the complete test set, combining the predictions of both models.
num_clicks = link_df['click'].tolist()
num_impressions = link_df['impression'].tolist()

auc = scoreClickAUC(num_clicks, num_impressions, link_df['prob'].tolist())
print("AUC  : %f" % auc)

In [None]:
# joblib.dump does not create missing directories, so make sure ./output exists
# before writing; otherwise the final result of the whole run is lost to an error.
os.makedirs('./output', exist_ok=True)
joblib.dump(link_df, './output/linearR_smoothing_sample_all.pkl')

In [None]:
# link_df

In [None]:
# prob_pre = clf.predict_proba(test)

# num_clicks, num_impressions = read_solution_file('./data/KDD_Track2_solution.csv')

# auc = scoreClickAUC(num_clicks, num_impressions, prob_pre[:, 1])
# print("AUC  : %f" % auc)