# Click-Through Rate Prediction
### We use data of mobile ads to predict click-through rate
### data source: https://www.kaggle.com/c/avazu-ctr-prediction

In [None]:
# Install a pip package in the current Jupyter kernel example:
#import sys
#!{sys.executable} -m pip install pandas

In [None]:
import pandas as pd
pd.set_option('display.max_rows', 100)

import numpy as np
np.random.seed(0)
import random
random.seed(0)

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
## orig_train_df = spark.read.option("header", "true").option("inferSchema","true").csv("./data/CTR_train.gz")
## orig_train_df.printSchema

In [None]:
## orig_train_df.filter(orig_train_df.click == 1).count()
## result: 6865066

In [None]:
## orig_train_df.filter(orig_train_df.click == 0).count()
## result: 33563901

In [None]:
###df.agg(countDistinct("some_column"))

In [None]:
'''
from pyspark.sql.functions import udf
import pyspark.sql.types as SparkType

#parse_date = lambda val : pd.datetime.strptime(val, '%y%m%d%H')

get_hour = udf(lambda s: int(str(s)[-2]+str(s)[-1]), SparkType.IntegerType())
get_id = udf(lambda s: int(s), SparkType.IntegerType())

## extract hour info:
train_df = orig_train_df.withColumn('int_hour', get_hour('hour')).drop('hour')
## convert id to integer:
train_df = train_df.withColumn("int_id", train_df["id"].cast(SparkType.IntegerType())).drop('id')


## a list of column names
ordered_columns = ['int_id', 'int_hour', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
                   'app_id', 'app_domain', 'app_category',
                   'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
                   'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'click']
train_df = train_df.select(*ordered_columns)

train_df.head(1)

## result:
# [Row(int_id=2096162817, int_hour=0, C1=1005, banner_pos=0, site_id='1fbe01fe', site_domain='f3845767',
# site_category='28905ebd', app_id='ecad2386', app_domain='7801e8d9', app_category='07d7df22',
# device_id='a99f214a', device_ip='ddd2926e', device_model='44956a24', device_type=1, device_conn_type=2,
# C14=15706, C15=320, C16=50, C17=1722, C18=0, C19=35, C20=-1, C21=79, click=0)]
'''

### test.rdd.flatMap(lambda x: x).histogram(20)

### df.columns
### result_pdf = df.select("*").toPandas()

In [None]:
#click_data = train_df.select("click").toPandas()
#click_data.value_counts().plot(kind='bar')

### Sample 2 million data points from the original ~6G dataset:

In [None]:

## Row 0 of the CSV is the header line, so the data records are rows 1..n_train.
n_train = 40428967  # total number of records in the training set
n_sample = 2000000  # number of records to keep in the sample

## Pick the rows to skip so that exactly n_sample records remain.
## The range must end at n_train + 1: with range(1, n_train) the last record
## could never be skipped and would therefore always end up in the sample.
skipped_rows = sorted(random.sample(range(1, n_train + 1), n_train - n_sample))

## define data type for training and test sets:
types_train = {
    'id': np.dtype(int),
    'click': np.dtype(int),
    'hour': np.dtype(int),
    'C1': np.dtype(int),
    'banner_pos': np.dtype(int),
    'site_id': np.dtype(str),
    'site_domain': np.dtype(str),
    'site_category': np.dtype(str),
    'app_id': np.dtype(str),
    'app_domain': np.dtype(str),
    'app_category': np.dtype(str),
    'device_id': np.dtype(str),
    'device_ip': np.dtype(str),
    'device_model': np.dtype(str),
    'device_type': np.dtype(int),
    'device_conn_type': np.dtype(int),
    'C14': np.dtype(int),
    'C15': np.dtype(int),
    'C16': np.dtype(int),
    'C17': np.dtype(int),
    'C18': np.dtype(int),
    'C19': np.dtype(int),
    'C20': np.dtype(int),
    'C21': np.dtype(int),
}

## the test set has the same columns except for the label
types_test = {key: val for (key, val) in types_train.items() if key != 'click'}

### Load data:

In [None]:
import gzip
from datetime import datetime

## 'hour' is stored as YYMMDDHH. `pd.datetime` is no longer available in current
## pandas, so the parser is built on the standard-library datetime instead.
get_date = lambda s : datetime.strptime(s, '%y%m%d%H')

## The `date_parser` argument of read_csv is deprecated in recent pandas, so the
## column is read as text and converted afterwards with an explicit format.
with gzip.open('./data/CTR_train.gz') as f:
    train_df = pd.read_csv(f, dtype = {**types_train, 'hour': str},
                           skiprows = skipped_rows)

train_df['hour'] = pd.to_datetime(train_df['hour'], format='%y%m%d%H')

In [None]:
train_df.head(2)

In [None]:
train_df.dtypes

In [None]:
# Install a pip package in the current Jupyter kernel
#import sys
#!{sys.executable} -m pip install seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.countplot(x='click',data=train_df)
plt.show()

In [None]:
## Absolute count of each click label, then the same counts divided by the row total.
print(train_df.click.value_counts())
print(train_df.click.value_counts() / train_df.shape[0])

In [None]:
## Line plot of the summed 'click' column for every distinct 'hour' value.
(
    train_df
    .groupby('hour')[['click']]
    .sum()
    .plot(figsize=(12, 6))
)
plt.ylabel('Number of clicks')
plt.title('Number of clicks by hour')

In [None]:
#train_df.dtypes

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
## add features: 'hour_of_day' (integer hour) and 'day_of_week' (weekday name).
## `Timestamp.weekday_name` has been removed from pandas; the vectorised `.dt`
## accessors yield the same values without a Python-level call per row.
train_df['hour_of_day'] = train_df['hour'].dt.hour
train_df['day_of_week'] = train_df['hour'].dt.day_name()

In [None]:
#sns.countplot(x='hour_of_day',data=train_df)

In [None]:
## Stacked bars: number of rows per hour of day, split by click label.
(
    train_df
    .groupby(['hour_of_day', 'click'])
    .size()
    .unstack()
    .plot.bar(stacked=True, title="Hour of the Day", figsize=(12, 6))
)

plt.ylabel('count')
## plt.title replaces the title passed to the plot call above
plt.title('Clicks on hourly impressions vs. clicks');

In [None]:
## Stacked bars: number of rows per weekday, split by click label.
(
    train_df
    .groupby(['day_of_week', 'click'])
    .size()
    .unstack()
    .plot.bar(stacked=True, title="Day of the week", figsize=(12, 6))
)

plt.ylabel('count')
## plt.title replaces the title passed to the plot call above
plt.title('day of week impressions vs. clicks');

In [None]:
## Grouped bars: number of rows per C1 value, one bar per click label.
(
    train_df
    .groupby(['C1', 'click'])
    .size()
    .unstack()
    .plot.bar(title="Value counts for C1", figsize=(12, 6))
)
plt.ylabel('count')
plt.xlabel('C1 domain')

In [None]:
## Grouped bars: number of rows per banner_pos value, one bar per click label.
(
    train_df
    .groupby(['banner_pos', 'click'])
    .size()
    .unstack()
    .plot.bar(title="Value counts for banner_pos", figsize=(12, 6))
)
plt.ylabel('count')
plt.xlabel('banner_pos')

In [None]:
## Grouped bars: number of rows per site_category value, one bar per click label.
(
    train_df
    .groupby(['site_category', 'click'])
    .size()
    .unstack()
    .plot.bar(title="Value counts for site_category", figsize=(12, 6))
)
plt.ylabel('count')
plt.xlabel('site_category')

In [None]:
## Grouped bars: number of rows per app_category value, one bar per click label.
(
    train_df
    .groupby(['app_category', 'click'])
    .size()
    .unstack()
    .plot.bar(title="Value counts for app_category", figsize=(12, 6))
)
plt.ylabel('count')
plt.xlabel('app_category')

In [None]:
## Grouped bars: number of rows per device_type value, one bar per click label.
(
    train_df
    .groupby(['device_type', 'click'])
    .size()
    .unstack()
    .plot.bar(title="Value counts for device_type", figsize=(12, 6))
)
plt.ylabel('count')
plt.xlabel('device_type')

In [None]:
#train_df.groupby(['device_id', 'click']).size().unstack().plot(kind='bar', stacked=True, title="Value counts for device_id", figsize=(12,6))
#plt.ylabel('count')
#plt.xlabel('device_id')

In [None]:
#train_df.groupby(['device_ip', 'click']).size().unstack().plot(kind='bar', stacked=True, title="Value counts for device_ip", figsize=(12,6))
#plt.ylabel('count')
#plt.xlabel('device_ip')

In [None]:
## Grouped bars: number of rows per device_conn_type value, one bar per click label.
(
    train_df
    .groupby(['device_conn_type', 'click'])
    .size()
    .unstack()
    .plot.bar(title="Value counts for device_conn_type", figsize=(12, 6))
)
plt.ylabel('count')
plt.xlabel('device_conn_type')

In [None]:
## One grouped bar chart per listed column: row counts per value, split by click label.
features = ['C15', 'C16', 'C18', 'C19', 'C21']
for feature in features:
    print(feature)
    (
        train_df
        .groupby([feature, 'click'])
        .size()
        .unstack()
        .plot.bar(title='Value counts for '+feature, figsize=(12, 6))
    )
    plt.ylabel("count")
    plt.xlabel(feature)

In [None]:
## Remove the 'id' and 'hour' columns from train_df in place.
train_df.drop(columns=['id', 'hour'], inplace=True)
#train_df.dtypes

In [None]:
columns = list(train_df.columns)

## Print the number of distinct values of every column.
for c, n_unique in train_df.nunique().items():
    print("For column " + c + ", number of unique value: ", n_unique)

In [None]:
## columns excluded from both groups below
to_be_droped = {'click', 'device_id', 'device_ip'}

## columns listed explicitly as "tolerated"
tolerated_cols = {
    'hour_of_day', 'day_of_week',
    'C1', 'banner_pos',
    'site_category', 'app_category',
    'device_type', 'device_conn_type',
    'C15', 'C16', 'C18',
}

## every remaining column of train_df is a "target" column
target_cols = [
    c for c in train_df.columns
    if not (c in tolerated_cols or c in to_be_droped)
]

In [None]:
from collections import OrderedDict

## For every column, collect the set of values ("domain") that is kept as is.
## Tolerated columns keep all observed values; target columns keep only values
## that are frequent enough among the click == 0 or click == 1 rows.

def choose_domain_by_col(df, tolerated_cols, target_cols, thresh0, thresh1):
    """Build the per-column value domains.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``click`` column plus all listed columns.
    tolerated_cols : iterable of str
        Columns whose domain is every value observed in ``df``
        (``value_counts`` ignores missing values).
    target_cols : iterable of str
        Columns whose domain is limited to frequent values plus ``'other'``.
    thresh0, thresh1 : int
        Minimum number of occurrences among the ``click == 0`` /
        ``click == 1`` rows for a value to be kept.

    Returns
    -------
    dict
        Maps column name to a list of domain values, most frequent first.
        Target columns with no value reaching either threshold are left out.
    """
    col_domain = {}

    print("########## For tolerated columns: ##########")
    for c in tolerated_cols:
        col_domain[c] = df[c].value_counts().index.tolist()
        print(f"'{c}' domain size {len(col_domain[c])}")

    # Split by label once; the split does not depend on the column being scanned.
    negatives = df[df["click"] == 0]
    positives = df[df["click"] == 1]

    print("########## For target columns: ##########")
    for c in target_cols:
        counts0 = negatives[c].value_counts()
        counts1 = positives[c].value_counts()

        frequent = (counts0[counts0 >= thresh0].index.tolist()
                    + counts1[counts1 >= thresh1].index.tolist())

        # nothing frequent enough: this column gets no domain at all
        if not frequent:
            continue

        # dict.fromkeys removes duplicates but, unlike set(), keeps a stable order
        col_domain[c] = list(dict.fromkeys(frequent + ['other']))
        print(f"'{c}' domain size {len(col_domain[c])}")

    return col_domain


In [None]:
## in the original 6G data, there are total 40428967 samples including:
## click == 1: 6865066
## click == 0: 33563901

## we require that for each column, say 'banner_pos', each possible value of this columns
## should contain no less than 1% of the total 0 or 1 label in train_df
## scale each label total down to the 2,000,000-row sample, then take 1% of it
thresh0 = int(2000000 / 40428967 * 33563901) // 100
thresh1 = int(2000000 / 40428967 * 6865066) // 100

print(f"Threshold for label 0 and 1: {thresh0}, {thresh1}")

col_domain = choose_domain_by_col(train_df, tolerated_cols, target_cols, thresh0, thresh1)

## sum of all domain sizes
print(f"Total number of features: {sum(len(lst) for lst in col_domain.values())}")

In [None]:
## define function to check the 0/1 (click label) distribution of a column:
#train_df.query("click == 0")['device_id'].value_counts()

def col_01_check(col_name, df=None):
    """Print, for every value of ``col_name``, its row count per click label.

    The printed table has one row per distinct value and two columns,
    ``<col_name>_0_count`` (rows with ``click == 0``) and
    ``<col_name>_1_count`` (rows with ``click == 1``). A value that never
    occurs with one of the labels shows NaN in that column.

    Parameters
    ----------
    col_name : str
        Column to inspect.
    df : pandas.DataFrame, optional
        Data to inspect; defaults to the notebook-level ``train_df``.
    """
    if df is None:
        df = train_df
    # The label-0 counts must come first so they line up with the column names
    # assigned below (the two series used to be built in the opposite order).
    counts_0 = df.loc[df["click"] == 0, col_name].value_counts()
    counts_1 = df.loc[df["click"] == 1, col_name].value_counts()
    tmp_count = pd.concat([counts_0, counts_1], axis=1, sort=False)
    tmp_count.columns = [col_name+'_0_count', col_name+'_1_count']
    print(tmp_count)

In [None]:
col_01_check('site_category')
col_01_check('device_id')
col_01_check('device_ip')

In [None]:
for key, lst in col_domain.items():
    print(f"'{key}' domain size {len(lst)}: ", lst)

In [None]:
train_df.dtypes

In [None]:
## now need to transfer train_df data type to strings for each column

In [None]:
from pyspark.sql.functions import udf
import pyspark.sql.types as SparkType

## Spark UDF that renders a value as text. The declared return type is
## StringType, so the lambda has to return a str rather than an int.
transfer_to_string = udf(lambda s: str(s), SparkType.StringType())

In [None]:
#train_df = train_df.withColumn('int_hour', get_hour('hour')).drop('hour')

#df[["a", "b"]] = df[["a", "b"]].apply(str)

## define function to check, column by column, whether the sample should be discarded:
    

In [None]:
#col_list = list(train_df.columns)
#for i in range(2000000):
#    print(i)
#    train_df.loc[i] = 

## Independent copy of train_df. Copying row by row through iterrows()/.loc is
## far too slow for millions of rows and discards the column dtypes;
## DataFrame.copy() keeps the same index, columns and values.
train_df_copy = train_df.copy()

In [None]:
train_df_copy.head(5)

In [None]:
## check data types

In [None]:
type(train_df['click'])

In [None]:
## now need to do transformation for each row in train_df

In [None]:
print(col_domain)

In [None]:
## union of all per-column domains stored in col_domain
target_domain = set().union(*col_domain.values())
print(f"Size of the target domain: {len(target_domain)}")
#print(target_domain)

In [None]:
## Keep a row only if its value in every target column is part of target_domain.
## The mask is sized from train_df itself rather than a hard-coded row count.
## NOTE(review): membership is tested against the union of all column domains,
## not the domain of the column being checked - confirm that this is intended.
keep_mask = np.ones(len(train_df), dtype=bool)
for col in target_cols:
    in_domain = train_df[col].isin(target_domain).to_numpy()
    print(f"Checking {len(in_domain)} rows for column: {col}")
    keep_mask &= in_domain

## ascending positional row numbers, suitable for train_df.iloc
idxes = np.flatnonzero(keep_mask).tolist()
print(f"How many samples left: {len(idxes)}")

In [None]:
for col in sample_df.columns:
    print(col, sample_df[col].nunique())

In [None]:
sample_df = train_df.iloc[idxes]

In [None]:
sample_df.head(5)

In [None]:
sample_df.dtypes

In [None]:
## print each column of sample_df next to its number of distinct values
for col, n_unique in sample_df.nunique().items():
    print(col, n_unique)

In [None]:
#sample_df.drop('device_id', axis=1, inplace=True)
#sample_df.drop('device_ip', axis=1, inplace=True)
## remove the 'C1' and 'device_type' columns from sample_df in place
sample_df.drop(columns=['C1', 'device_type'], inplace=True)

In [None]:
## One-hot encode every column except the label. The columns are listed in frame
## order: going through a set() made the order of the generated dummy columns
## differ from one kernel session to the next.
dum_col = [c for c in sample_df.columns if c != 'click']
sample_df = pd.get_dummies(sample_df, columns=dum_col)

In [None]:
sample_df.head(2)

In [None]:
#import sys
#!{sys.executable} -m pip install xgboost

In [None]:
## now we can build a vectorizer pipeline, or maybe try hashing first?

In [None]:
def hash_columns(self):
    """Replace every column except ``click`` by an integer-hashed ``<name>_int`` column.

    The frame is modified in place and also returned. Each hashed column is
    appended at the end and its source column is dropped.

    Strings are hashed with a fixed 64-bit BLAKE2b digest instead of the
    built-in ``hash()``: Python salts string hashes per process, so the
    built-in would give different feature values in every kernel session.
    All other values still go through ``hash()``.

    Parameters
    ----------
    self : pandas.DataFrame
        Frame whose columns are hashed.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    import hashlib

    def _stable_hash(value):
        # Process-independent signed 64-bit hash for text; hash() for the rest.
        if isinstance(value, str):
            digest = hashlib.blake2b(value.encode('utf-8'), digest_size=8).digest()
            return int.from_bytes(digest, 'big', signed=True)
        return hash(value)

    new_col_suffix = '_int'
    # snapshot the names first: the frame's columns change inside the loop
    for name in list(self.columns):
        if name == 'click':
            continue
        self[name + new_col_suffix] = self[name].map(_stable_hash)
        self.drop(columns=[name], inplace=True)
    return self

In [None]:
train_df = hash_columns(train_df)

In [None]:
train_df.columns

In [None]:
#X_train = train_df.drop('click', axis=1)
#Y_train = train_df.click
#X_train.dtypes

## feature matrix: every column of train_df except the label
pre_X = train_df[[name for name in train_df.columns if name != 'click']]
## label values as a plain array
pre_Y = train_df['click'].values

In [None]:
pre_X.head(2)

In [None]:
# Install a pip package in the current Jupyter kernel
#import sys
#!{sys.executable} -m pip install lightgbm

#!conda install -c conda-forge lightgbm

In [None]:
import lightgbm as lgb

#create lightgbm dataset
## Each row goes to the training part when its uniform random draw is below 0.8,
## otherwise to the test part.
mask = np.random.rand(pre_X.shape[0]) < 0.8

train_X = pre_X[mask]
train_Y = pre_Y[mask]
test_X = pre_X[np.logical_not(mask)]
test_Y = pre_Y[np.logical_not(mask)]

In [None]:
lgb_train = lgb.Dataset(train_X, train_Y)
lgb_test = lgb.Dataset(test_X, test_Y, reference = lgb_train)

In [None]:
## model parameters:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 32,          # maximum number of leaves per tree
    'learning_rate': 0.05,
    'feature_fraction': 0.7,   # sample 70% of the features for each tree
    'verbose': 1
}

## model training:
## Early stopping and periodic logging are passed as callbacks; the former
## `early_stopping_rounds` / `verbose_eval` keyword arguments are no longer
## accepted by lgb.train in current LightGBM releases.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000,
                valid_sets  = [lgb_train, lgb_test],
                valid_names = ['train', 'eval'],
                callbacks = [
                    lgb.early_stopping(stopping_rounds=500),
                    lgb.log_evaluation(period=1000),
                ])



In [None]:
#print(gbm.best_score)
#print(gbm.best_iteration)

In [None]:
#type(lgb_test)
## predicted values for the test rows, cut at 0.5 to get hard labels
pred_prob = gbm.predict(test_X)
pred = pred_prob > 0.5
## share of test rows predicted as positive
print(np.count_nonzero(pred) / len(pred))

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

In [None]:
# calculate accuracy, precision and recall
def compute_scores(conf_matrix):
    """Print accuracy, precision and recall derived from a 2x2 confusion matrix.

    Parameters
    ----------
    conf_matrix : 2x2 indexable
        Read as ``[[tn, fp], [fn, tp]]``.

    A score whose denominator is zero (for example precision when nothing was
    predicted positive) is printed as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    tn, fp = conf_matrix[0][0], conf_matrix[0][1]
    fn, tp = conf_matrix[1][0], conf_matrix[1][1]

    def _ratio(numerator, denominator):
        # an undefined ratio is reported as NaN
        return numerator / denominator if denominator else float('nan')

    accuracy = _ratio(tp + tn, tp + fp + fn + tn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)

    print ("Accuracy is: %0.3f" % accuracy)
    print ("precision is: %0.3f" % precision)
    print ("recall is: %0.3f" % recall)

conf_matrix = confusion_matrix(test_Y, pred)
compute_scores(conf_matrix)
print("AUC is: ", roc_auc_score(test_Y, pred_prob))

In [None]:
from sklearn.metrics import roc_curve
from sklearn import metrics

# define function for plotting curve:
def plot_curve(model_name, x, name1, y1, name2, y2, name3, y3):
    """Draw three labelled curves over the same x values on the current pyplot axes.

    ``name1``/``y1``, ``name2``/``y2`` and ``name3``/``y3`` are the legend label
    and y values of each curve; ``model_name`` is appended to the plot title.
    """
    for curve_label, curve_values in ((name1, y1), (name2, y2), (name3, y3)):
        plt.plot(x, curve_values, lw=2, label=curve_label)
    plt.xlabel('Model threshold')
    plt.ylabel('Model score')
    plt.title('Model score vs threshold for ' + model_name)
    plt.legend(loc='lower center')
    plt.xlim([-0.05, 1.05])

## accuracy_score is used below but is not imported anywhere else in the notebook
from sklearn.metrics import accuracy_score

## initialize accuracy, precision and recall:
accuracy, precision, recall = [], [], []
## compute probability of positive case:
pred_prob = gbm.predict(test_X)
## compute fpr, tpr and threshold:
fpr, tpr, thresh = roc_curve(test_Y, pred_prob)
## score the hard predictions obtained at every ROC threshold
for cutoff in thresh:
    hard_pred = pred_prob >= cutoff   # computed once per threshold
    accuracy.append(accuracy_score(test_Y, hard_pred))
    precision.append(precision_score(test_Y, hard_pred))
    recall.append(recall_score(test_Y, hard_pred))

accuracy, precision, recall = np.asarray(accuracy), np.asarray(precision), np.asarray(recall)
## the first threshold is left out of the plot
plot_curve( "lightgbm", thresh[1:], "Accucary", accuracy[1:], "Precision", precision[1:], "Recall", recall[1:] )

plt.tight_layout()
plt.show()

In [None]:
## train-test split: reserve 20% for testing
#from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test =  train_test_split(preX, preY, test_size = 0.2, random_state=0)

In [None]:
#import models:
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from xgboost import XGBClassifier

# Logistic Regression
classifier_LR = LogisticRegression()

# K Nearest Neighbors
#classifier_KNN = KNeighborsClassifier()

# Random Forest
classifier_RF = RandomForestClassifier(random_state = 0)

## support vector machine:
classifier_SVM = SVC()

## Gradient boosting:
classifier_GB = XGBClassifier(random_state=0)

In [None]:
# Report the mean 5-fold cross-validation score of each of the four models
model_names = ["Logistic Regression", "Random Forest", "Support Vector Machine", "Gradient Boosting"]
model_list = [classifier_LR, classifier_RF, classifier_SVM, classifier_GB]

for name, classifier in zip(model_names, model_list):
    cv_score = model_selection.cross_val_score(classifier, train_X, train_Y, cv=5)
    print("Accuracy for " + name + " is: ", cv_score.mean())
    

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve

## `best_models`, `X_test` and `Y_test` are not defined anywhere in this notebook.
## The two classifiers named below are fitted here on the training split and
## evaluated on test_X / test_Y, the split that the notebook does create.
models = [classifier_RF.fit(train_X, train_Y), classifier_GB.fit(train_X, train_Y)]
model_names = ["Random Forest", "Gradient Boosting"]

fig, axes = plt.subplots(1,2, figsize=(10,4))

# define function for plotting curve:
def plot_curve(model_name, ax, x, name_y1, y1, name_y2, y2, name_y3, y3):
    """Draw three labelled curves over the same x values on the axes ``ax``.

    ``name_y1``/``y1``, ``name_y2``/``y2`` and ``name_y3``/``y3`` are the legend
    label and y values of each curve; ``model_name`` is appended to the title.
    """
    ax.plot(x, y1, lw = 2, label = name_y1)
    ax.plot(x, y2, lw = 2, label = name_y2 )
    ax.plot(x, y3, lw = 2, label = name_y3)
    ax.set_xlabel('Model threshold')
    ax.set_ylabel('Model score')
    ax.set_title('Model score vs threshold for ' + model_name)
    ax.legend(loc='lower center')
    ax.set_xlim([-0.05, 1.05])
    return

for i, (model, model_name) in enumerate(zip(models, model_names)):
    ## initialize accuracy, precision and recall:
    accuracy, precision, recall = [], [], []
    ## compute probability of positive case:
    pred_prob = model.predict_proba(test_X)[:, 1]
    ## compute fpr, tpr and threshold:
    fpr, tpr, thresh = roc_curve(test_Y, pred_prob)
    ## score the hard predictions obtained at every ROC threshold
    for cutoff in thresh:
        hard_pred = pred_prob >= cutoff
        accuracy.append(accuracy_score(test_Y, hard_pred))
        precision.append(precision_score(test_Y, hard_pred))
        recall.append(recall_score(test_Y, hard_pred))

    accuracy, precision, recall = np.asarray(accuracy), np.asarray(precision), np.asarray(recall)
    ## the first threshold is left out of the plot
    plot_curve( model_name, axes[i], thresh[1:],\
                "Accucary", accuracy[1:],\
                "Precision", precision[1:],\
                "Recall", recall[1:] )

plt.tight_layout()
plt.show()