#  WNS Analytics Wizard 2019 CONTEST

**AIM**: To predict whether an ad shown on a partner website will be clicked by the user, using the ad impression data and the user's browsing history on the ZAP website.

In [None]:
# importing the needed libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
import lightgbm
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from imblearn.over_sampling import RandomOverSampler
import warnings
import seaborn as sns
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas chained-assignment and deprecation warnings,
# which can mask real problems in the cells below.
warnings.filterwarnings("ignore")
# Raise pandas' display limits so wide frames and long listings are printed in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
# List the contents of the training-data directory (shell command run from the notebook).
!ls "/content/drive/My Drive/Hackathon/train"

In [None]:
# Loading the Train and Test Files.
# The four tables are read in path order; the unpacking targets follow the same order.
train, item, view_log, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/My Drive/Hackathon/train/train.csv",
        "/content/drive/My Drive/Hackathon/train/item_data.csv",
        "/content/drive/My Drive/Hackathon/train/view_log.csv",
        "/content/drive/My Drive/Hackathon/test/test.csv",
    )
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the raw user view log.
view_log.head()

In [None]:
# Preview the first rows of the item attribute table.
item.head()

In [None]:
# The view log only carries item_id, so attach the item attributes to every log row.
# A left join keeps log rows whose item_id has no match in the item table.
product_view_log = view_log.merge(item, how="left", on="item_id")
product_view_log.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Number of training rows per value of the target column `is_click`.
train['is_click'].value_counts()

In [None]:
# Ratio of rows with is_click == 1 to rows with is_click == 0 (not the share of all rows).
click_counts = train['is_click'].value_counts()
clicked_to_not_clicked = click_counts[1] / click_counts[0]
print(f"Ad's clicked to Not clicked ratio is: {clicked_to_not_clicked!s}")

Approximately 4.8% of the ads displayed were clicked, so what we have here is an imbalanced binary classification problem.

Checking Time Period of the Data

In [None]:
# Let us Convert the Dates into DateTime Object.
# Each (frame, column) pair is parsed in place so time arithmetic and the .dt accessor work later.
for frame, time_column in (
    (product_view_log, 'server_time'),
    (train, 'impression_time'),
    (test, 'impression_time'),
):
    frame[time_column] = pd.to_datetime(frame[time_column])

In [None]:
# Earliest and latest view-log timestamps, and the span between them.
log_start = product_view_log['server_time'].min()
log_end = product_view_log['server_time'].max()
print(f"User Product logs are available from : {log_start!s} to {log_end!s}")
print(f"Equivalent to {(log_end - log_start)!s}")

In [None]:
# Combined span: from the first training impression to the last test impression.
first_train_impression = train['impression_time'].min()
last_test_impression = test['impression_time'].max()
print(f"Train and test records are available from : {first_train_impression!s} to {last_test_impression!s}")
print(f"Equivalent to {(last_test_impression - first_train_impression)!s}")

In [None]:
# Span covered by the training impressions.
train_start = train['impression_time'].min()
train_end = train['impression_time'].max()
print(f"Train records are available from : {train_start!s} to {train_end!s}")
print(f"Equivalent to {(train_end - train_start)!s}")

In [None]:
# Span covered by the test impressions.
test_start = test['impression_time'].min()
test_end = test['impression_time'].max()
print(f"Test records are available from : {test_start!s} to {test_end!s}")
print(f"Equivalent to {(test_end - test_start)!s}")

**Observations** <br>
1) We have previous one month view history of users in the ZAP Website before making Predictions.<br>
2) Training Data is 4 Weeks and Test Data is 1 Week.

# Factors from the Product View Log and impression data that may influence whether the user clicks the Ad
1) Temporal Factors:<br>
&emsp;a) Time of the Day in which user enters the Partner Website - Both Hours and Parts of the Day <br>
&emsp;&emsp;Morning <br>
&emsp;&emsp;&emsp;&emsp;&emsp;Early morning     5 to 8 am <br>
&emsp;&emsp;&emsp;&emsp;&emsp;Late morning      11 am to 12pm <br>
&emsp;&emsp;Afternoon <br>
&emsp;&emsp;&emsp;&emsp;&emsp;Early afternoon   1 to 3pm <br>
&emsp;&emsp;&emsp;&emsp;&emsp;Late afternoon    4 to 5pm <br>
&emsp;&emsp;Evening<br>
&emsp;&emsp;&emsp;&emsp;&emsp;Early evening         5 to 7 pm <br>
&emsp;&emsp;&emsp;&emsp;&emsp;Late evening          7 pm to 9 pm <br>
&emsp;&emsp;Night                 9 pm to 4 am <br>
&emsp;b) Number of times user visited the site in last 3 hours, 6 hours, 12 hours, 1 day, 2 days, 3 days, 1 week and 1 month.<br>
&emsp;c) Number of products user viewed in the site for last 3 hours, 6 hours, 12 hours, 1 day, 2 days, 3 days, 1 week and 1 month.<br>
&emsp;d) Number of products viewed by the user in the last session<br> 
&emsp;e) Number of times user was shown the Ad and Number of times user clicked the Ad as Click Ratio of user<br> 
&emsp;f) Last 5 products viewed by the user by the time ad was shown. Product 1, Product 2, Product 3, Product 4, Product 5.<br> 
&emsp;g) Time spent by the user on a site for last 5 sessions as session1, session2, session3, session4, session5.<br> 
&emsp;h) Day of the Week & Weekend or not weekend<br> 

In [None]:
# Column dtypes and memory summary of the training table.
train.info()

In [None]:
# Column dtypes and memory summary of the test table.
test.info()

In [None]:
# Full column listing with non-null counts for the raw view log.
# NOTE(review): `null_counts` is the older spelling of this option; newer pandas
# releases call it `show_counts` — confirm against the installed pandas version.
view_log.info(verbose=True, null_counts=True)

In [None]:
# Column dtypes and memory summary of the item attribute table.
item.info()

In [None]:
# Non-null counts after the left join: columns coming from the item table can now contain
# missing values for log rows whose item_id had no match.
product_view_log.info(verbose=True, null_counts=True)

In [None]:
# Sample of log rows whose product_type is missing after the join.
product_view_log[product_view_log['product_type'].isnull()].head(5)

In [None]:
print("Item Id's of Product Not Available and their Counts: ")
# Distinct item ids among the log rows with no product_type, and how many there are.
items_without_attributes = product_view_log.loc[
    product_view_log['product_type'].isnull(), 'item_id'
].unique()
items_without_attributes, len(items_without_attributes)

Product attributes were unavailable for 101 items, which together appear in 1,782 log rows.

In [None]:
# How the log rows with missing product attributes are distributed over calendar months.
product_view_log.loc[
    product_view_log['product_type'].isnull(), 'server_time'
].dt.month.value_counts()

Since the rows with missing product information are spread across the months in proportion to the amount of data in each month, we will not use these rows when building the temporal product features; they are dropped in the next cell.

In [None]:
# Keep only log rows that have product attributes, then renumber the index from 0.
product_view_log = product_view_log.dropna(subset=['product_type']).reset_index(drop=True)

**Let us Start Extracting the Derived Features.**

Adding Part of the Day and Hour of the Day to Server time

In [None]:
# Hour (0-23) of each view, taken from the server timestamp.
product_view_log['Hour_of_the_day'] = product_view_log['server_time'].dt.hour
# Label every hour with a part of the day.
# .loc[mask, column] writes into the frame itself. The former frame[column][mask] = value
# form is chained assignment: pandas does not guarantee it writes through, and under
# Copy-on-Write it is a silent no-op that would leave the column all None.
log_hours = product_view_log['Hour_of_the_day']
product_view_log['Part_of_the_day'] = None
product_view_log.loc[log_hours.between(5, 8), 'Part_of_the_day'] = "Early Morning"
product_view_log.loc[log_hours.between(9, 10), 'Part_of_the_day'] = "Middle Morning"
product_view_log.loc[log_hours.between(11, 12), 'Part_of_the_day'] = "Late Morning"
product_view_log.loc[log_hours.between(13, 15), 'Part_of_the_day'] = "Early Afternoon"
product_view_log.loc[log_hours.between(16, 17), 'Part_of_the_day'] = "Late Afternoon"
product_view_log.loc[log_hours.between(18, 19), 'Part_of_the_day'] = "Early Evening"
product_view_log.loc[log_hours.between(20, 21), 'Part_of_the_day'] = "Late Evening"
# Night wraps around midnight: 22-23 and 0-4.
product_view_log.loc[(log_hours >= 22) | (log_hours <= 4), 'Part_of_the_day'] = "Night"
product_view_log['Part_of_the_day'].value_counts()

In [None]:
def _pad_to_length(values, length):
    """Return `values` truncated or right-padded with None to exactly `length` items."""
    return (list(values) + [None] * length)[:length]


def product_session_counts(x, timespan_with_unit, count, log_df=None):
    """Build the flat view-history feature list for one ad impression.

    Only log rows of the same user with server_time strictly before the
    impression time are used.

    Parameters
    ----------
    x : row-like (e.g. a pandas Series)
        Must expose 'user_id' and 'impression_time'.
    timespan_with_unit : iterable of (value, unit)
        Look-back windows; each pair is passed to pd.to_timedelta(value, unit=unit).
    count : int
        Number of most recent sessions to describe, and number of most recently
        seen distinct values to keep for each "last visited" attribute.
    log_df : pandas.DataFrame, optional
        View log joined with item attributes. Defaults to the module-level
        `product_view_log`, which is what existing callers rely on.

    Returns
    -------
    list
        Concatenation, in this order, of:
        1. per-window distinct counts (one value per window) of session_id,
           category_1, category_2, category_3, product_type, item_id;
        2. per-session values (`count` each, most recent session first, padded with
           None): distinct category_1 / category_2 / category_3 / product_type /
           item_id counts, mean item_price, device_type, Hour_of_the_day,
           Part_of_the_day, session duration in seconds;
        3. last-visited values (`count` each, most recent first, padded with None):
           item_id, product_type, category_1, category_2, category_3, item_price.
    """
    if log_df is None:
        log_df = product_view_log

    impression_time = x['impression_time']
    user_log = log_df[log_df['user_id'] == x['user_id']]
    # Everything at or after the impression would leak the future, so drop it once up front.
    history = user_log[user_log['server_time'] < impression_time]

    # --- distinct counts inside each look-back window ---------------------------------
    windowed_columns = ('session_id', 'category_1', 'category_2', 'category_3',
                        'product_type', 'item_id')
    window_counts = {column: [] for column in windowed_columns}
    for timespan, timespan_unit in timespan_with_unit:
        window_start = impression_time - pd.to_timedelta(timespan, unit=timespan_unit)
        # The window is open on both ends: window_start < server_time < impression_time.
        in_window = history[history['server_time'] > window_start]
        for column in windowed_columns:
            window_counts[column].append(in_window[column].nunique())

    # --- one tuple of features per recent session -------------------------------------
    # sort_values returns a new frame; sorting the filtered slice in place would operate
    # on a temporary and trigger chained-assignment warnings.
    history = history.sort_values(by='server_time', ascending=False)
    per_session = []
    for session_id in history['session_id'].unique()[:count]:
        session = history[history['session_id'] == session_id]
        session_times = session['server_time']
        per_session.append((
            session['category_1'].nunique(),
            session['category_2'].nunique(),
            session['category_3'].nunique(),
            session['product_type'].nunique(),
            session['item_id'].nunique(),
            # Mean price of the viewed rows (this previously averaged item_id by mistake).
            session['item_price'].mean(),
            # Rows are newest-first, so position 0 is the session's latest view.
            session['device_type'].iloc[0],
            session['Hour_of_the_day'].iloc[0],
            session['Part_of_the_day'].iloc[0],
            # total_seconds() keeps whole days; .seconds would wrap at 24 hours.
            int((session_times.max() - session_times.min()).total_seconds()),
        ))
    feature_count_per_session = 10
    session_feature_lists = [
        _pad_to_length([session_row[position] for session_row in per_session], count)
        for position in range(feature_count_per_session)
    ]

    # --- most recently seen distinct values -------------------------------------------
    # NOTE: each attribute is de-duplicated on its own, so position k in one list does
    # not necessarily describe the same product as position k in another list.
    last_visited_lists = [
        _pad_to_length(history[column].unique().tolist(), count)
        for column in ('item_id', 'product_type', 'category_1', 'category_2',
                       'category_3', 'item_price')
    ]

    features = []
    for column in windowed_columns:
        features.extend(window_counts[column])
    for values in session_feature_lists + last_visited_lists:
        features.extend(values)
    return features

In [None]:
# (rows, columns) of the training table before the history features are added.
train.shape

In [None]:
%%time
#Number of times user visited the site in last 3 hours, 6 hours, 12 hours, 1 day, 2 days, 3 days, 1 week and 30 days.

# Names for the values returned by product_session_counts, in the same order:
# 6 windowed counts x 8 windows, 10 per-session features x 5 sessions,
# 6 "last visited" attributes x 5 positions (128 names in total).
# The double underscore in the 12-hour suffix is kept because later cells refer to
# those exact column names.
column_list = (
    [
        prefix + window_suffix
        for prefix in (
            'site_visit_count', 'category1_seen_count', 'category2_seen_count',
            'category3_seen_count', 'product_type_seen_count', 'product_item_seen_count',
        )
        for window_suffix in (
            '_last_3_hours', '_last_6_hours', '__last_12_hours', '_last_1_day',
            '_last_2_days', '_last_3_days', '_last_1_week', '_last_1_month',
        )
    ]
    + [
        prefix + str(session_rank)
        for prefix in (
            'category1_seen_count_with_session',
            'category2_seen_count_with_session',
            'category3_seen_count_with_session',
            'product_type_seen_count_with_session',
            'item_id_seen_count_with_session',
            'average_price_of_product_seen_count_with_session',
            'device_type_seen_count_with_session',
            'hour_of_the_day_seen_count_with_session',
            'part_of_the_day_seen_count_with_session',
            'time_spent_surfing_each_session_with_session',
        )
        for session_rank in range(1, 6)
    ]
    + [
        prefix + str(position)
        for prefix in (
            'last_visited_product_item', 'last_visited_product_type',
            'last_visited_product_category1_', 'last_visited_product_category2_',
            'last_visited_product_category3_', 'last_visited_product_price_',
        )
        for position in range(1, 6)
    ]
)

#Train
# One product_session_counts call per training impression (row-wise apply), using the
# eight look-back windows below and the 5 most recent sessions / values. The resulting
# lists become a frame, are joined to train on the row index and checkpointed to CSV.
lookback_windows = [(3,"h"), (6, "h"), (12, "h"), (1, "d") ,(2, "d"), (3, "d"), (7, "d"), (30, "d")]
train_feature_rows = train[['impression_time','user_id']].apply(
    lambda impression: product_session_counts(impression, lookback_windows, 5),
    axis=1,
)
user_visited_count_train_df = pd.DataFrame(train_feature_rows.values.tolist())
user_visited_count_train_df.columns = column_list
train_a = train.merge(user_visited_count_train_df, left_index=True, right_index=True)
train_a.to_csv("/content/drive/My Drive/Hackathon/train/train_check1.csv", index=False)

In [None]:
# Preview the training table with the view-history features attached.
train_a.head()

In [None]:
#Test
# Same feature construction as for train: one product_session_counts call per test
# impression, joined back on the row index and checkpointed to CSV.
lookback_windows = [(3,"h"), (6, "h"), (12, "h"), (1, "d") ,(2, "d"), (3, "d"), (7, "d"), (30, "d")]
test_feature_rows = test[['impression_time','user_id']].apply(
    lambda impression: product_session_counts(impression, lookback_windows, 5),
    axis=1,
)
user_visited_count_test_df = pd.DataFrame(test_feature_rows.values.tolist())
user_visited_count_test_df.columns = column_list
test_a = test.merge(user_visited_count_test_df, left_index=True, right_index=True)
test_a.to_csv("/content/drive/My Drive/Hackathon/test/test_check1.csv", index=False)

In [None]:
# Reload the checkpointed feature tables; this replaces the in-memory train/test frames,
# so impression_time comes back as text and is parsed again further down.
train, test = (
    pd.read_csv(checkpoint_path)
    for checkpoint_path in (
        "/content/drive/My Drive/Hackathon/train/train_check1.csv",
        "/content/drive/My Drive/Hackathon/test/test_check1.csv",
    )
)

In [None]:
# Non-null counts per column of the reloaded training features (shows where padding left gaps).
train.info(verbose=True, null_counts=True)

In [None]:
# Non-null counts per column of the reloaded test features.
test.info(verbose=True, null_counts=True)

In [None]:
# Fill missing values: sentinel -99 for numeric columns, "NOTAVAILABLE" for the rest.
# The numeric fill is assigned back explicitly. Filling the frame returned by the private
# _get_numeric_data() in place only reaches `train` when that frame happens to be a view;
# otherwise numeric gaps would be filled with the string on the next line.
train_numeric_columns = train.select_dtypes(include='number').columns
train[train_numeric_columns] = train[train_numeric_columns].fillna(-99)
train.fillna("NOTAVAILABLE", inplace=True)
train.info(verbose=True, null_counts=True)

In [None]:
# Fill missing values: sentinel -99 for numeric columns, "NOTAVAILABLE" for the rest.
# The numeric fill is assigned back explicitly. Filling the frame returned by the private
# _get_numeric_data() in place only reaches `test` when that frame happens to be a view;
# otherwise numeric gaps would be filled with the string on the next line.
test_numeric_columns = test.select_dtypes(include='number').columns
test[test_numeric_columns] = test[test_numeric_columns].fillna(-99)
test.fillna("NOTAVAILABLE", inplace=True)
test.info(verbose=True, null_counts=True)

In [None]:
# impression_time was read back from CSV as text, so parse it again before using .dt.
train['impression_time'] = pd.to_datetime(train['impression_time'])
train['Hour_of_the_day'] = train['impression_time'].dt.hour
# Label every hour with a part of the day.
# .loc[mask, column] writes into the frame itself. The former frame[column][mask] = value
# form is chained assignment: pandas does not guarantee it writes through, and under
# Copy-on-Write it is a silent no-op that would leave the column all None.
train_hours = train['Hour_of_the_day']
train['Part_of_the_day'] = None
train.loc[train_hours.between(5, 8), 'Part_of_the_day'] = "Early Morning"
train.loc[train_hours.between(9, 10), 'Part_of_the_day'] = "Middle Morning"
train.loc[train_hours.between(11, 12), 'Part_of_the_day'] = "Late Morning"
train.loc[train_hours.between(13, 15), 'Part_of_the_day'] = "Early Afternoon"
train.loc[train_hours.between(16, 17), 'Part_of_the_day'] = "Late Afternoon"
train.loc[train_hours.between(18, 19), 'Part_of_the_day'] = "Early Evening"
train.loc[train_hours.between(20, 21), 'Part_of_the_day'] = "Late Evening"
# Night wraps around midnight: 22-23 and 0-4.
train.loc[(train_hours >= 22) | (train_hours <= 4), 'Part_of_the_day'] = "Night"
train['Part_of_the_day'].value_counts()

In [None]:
# impression_time was read back from CSV as text, so parse it again before using .dt.
test['impression_time'] = pd.to_datetime(test['impression_time'])
test['Hour_of_the_day'] = test['impression_time'].dt.hour
# Label every hour with a part of the day.
# .loc[mask, column] writes into the frame itself. The former frame[column][mask] = value
# form is chained assignment: pandas does not guarantee it writes through, and under
# Copy-on-Write it is a silent no-op that would leave the column all None.
test_hours = test['Hour_of_the_day']
test['Part_of_the_day'] = None
test.loc[test_hours.between(5, 8), 'Part_of_the_day'] = "Early Morning"
test.loc[test_hours.between(9, 10), 'Part_of_the_day'] = "Middle Morning"
test.loc[test_hours.between(11, 12), 'Part_of_the_day'] = "Late Morning"
test.loc[test_hours.between(13, 15), 'Part_of_the_day'] = "Early Afternoon"
test.loc[test_hours.between(16, 17), 'Part_of_the_day'] = "Late Afternoon"
test.loc[test_hours.between(18, 19), 'Part_of_the_day'] = "Early Evening"
test.loc[test_hours.between(20, 21), 'Part_of_the_day'] = "Late Evening"
# Night wraps around midnight: 22-23 and 0-4.
test.loc[(test_hours >= 22) | (test_hours <= 4), 'Part_of_the_day'] = "Night"
test['Part_of_the_day'].value_counts()

In [None]:
# Calendar features of the impression: day of week (Monday=0 ... Sunday=6),
# weekend flag (Saturday/Sunday -> 1, else 0) and day of month.
train['Day_Of_Week'] = train['impression_time'].dt.dayofweek
# Derive the flag directly instead of frame['Weekend'][mask] = 1, which is chained
# assignment and is not guaranteed to write into the frame.
train['Weekend'] = train['Day_Of_Week'].isin([5,6]).astype('int64')
train['Day'] = train['impression_time'].dt.day
# The test weekday must come from the test timestamps; it was previously taken from
# `train`, which index-aligned training weekdays onto unrelated test rows.
test['Day_Of_Week'] = test['impression_time'].dt.dayofweek
test['Weekend'] = test['Day_Of_Week'].isin([5,6]).astype('int64')
test['Day'] = test['impression_time'].dt.day

In [None]:
# Re-apply the missing-value fill now that the time features exist:
# sentinel -99 for numeric columns, "NOTAVAILABLE" for everything else.
# The numeric fill is assigned back explicitly. Filling the frame returned by the private
# _get_numeric_data() in place only reaches the original when it happens to be a view.
train_numeric_columns = train.select_dtypes(include='number').columns
train[train_numeric_columns] = train[train_numeric_columns].fillna(-99)
train.fillna("NOTAVAILABLE", inplace=True)
train.info(verbose=True, null_counts=True)
test_numeric_columns = test.select_dtypes(include='number').columns
test[test_numeric_columns] = test[test_numeric_columns].fillna(-99)
test.fillna("NOTAVAILABLE", inplace=True)
test.info(verbose=True, null_counts=True)

In [None]:
# Extract Click Ratio Per User
#
# Per-user click statistics, computed only from impressions whose day of month is NOT 7-13.
# Normalized_Click_Ratio = (clicks - expected clicks) / sqrt(expected clicks), where the
# expected clicks are the user's impressions times the overall click rate of those rows.

user_history_rows = ~train['impression_time'].dt.day.isin([7,8,9,10,11,12,13])
temp1 = train[['user_id', 'is_click']][user_history_rows].groupby('user_id').is_click.count().reset_index(name='TotalAdShown')
temp2 = train[['user_id', 'is_click']][user_history_rows].groupby('user_id').is_click.sum().reset_index(name='TotalAdClicked')
temp=pd.merge(temp1, temp2, on='user_id', how='left')
temp.info(verbose=True, null_counts=True)
temp['Click_Ratio']=temp['TotalAdClicked']/temp['TotalAdShown']
TotalAdShown=temp.TotalAdShown.sum()
TotalAdClicked=temp.TotalAdClicked.sum()
Avg_Add_Clicked=TotalAdClicked/TotalAdShown
temp['Expected_Click']=temp['TotalAdShown']*Avg_Add_Clicked
temp['Normalized_Click_Ratio']=(temp['TotalAdClicked']-temp['Expected_Click'])/(temp['Expected_Click'] ** 0.5)
train['Normalized_Click_Ratio'] = train['user_id'].map(temp[['user_id', 'Normalized_Click_Ratio']].set_index("user_id").to_dict()['Normalized_Click_Ratio'])
# Users with no impressions in the history rows get the mean score. The result is assigned
# back because fillna(inplace=True) on a column selection may act on a temporary copy.
train['Normalized_Click_Ratio'] = train['Normalized_Click_Ratio'].fillna(train['Normalized_Click_Ratio'].mean())
train['Normalized_Click_Ratio'].plot.hist()

In [None]:
# Extract Click Ratio Per App
#
# Per-app click statistics, computed only from impressions whose day of month is NOT 7-13.
# Normalized_Click_Ratio_app = (clicks - expected clicks) / sqrt(expected clicks), where the
# expected clicks are the app's impressions times the overall click rate of those rows.

app_history_rows = ~train['impression_time'].dt.day.isin([7,8,9,10,11,12,13])
temp1 = train[['app_code', 'is_click']][app_history_rows].groupby('app_code').is_click.count().reset_index(name='TotalAdShown')
temp2 = train[['app_code', 'is_click']][app_history_rows].groupby('app_code').is_click.sum().reset_index(name='TotalAdClicked')
temp=pd.merge(temp1, temp2, on='app_code', how='left')
temp.info(verbose=True, null_counts=True)
temp['Click_Ratio']=temp['TotalAdClicked']/temp['TotalAdShown']
TotalAdShown=temp.TotalAdShown.sum()
TotalAdClicked=temp.TotalAdClicked.sum()
Avg_Add_Clicked=TotalAdClicked/TotalAdShown
temp['Expected_Click']=temp['TotalAdShown']*Avg_Add_Clicked
temp['Normalized_Click_Ratio']=(temp['TotalAdClicked']-temp['Expected_Click'])/(temp['Expected_Click'] ** 0.5)
train['Normalized_Click_Ratio_app'] = train['app_code'].map(temp[['app_code', 'Normalized_Click_Ratio']].set_index("app_code").to_dict()['Normalized_Click_Ratio'])
# Apps with no impressions in the history rows get the mean of the APP score. This used
# the mean of the per-user column by mistake. The result is assigned back because
# fillna(inplace=True) on a column selection may act on a temporary copy.
train['Normalized_Click_Ratio_app'] = train['Normalized_Click_Ratio_app'].fillna(train['Normalized_Click_Ratio_app'].mean())
train['Normalized_Click_Ratio_app'].plot.hist()

In [None]:
# Keep independent copies of both frames before columns are dropped and encoded below.
train_backup, test_backup = train.copy(), test.copy()

In [None]:
# Remove the row identifier and the raw timestamp from the training frame (in place).
train.drop(columns=['impression_id', 'impression_time'], inplace=True)

In [None]:
# For Checking Correlations

def get_redundant_pairs(df):
    '''Return the (row, column) label pairs on and below the diagonal of df's correlation matrix.

    For columns c0, c1, ... the result contains every (ci, cj) with j <= i.
    '''
    names = list(df.columns)
    return {
        (row_name, col_name)
        for position, row_name in enumerate(names)
        for col_name in names[:position + 1]
    }

def get_top_abs_correlations(df, n=5):
    '''Return the n column pairs of df with the largest absolute correlation.

    The correlation matrix is flattened to a pair-indexed Series; the diagonal and
    lower-triangle pairs are removed so each pair appears once, and the remainder is
    sorted in descending order.
    '''
    abs_corr_by_pair = df.corr().abs().unstack()
    ranked_pairs = (
        abs_corr_by_pair
        .drop(labels=get_redundant_pairs(df))
        .sort_values(ascending=False)
    )
    return ranked_pairs.iloc[0:n]

print("Top Absolute Correlations")
# Correlate only the non-object columns. The builtin `object` is used because the
# `np.object` alias was deprecated and later removed from NumPy; both name the same dtype.
print(get_top_abs_correlations(train[train.loc[:, train.dtypes != object].columns], 500))

In [None]:
# Encoding Categorical Variables into Numerical Variables.
# Columns that describe() does not report as numeric are treated as categorical; each gets
# its own LabelEncoder, kept in `d` under the column name so it can be reused later.

d = defaultdict(LabelEncoder)
Numerical_Columns = list(train.describe().columns)
Categorical_columns = list(set(train.columns).difference(Numerical_Columns))
for categorical_name in Categorical_columns:
    train[categorical_name] = d[categorical_name].fit_transform(train[categorical_name])

In [None]:
# Training rows: every impression except those on day-of-month 12 and 13 (held out below).
# The target and the helper column 'Day' are removed from the feature matrix.
training_rows = ~train['Day'].isin([12,13])
X_train = train.loc[training_rows].drop(columns=['is_click','Day'])
y_train = train.loc[training_rows, 'is_click']

In [None]:
# Balance the classes by randomly duplicating minority-class rows.
columns = X_train.columns.to_list()
# Fixed seed so the resampled training set is the same on every run.
smt = RandomOverSampler(random_state=42)
# fit_resample is the supported name; the fit_sample alias was removed from imbalanced-learn.
X_train, y_train = smt.fit_resample(X_train, y_train)
X_train = pd.DataFrame(X_train, columns=columns)
# Build the target frame from raw values: the sampler may return a pandas Series named
# 'is_click', and pd.DataFrame(series, columns=['target']) would then select a
# non-existent column and produce only NaN.
y_train = pd.Series(np.asarray(y_train).ravel(), name='target').to_frame()

In [None]:
# Hold-out rows: impressions on day-of-month 12 and 13, with the same columns as X_train.
holdout_rows = train['Day'].isin([12,13])
X_test = train.loc[holdout_rows].drop(columns=['is_click','Day'])
y_test = train.loc[holdout_rows, 'is_click']

In [None]:
# Split the hold-out rows 50/50 into a validation part and a test part (fixed seed).
x_val, x_test, y_val, y_test1 = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [None]:
# List the feature columns available in the hold-out frame.
X_test.columns.to_list()

In [None]:
# Show which columns were label-encoded as categorical.
Categorical_columns

In [None]:
# Baseline feature subset (annotated 0.738 by the author).
col = [
    'user_id',
    'app_code',
    'os_version',
    'is_4G',
    'Hour_of_the_day',
    'Day_Of_Week',
]

In [None]:
# Columns singled out as categorical features for the LightGBM model.
Categorical_columns_lgbm = [
    'os_version',
    'user_id',
    'app_code',
    'is_4G',
]

In [None]:
# Wide candidate feature set (106 names): the four base impression columns, every
# look-back-window count, every per-session feature, the most recent item and price,
# and two time features.
# Deliberately left out: last_visited_product_item2-5, all last_visited_product_type*
# and last_visited_product_category*, last_visited_product_price_2-5, Part_of_the_day,
# Weekend and the normalized click-ratio features.
# The double underscore in the 12-hour suffix matches the actual column names.
col = (
    ['user_id', 'app_code', 'os_version', 'is_4G']
    + [
        prefix + window_suffix
        for prefix in (
            'site_visit_count', 'category1_seen_count', 'category2_seen_count',
            'category3_seen_count', 'product_type_seen_count', 'product_item_seen_count',
        )
        for window_suffix in (
            '_last_3_hours', '_last_6_hours', '__last_12_hours', '_last_1_day',
            '_last_2_days', '_last_3_days', '_last_1_week', '_last_1_month',
        )
    ]
    + [
        prefix + str(session_rank)
        for prefix in (
            'category1_seen_count_with_session',
            'category2_seen_count_with_session',
            'category3_seen_count_with_session',
            'product_type_seen_count_with_session',
            'item_id_seen_count_with_session',
            'average_price_of_product_seen_count_with_session',
            'device_type_seen_count_with_session',
            'hour_of_the_day_seen_count_with_session',
            'part_of_the_day_seen_count_with_session',
            'time_spent_surfing_each_session_with_session',
        )
        for session_rank in range(1, 6)
    ]
    + [
        'last_visited_product_item1',
        'last_visited_product_price_1',
        'Hour_of_the_day',
        'Day_Of_Week',
    ]
)

In [None]:
# Feature subset selected from the engineered columns; the LightGBM cells
# below index every frame with `[col]` before training or predicting.
col = [
    'user_id',
    'app_code',
    'os_version',
    'is_4G',
    'site_visit_count_last_1_week',
    'product_type_seen_count_last_3_days',
    'product_type_seen_count_with_session1',
    'item_id_seen_count_with_session1',
    'average_price_of_product_seen_count_with_session1',
    'hour_of_the_day_seen_count_with_session1',
    'time_spent_surfing_each_session_with_session1',
    'last_visited_product_price_1',
    'last_visited_product_item1',
]

In [None]:
# Wider list of columns to treat as categorical.
# 'Part_of_the_day' and the last_visited_product_* columns (item, type,
# category1-3; positions 1-5) are not included.
# NOTE: the next cell rebinds Categorical_columns_lgbm to a shorter list, so
# this value is overwritten before the training cell reads it.
Categorical_columns_lgbm = [
    'device_type_seen_count_with_session2',
    'part_of_the_day_seen_count_with_session4',
    'device_type_seen_count_with_session1',
    'device_type_seen_count_with_session5',
    'device_type_seen_count_with_session3',
    'os_version',
    'user_id',
    'app_code',
    'is_4G',
    'part_of_the_day_seen_count_with_session3',
    'part_of_the_day_seen_count_with_session1',
    'part_of_the_day_seen_count_with_session5',
    'part_of_the_day_seen_count_with_session2',
    'device_type_seen_count_with_session4',
]

In [None]:
# Columns handed to LightGBM as `categorical_feature` in the training cell.
# This rebinding replaces the wider list defined in the previous cell.
Categorical_columns_lgbm = ['user_id', 'app_code', 'os_version', 'is_4G']

In [None]:
# Wrap the training split as a LightGBM Dataset, restricted to the selected
# feature subset `col`, with the declared categorical columns.
train_data = lightgbm.Dataset(
    X_train[col],
    label=y_train,
    categorical_feature=Categorical_columns_lgbm,
)
# Validation split used for early stopping. `reference=train_data` makes the
# alignment with the training Dataset explicit instead of relying on
# lightgbm.train() to attach it.
test_data = lightgbm.Dataset(x_val[col], label=y_val, reference=train_data)

parameters = {
    # 'application' is only an alias of 'objective'; passing both is redundant,
    # so the single canonical key is kept.
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 13,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0,
    'max_depth': 17,
    'lambda': 0.05
    #'scale_pos_weight': 25
}

# The `early_stopping_rounds=` keyword was removed from lightgbm.train() in
# LightGBM 4.0 and raises TypeError there; the early_stopping callback is
# accepted by both older and newer releases.
model = lightgbm.train(
    parameters,
    train_data,
    valid_sets=[test_data],
    num_boost_round=5000,
    callbacks=[lightgbm.early_stopping(stopping_rounds=100)],
)

In [None]:
# ROC AUC on the validation split (the same x_val / y_val that the training
# cell passes as its validation set).
lgbm_pred = model.predict(x_val[col])
print(metrics.roc_auc_score(y_true=y_val, y_score=lgbm_pred))

In [None]:
# ROC AUC on the x_test / y_test1 split.
# NOTE(review): these names differ from the X_test / y_test used in the next
# cell -- confirm both splits are defined earlier in the notebook.
lgbm_pred = model.predict(x_test[col])
print(metrics.roc_auc_score(y_true=y_test1, y_score=lgbm_pred))

In [None]:
# Score the X_test split: AUC on the raw probabilities first, then hard-label
# metrics after thresholding at 0.5.
lgbm_pred = model.predict(X_test[col])
print(metrics.roc_auc_score(y_true=y_test, y_score=lgbm_pred))

# Binarise in place; both masks are taken from the probabilities before any
# value is overwritten.
above_threshold = lgbm_pred > 0.5
at_or_below_threshold = lgbm_pred <= 0.5
lgbm_pred[above_threshold] = 1
lgbm_pred[at_or_below_threshold] = 0

print(metrics.confusion_matrix(y_true=y_test, y_pred=lgbm_pred))
print(metrics.accuracy_score(y_true=y_test, y_pred=lgbm_pred))
print(metrics.classification_report(y_true=y_test, y_pred=lgbm_pred))

In [None]:
def plotImp(model, X, num=20):
    """Draw a horizontal bar chart of the most important model features.

    Parameters
    ----------
    model : trained LightGBM booster
        Must provide ``feature_importance()``; ``feature_name()`` is used
        only when ``X`` does not match the model's feature count.
    X : pandas.DataFrame
        Frame whose column names label the importances. It should hold the
        same columns, in the same order, that the model was trained on.
    num : int, default 20
        Maximum number of features to show, ranked by importance.
    """
    importances = model.feature_importance()
    names = list(X.columns)
    if len(names) != len(importances):
        # zip() would silently truncate and pair importances with the wrong
        # column names when X is wider or narrower than the training frame,
        # so use the names recorded by the model itself instead.
        names = model.feature_name()

    feature_imp = pd.DataFrame({'Value': importances, 'Feature': names})
    top_features = feature_imp.sort_values(by="Value", ascending=False).head(num)

    sns.barplot(x="Value", y="Feature", data=top_features)
    # A single booster is plotted here; nothing is averaged over folds.
    plt.title('LightGBM Feature Importance')
    plt.show()

In [None]:
# The model was trained on X_train[col], so label the importances with that
# same column subset rather than with every column of X_train.
plotImp(model, X_train[col], 20)

In [None]:
from sklearn.metrics import roc_curve
# compute true positive rate and false positive rate
# The model was trained on the `col` feature subset, so score exactly those
# columns; passing the full X_test would not match the trained feature set.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, model.predict(X_test[col]))

# plotting them against each other
def plot_roc_curve(false_positive_rate, true_positive_rate, label=None):
    """Plot TPR against FPR on the current matplotlib figure.

    Parameters
    ----------
    false_positive_rate, true_positive_rate : array-like
        Matching x / y coordinates of the ROC curve.
    label : str, optional
        Label attached to the ROC line.
    """
    plt.plot(false_positive_rate, true_positive_rate, linewidth=2, label=label)
    # Red diagonal from (0, 0) to (1, 1) as the chance-level reference line.
    plt.plot([0, 1], [0, 1], 'r', linewidth=4)
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (FPR)', fontsize=16)
    plt.ylabel('True Positive Rate (TPR)', fontsize=16)

plt.figure(figsize=(7,6))
plot_roc_curve(false_positive_rate, true_positive_rate)
plt.show()

In [None]:
# Load the sample submission file; its `is_click` column is filled in below.
sub = pd.read_csv(
    filepath_or_buffer="/content/drive/My Drive/Hackathon/submission/sample_submission.csv"
)

In [None]:
# Replace each categorical column of `test` with the output of the encoder
# stored under that column's name in `d`.
# NOTE(review): presumably `d` holds encoders fitted on the training data --
# verify against the cell that builds it.
# This overwrites `test` in place, so re-running the cell feeds the already
# transformed values back into `transform`.
test[Categorical_columns] = test[Categorical_columns].apply(
    lambda column: d[column.name].transform(column)
)

In [None]:
# Store the model's predictions for the test frame in the submission column.
sub['is_click'] = model.predict(data=test[col])

In [None]:
# Display the raw predictions for the test frame as the cell output.
model.predict(data=test[col])

In [None]:
# Write the filled-in submission to Drive without the index column.
sub.to_csv(
    path_or_buf="/content/drive/My Drive/Hackathon/sample_submission.csv",
    index=False,
)