In [None]:
import pandas as pd
import seaborn as sns
from datetime import datetime 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(rc={'figure.figsize':(16.7,6.27)})

In [None]:
# Load the labelled training set, then show its dimensions and first rows.
train = pd.read_csv("../data/train/train.csv")
print(train.shape)
train.head()

In [None]:
# Load the view log, then show its dimensions and first rows.
logs = pd.read_csv("../data/train/view_log.csv")
print(logs.shape)
logs.head()

In [None]:
# Load the item table, then show its dimensions and first rows.
items = pd.read_csv("../data/train/item_data.csv")
print(items.shape)
items.head()

In [None]:
# Load the test set (the rows to be scored) and show its first rows.
test = pd.read_csv("../data/test/test.csv")
test.head()

In [None]:
def unique(df):
    """Print the row count of ``df`` and the number of distinct values per column.

    Missing values count as one distinct value, matching ``len(Series.unique())``.
    Column labels are formatted rather than concatenated, so non-string labels
    (e.g. integer column names) no longer raise ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    print("Number of rows in df: ", df.shape[0])
    for col in df.columns:
        print("Number of unique values in {}: ".format(col), df[col].nunique(dropna=False))
def missing(df):
    """Print the number of missing (null/NaN) values in every column of ``df``.

    Column labels are formatted rather than concatenated, so non-string labels
    (e.g. integer column names) no longer raise ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        One line per column is written to stdout.
    """
    for col in df.columns:
        print("Number of missing values in {}: ".format(col), df[col].isnull().sum())
def values(df, cols):
    """Print the frequency table of each requested column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns.
    cols : iterable
        Column labels to report, in the order they should be printed.

    Returns
    -------
    None
        For every column a header line and its ``value_counts()`` go to stdout.
    """
    for column_name in cols:
        print("Value Counts for the column :", column_name)
        frequency_table = df[column_name].value_counts()
        print(frequency_table)

## 1. EDA
### 1.1 Exploring train.csv
#### 1.1.1 Unique values and variable types

In [None]:
unique(train)

#### 1.1.2 Missing values check

In [None]:
missing(train)

1. impression_id: Unique identifier for each row
2. impression_time: date_time variable
3. user_id: Identifier for each user
4. app_code: Identifier for different websites/applications. Should not be treated as a numerical value as it has no inherent numerical ranking (nominal variable)
5. os_version: Has inherent numerical ranking (ordinal variable)
6. is_4G: Binary variable
7. is_click: Target variable (binary) denoting whether or not the user clicked on the ad

#### 1.1.3 Distribution of features and corresponding encoding scheme to be used

In [None]:
# NOTE(review): `sns.distplot` is deprecated since seaborn 0.11 (replacements: `histplot` / `displot`);
# migrate once the installed seaborn version is confirmed.
# app_code is a nominal identifier, so this plot shows how the codes are spread, not a numeric distribution.
sns.distplot(train.app_code)

In [None]:
# sns.distplot(train.os_version)
print(train.os_version.value_counts())

In [None]:
# NOTE(review): `sns.distplot` is deprecated since seaborn 0.11 (replacements: `histplot` / `displot`);
# migrate once the installed seaborn version is confirmed. is_4G is a binary flag.
sns.distplot(train.is_4G)

In [None]:
# NOTE(review): `sns.distplot` is deprecated since seaborn 0.11 (replacements: `histplot` / `displot`);
# migrate once the installed seaborn version is confirmed.
# is_click is the binary target; the printed counts below give the exact class sizes.
sns.distplot(train.is_click)
print(train.is_click.value_counts())

In [None]:
# Class sizes of the target, computed once; label 1 = clicked, label 0 = ignored.
click_counts = train.is_click.value_counts()
clicked_to_ignored = click_counts[1] / click_counts[0]
print("Ratio between cases where customer actually clicks on the ad against the cases where the ad is ignored", clicked_to_ignored)

1. Target Variable is Imbalanced
2. Need to find a proper encoding scheme for app code
3. is_4G does not require any encoding
4. os_version can be numerically encoded while preserving the ranks
5. User_ids can be clustered and a cluster_id can be used as a feature.

### 1.2  Exploring view_logs.csv
#### 1.2.1 Unique values and variable types

In [None]:
unique(logs)

#### 1.2.2 Missing values check

In [None]:
missing(logs)

In [None]:
values(logs,["device_type"])

In [None]:
logs.head()

1. device_type -> get_dummies()

### 1.3 Exploring item_data.csv

In [None]:
items.head()

In [None]:
unique(items)

In [None]:
missing(items)

In [None]:
# Pass the series explicitly as `x=`: the meaning of the first positional argument differs
# between seaborn releases (`x` in older ones, `data` in newer ones), while the keyword is stable.
sns.boxplot(x=items.category_1)

In [None]:
# Pass the series explicitly as `x=`: the meaning of the first positional argument differs
# between seaborn releases (`x` in older ones, `data` in newer ones), while the keyword is stable.
sns.boxplot(x=items.category_2)

In [None]:
# Pass the series explicitly as `x=`: the meaning of the first positional argument differs
# between seaborn releases (`x` in older ones, `data` in newer ones), while the keyword is stable.
sns.boxplot(x=items.category_3)

In [None]:
# Pass the series explicitly as `x=`: the meaning of the first positional argument differs
# between seaborn releases (`x` in older ones, `data` in newer ones), while the keyword is stable.
sns.boxplot(x=items.item_price)

In [None]:
print("minimum price",items.item_price.min())
print("maximum price",items.item_price.max())

1. The depth for all the three categories varies. These three features may need to be standardized. They will be treated as numerical variables only.
2. The prices for items start from as low as 5 and go up to 1340800. Most of the items belong to the lower-priced brackets.

In [None]:
# NOTE(review): `sns.distplot` is deprecated since seaborn 0.11 (replacements: `histplot` / `displot`);
# migrate once the installed seaborn version is confirmed.
sns.distplot(items.product_type)

## 2. Performing joins

In [None]:
logs.head()

In [None]:
items.head()

In [None]:
# Attach item attributes to every view-log row, matching on the item_id *column* of both frames.
# `DataFrame.join(other, on=...)` matches the caller's column against the *index* of `other`;
# `items` carries a default positional index, so that paired item_id with row numbers instead of ids.
# `merge` keeps a single item_id column, so no duplicate key column has to be dropped afterwards.
# The suffixes keep the previous naming ("" for logs, "_l" for items) should any other column overlap.
logs_items = logs.merge(items, on="item_id", how="left", suffixes=("", "_l"))
logs_items.head()

In [None]:
print(logs_items.shape)
missing(logs_items)

In [None]:
# Size difference: distinct item_ids in item_data.csv minus distinct item_ids in view_log.csv.
print("distinct item_ids, items minus logs:", len(items.item_id.unique())-len(logs.item_id.unique()))
# A size difference cannot tell which ids lack a counterpart, so test membership directly:
# number of distinct item_ids that occur in the view log but are absent from the item table.
logs.loc[~logs.item_id.isin(items.item_id), "item_id"].nunique()

In [None]:
train.head()

This means there are 4092 item_ids in view_logs.csv that have no corresponding id in item_data.csv

## 3. Feature Engineering

### 3.1 Ideas for new features
1. User_id decile based on item prices (Will categorize users based on amount of money spent by the user)
2. User_id decile based on no_of_clicks (Will categorize users based on number of times he/she clicks on the ad )
3. Time spent on each item (items_log)

### 3.2 Encoding Schemes 
How to encode user_id, app_code

In [None]:
def encoder_os(x):
    """Map an os_version label to its ordinal rank.

    ``"old"`` -> 0, ``"intermediate"`` -> 1, and every other value
    (including missing or already-numeric values) -> 2.
    """
    return 0 if x == "old" else (1 if x == "intermediate" else 2)
# Ordinal-encode os_version on both frames.
# Guarded on dtype so the cell can be re-run safely: encoder_os sends anything that is not
# "old"/"intermediate" to 2, so encoding an already-encoded (numeric) column would turn every value into 2.
for frame in (train, test):
    if not pd.api.types.is_numeric_dtype(frame["os_version"]):
        frame["os_version"] = frame["os_version"].apply(encoder_os)

In [None]:
# train = pd.concat([train,pd.get_dummies(train.is_4G)],axis=1)
# test = pd.concat([test,pd.get_dummies(test.is_4G)],axis=1)

### 3.3 Datetime Encoding

In [None]:
def get_time_feats(df, col):
    """Parse a timestamp column and add calendar features derived from it.

    ``df[col]`` is converted to datetime (expected format ``%Y-%m-%d %H:%M:%S``) and
    five columns are added: ``<col>_year``, ``<col>_month``, ``<col>_day``,
    ``<col>_hour`` and ``<col>_dayofweek`` (Monday = 0).

    The components are read through the vectorised ``.dt`` accessor instead of a
    Python-level ``apply`` per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    col : str
        Name of the timestamp column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S')
    stamps = df[col].dt
    for part in ("year", "month", "day", "hour", "dayofweek"):
        df[col + "_" + part] = getattr(stamps, part)
    return df

In [None]:
train = get_time_feats(train,"impression_time")
train.head()

In [None]:
test = get_time_feats(test,"impression_time")
test.head()

In [None]:
logs_items = get_time_feats(logs_items,"server_time")
logs_items.head()

## 4. A few checks

In [None]:
# Find test rows whose user_id never occurs in train.
# `value in series` checks the Series *index* (row labels), not its values, so membership
# has to be tested with `isin` against the user_id values themselves.
unseen_test_users = test.loc[~test.user_id.isin(train.user_id), "user_id"]
total = len(unseen_test_users)  # number of affected test rows
print("test rows whose user_id is absent from train:", total)
# Show a sample of the distinct ids rather than printing one line per row.
unseen_test_users.drop_duplicates().head(20)

In [None]:
def volume_quantile(splits, var, df, ID):
    """Bucket rows by their cumulative share of ``var`` and print a per-bucket summary.

    Rows are sorted ascending by ``var``; ``CUM_VAR`` is the running sum of ``var``
    divided by its total (capped at 1), and ``decile`` is ``ceil(CUM_VAR * splits)``.
    Rows whose cumulative share is still 0 therefore land in bucket 0.

    Parameters
    ----------
    splits : int
        Number of buckets to cut the cumulative share into.
    var : str
        Column whose volume drives the bucketing.
    df : pandas.DataFrame
        Input frame; a sorted copy is worked on, the caller's frame is not modified.
    ID : str
        Column that is counted per bucket in the printed summary.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` with the extra ``CUM_VAR`` and ``decile`` columns.
    """
    ranked = df.sort_values([var])

    # Running share of the total volume, clipped so rounding can never push it above 1.
    ranked['CUM_VAR'] = ranked[var].cumsum() / ranked[var].sum()
    ranked.loc[ranked['CUM_VAR'] > 1, 'CUM_VAR'] = 1

    # Bucket number = cumulative share scaled to `splits`, rounded up.
    ranked['decile'] = np.ceil(ranked['CUM_VAR'] * splits)

    # Per-bucket summary: row count plus sum / max / min of the volume column.
    summary = ranked.groupby('decile').agg({ID: 'count', var: ['sum', 'max', 'min']})
    print(summary.reset_index())

    return ranked

In [None]:
# Total clicks per user. Only `is_click` is aggregated: summing every column would also try to
# sum the datetime column `impression_time`, which pandas >= 2.0 rejects with a TypeError.
user_clicks = train.groupby("user_id")[["is_click"]].sum().sort_values("is_click")
# Prints the per-decile summary; the returned frame carries CUM_VAR and decile columns.
user_clicks = volume_quantile(10, "is_click", user_clicks, "decile")
# Lookup table for later cells: one row per user_id with that user's click total.
temp = user_clicks.loc[:, ["is_click"]].reset_index()
# `train` is intentionally left untouched here. Joining `temp` with `on="user_id"` would match
# user_id against temp's positional index, and dropping the joined columns straight away
# (as this cell used to) leaves the frame exactly as it was.
# NOTE(review): the decile is derived from the target, so attaching it as a feature would need
# an out-of-fold computation to avoid label leakage.
train.head()

In [None]:
# Look up every test user's training click total to see which test users never occur in train.
# The lookup runs on a side frame so `test` keeps exactly its original columns: `test` has no
# `is_click`, so a join-then-drop of "is_click_" would fail with KeyError, and keeping the joined
# column would feed a train-derived value into the model input.
test_user_clicks = test[["user_id"]].merge(temp, on="user_id", how="left")
print("test rows whose user_id has no match in train:", test_user_clicks["is_click"].isnull().sum())
test.head()

In [None]:
missing(temp)

This means there are no users in test_df that are not given in train_df

## 5. Modelling

In [None]:
import xgboost as xgb

In [None]:
# Feature matrix: every training column except the raw timestamp, the target,
# the row identifier and the year component.
train_non_features = ["impression_time", "is_click", "impression_id", "impression_time_year"]
X = train.drop(train_non_features, axis=1)
X.head()

In [None]:
# Target vector, plus the test-side feature matrix built with the same exclusions as X
# (test carries no target column to remove).
y = train.is_click
# test_id = test.impression_id
test_non_features = ["impression_time", "impression_id", "impression_time_year"]
test_df = test.drop(test_non_features, axis=1)
test_df.head()

In [None]:
# Hand the DataFrames to DMatrix directly so feature names are kept; xgboost can then
# detect a column mismatch between train and test instead of silently scoring
# misaligned arrays (which np.array(...) would allow).
dtrain = xgb.DMatrix(X, label=y)
dtest = xgb.DMatrix(test_df)
params = {
    'colsample_bytree': 0.7,
    'learning_rate': 0.03,
    'max_depth': 5,
    'min_child_weight': 4,
    'nthread': -1,
    'objective': 'binary:logistic',
    'verbosity': 0,  # replaces the deprecated 'silent' flag
    'subsample': 0.7,
    'eval_metric': 'auc',
}
# The number of boosting rounds comes from `num_round` alone: `n_estimators` belongs to the
# scikit-learn wrapper and is not read by `xgb.train`, so it is no longer listed in params.
num_round = 1000
gbdt = xgb.train(params, dtrain, num_round)
pred = gbdt.predict(dtest)

In [None]:
# Submission frame: one predicted click probability per test impression.
ans = pd.DataFrame({"impression_id": test.impression_id, "is_click": pred})
ans.head()

In [None]:
# Write the submission without the row index (`index` is documented as a boolean flag).
ans.to_csv("../output/xgboost_with_datetime_but_nothing_else_yearDropped.csv", index=False)

In [None]:
logs.loc[logs.session_id==112333]

In [None]:
# Build the mask from logs_items itself; a mask borrowed from `logs` only selects the right
# rows while both frames happen to share an identical index.
logs_items.loc[logs_items.session_id==1065742]

In [None]:
train.loc[train.user_id==4557]

In [None]:
logs.loc[logs.user_id==2]

In [None]:
train.loc[train.user_id==2]