# JanataHack - E-Commerce Analytics ML Hackathon

## Import Packages

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.simplefilter('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier,StackingClassifier,VotingClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier,BaggingClassifier,ExtraTreesClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

## Import Dataset

In [3]:
train = pd.read_csv("train_8wry4cB.csv")
test = pd.read_csv("test_Yix80N0.csv")
sample = pd.read_csv("sample_submission_opxHi4g.csv")

In [4]:
print(train.shape,test.shape)

(10500, 5) (4500, 4)


In [5]:
train.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male


In [6]:
test.head()

Unnamed: 0,session_id,startTime,endTime,ProductList
0,u12112,08/12/14 13:36,08/12/14 13:36,A00002/B00003/C00006/D19956/
1,u19725,19/12/14 13:52,19/12/14 13:52,A00002/B00005/C00067/D02026/
2,u11795,01/12/14 10:44,01/12/14 10:44,A00002/B00002/C00004/D12538/
3,u22639,08/12/14 20:19,08/12/14 20:22,A00002/B00003/C00079/D22781/;A00002/B00003/C00...
4,u18034,15/12/14 19:33,15/12/14 19:33,A00002/B00001/C00010/D23419/


In [7]:
train['gender'].value_counts()

female    8192
male      2308
Name: gender, dtype: int64

In [11]:
train['ProductList'][0]

'A00002/B00003/C00006/D28435/;A00002/B00003/C00006/D02554/;A00002/B00003/C00006/D28436/;A00002/B00003/C00006/D28437/'

In [12]:
train['ProductList'][1]

'A00001/B00009/C00031/D29404/;A00001/B00009/C00031/D02617/;A00001/B00009/C00031/D29407/;A00001/B00009/C00031/D29410/;A00001/B00009/C00031/D29411/;A00001/B00009/C00031/D25444/;A00001/B00009/C00031/D29418/'

# Note
### From this we can observe that the initial three codes which are starting with A, B, C are same.
### In D the first three digits of the codes are same in most of the records.

### Make features as A code, B code, C, code and D code

For Ex: 
    In 'A00001/B00009/C00031/D29404/
- A00001 as A feature
- B00009 as B feature
- C00031 as C feature
- D294 as D feature

In [13]:
# This is a function get the list of only D codes lists 
def list_items(x):
    items = []
    for i in x.split(';'):
        items.append(i.split('/')[-2])
    return items

In [14]:
# To get the the D code which is most frequent, in case of more than one unique code
def most_frequent(List): 
    return max(set(List), key = List.count)

# This function to split the D function 
def items_new_list(x):
    new_items = []
    for i in x:
        new_items.append(i[:4])
    return most_frequent(new_items)

For Ex: 
   - 'A00001/B00009/C00031/D29404/;
     -  A00001/B00009/C00031/D02617/;
     -  A00001/B00009/C00031/D29407/;
     -  A00001/B00009/C00031/D29410/;
     -  A00001/B00009/C00031/D29411/;
     -  A00001/B00009/C00031/D25444/;
     -  A00001/B00009/C00031/D29418/'
      
      In this D codes are [ D294, D026, D294, D294, D294, D254, D294]
      most frequent one is selected which is D294 as D feature

# Feature Engineering

In [15]:
# Function for preprocessing the features and to create new features
def preprocessing_data(data):
    
    # No of products in the Productlist 
    data['No_of_products'] = data['ProductList'].apply(lambda x: len(x.split(';')))
    
    # A,B,C features which r the codes 
    data['A'] = data['ProductList'].apply(lambda x: x.split(';')[0].split('/')[0])
    data['B'] = data['ProductList'].apply(lambda x: x.split(';')[0].split('/')[1])
    data['C'] = data['ProductList'].apply(lambda x: x.split(';')[0].split('/')[2])
    
    # start date without time
    data['sess_date'] = data['startTime'].apply(lambda x: x.split(' ')[0])
    
    # start date and end date without time
    data['sess_start_date'] = data['startTime'].apply(lambda x: x.split(' ')[0])
    data['sess_end_date'] = data['endTime'].apply(lambda x: x.split(' ')[0])
    
    # start and end time only
    data['sess_start_time'] = data['startTime'].apply(lambda x: x.split(' ')[1])
    data['sess_end_time'] = data['endTime'].apply(lambda x: x.split(' ')[1])
    
    data['sess_temp'] = data['sess_start_time'].apply(lambda x: time.strptime(x, "%H:%M"))
    
    data['sess_start_date'] = data['sess_start_date'].apply(lambda x: x.split('/')[0]+'/'+x.split('/')[1]+'/'+'20'+x.split('/')[2])
    data['sess_end_date'] = data['sess_end_date'].apply(lambda x: x.split('/')[0]+'/'+x.split('/')[1]+'/'+'20'+x.split('/')[2])
    
    # start day,month,year features
    data['sess_day'] = data['sess_date'].apply(lambda x: int(x.split('/')[0]))
    data['sess_month'] = data['sess_date'].apply(lambda x: int(x.split('/')[1]))
    data['sess_year']  = data['sess_date'].apply(lambda x: int(x.split('/')[2]))
    
    data['sess_start_time'] = data[['sess_start_date','sess_start_time']].apply(lambda x: x[0]+" "+x[1],axis=1)
    data['sess_end_time'] = data[['sess_end_date','sess_end_time']].apply(lambda x: x[0]+" "+x[1],axis=1)
    
    #data['sess_start_time'] = data['sess_start_time'].apply(lambda x: time.strptime(x, "%H:%M"))
    #data['sess_end_time'] = data['sess_end_time'].apply(lambda x: time.strptime(x, "%H:%M"))
    
    data['sess_start_time'] = pd.to_datetime(data['sess_start_time'], format="%d/%m/%Y %H:%M")
    data['sess_end_time'] = pd.to_datetime(data['sess_end_time'], format="%d/%m/%Y %H:%M")
    
    # Difference in mins between the end date and start date
    data['sess_mins'] = data[['sess_start_time','sess_end_time']].apply(lambda x: int(divmod((x[1]-x[0]).total_seconds() , 60)[0] ),axis=1 )
    data['sess_mins'] = data['sess_mins'].apply(lambda x: 100 if(x>28) else x)
    
    # start date hour feature
    data['sess_hour'] = data['sess_temp'].apply(lambda x: x.tm_hour)
    data.drop(['sess_temp'],axis=1,inplace=True)
    
    # D product list 
    data['sess_items']=data['ProductList'].apply(lambda x: list_items(x))
    
    # D code feature which is explained above.
    data['D'] = data['sess_items'].apply(lambda x: items_new_list(x))
    
    return data


In [20]:
dtrain = train.copy()
dtest = test.copy()

In [21]:
# Apply the preprocessing
dtrain = preprocessing_data(dtrain)
dtest = preprocessing_data(dtest)

In [22]:
dtrain.shape,dtest.shape

((10500, 21), (4500, 20))

In [23]:
dtrain.columns

Index(['session_id', 'startTime', 'endTime', 'ProductList', 'gender',
       'No_of_products', 'A', 'B', 'C', 'sess_date', 'sess_start_date',
       'sess_end_date', 'sess_start_time', 'sess_end_time', 'sess_day',
       'sess_month', 'sess_year', 'sess_mins', 'sess_hour', 'sess_items', 'D'],
      dtype='object')

# Features

In [25]:
features = list(set(dtrain.columns)-set(['startTime','endTime','ProductList','gender',
                                        'sess_start_time','sess_end_time','sess_items','sess_year','sess_day','sess_start_date','sess_mins']))
target = 'gender'

features

['session_id',
 'No_of_products',
 'sess_end_date',
 'D',
 'sess_date',
 'B',
 'sess_month',
 'C',
 'sess_hour',
 'A']

In [26]:
dtrain[features].dtypes

session_id        object
No_of_products     int64
sess_end_date     object
D                 object
sess_date         object
B                 object
sess_month         int64
C                 object
sess_hour          int64
A                 object
dtype: object

# Label Encoding

In [27]:
cat_feat  = ['A','B','C','D','sess_date','sess_start_date','sess_end_date']

le = LabelEncoder()
le_target = LabelEncoder()
le_sess = LabelEncoder()
le_classes = {}

dtrain[target] = le_target.fit_transform(dtrain[target])
le_classes[target] =  le_target

df = pd.concat([dtrain,dtest])

df['session_id'] = le_sess.fit_transform(df['session_id'])

for i in cat_feat:
    df[i] = le.fit_transform(df[i])
    le_classes[i] = le

dtrain = df[df[target].notnull()]
dtest =  df[df[target].isnull()]

del df

In [29]:
def baseliner(train, features, target, cv=3, metric='accuracy'):
    """
    Function for baselining Models which return CV Score, Train Score, Valid Score
    """
    print("Baseliner Models\n")
    eval_dict = {}
    models = [lgb.LGBMClassifier(), xgb.XGBClassifier(), cat.CatBoostClassifier(verbose=0), GradientBoostingClassifier(), LogisticRegression(), 
              RandomForestClassifier(), DecisionTreeClassifier(), AdaBoostClassifier(),ExtraTreeClassifier(),ExtraTreesClassifier(),
              KNeighborsClassifier(),BaggingClassifier()
             ]
    print("Model Name \t |   CV")
    print("--" * 50)

    for index, model in enumerate(models, 0):
        model_name = str(model).split("(")[0]
        eval_dict[model_name] = {}

        results = cross_val_score(model, train[features], train[target], cv=cv, scoring=metric)
        eval_dict[model_name]['cv'] = results.mean()

        print("%s \t | %.4f \t" % (
            model_name[:12], eval_dict[model_name]['cv']))

In [30]:
baseliner(dtrain,features,target)

Baseliner Models

Model Name 	 |   CV
----------------------------------------------------------------------------------------------------
LGBMClassifi 	 | 0.8910 	
XGBClassifie 	 | 0.9058 	
<catboost.co 	 | 0.8877 	
GradientBoos 	 | 0.8847 	
LogisticRegr 	 | 0.7781 	
RandomForest 	 | 0.8926 	
DecisionTree 	 | 0.8541 	
AdaBoostClas 	 | 0.8767 	
ExtraTreeCla 	 | 0.8140 	
ExtraTreesCl 	 | 0.8824 	
KNeighborsCl 	 | 0.8173 	
BaggingClass 	 | 0.8885 	


In [31]:
def fit_model(model,dtrain,features,target,dtest):
    model.fit(dtrain[features],dtrain[target])
    y_pred_test = model.predict(dtest[features])
    dtest['gender'] = y_pred_test
    dtest['gender'] = dtest['gender'].astype(int)
    return dtest

In [32]:
model = xgb.XGBClassifier(random_state=7)
dtest = fit_model(model,dtrain,features,target,dtest)

dtest['gender'] = le_target.inverse_transform(dtest['gender'])
dtest['session_id'] = le_sess.inverse_transform(dtest['session_id'])

dtest[['session_id','gender']].to_csv("submissions/xgb_model_final_submisison.csv",index=False)

In [33]:
# Final Submission
dtest[['session_id','gender']].head()

Unnamed: 0,session_id,gender
0,u12112,female
1,u19725,female
2,u11795,female
3,u22639,male
4,u18034,male


## Few Observations:
1. Ensembling approaching did not improve the score.
2. Stacking the models did not improve the score much, but it generalised the results of the models.