In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fastai.tabular import *
pd.set_option('display.max_columns', None)

  import pandas.util.testing as tm


In [2]:
# Plot styling: matplotlib defaults shared by every figure in this notebook.
from matplotlib import rcParams

# ColorBrewer2 "Dark2" qualitative palette, as RGB triples.
dark2_colors = [
    (0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
    (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
    (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
    (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
    (0.4, 0.6509803921568628, 0.11764705882352941),
    (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
    (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
]

# The palette is not installed as the colour cycle; only its first entry is
# used, as the default patch fill colour.
rcParams.update({
    'figure.figsize': (10, 4),
    'figure.dpi': 150,
    'lines.linewidth': 2,
    'font.size': 8,
    'patch.edgecolor': 'white',
    'patch.facecolor': dark2_colors[0],
    'font.family': 'StixGeneral',
    'axes.grid': True,
    'axes.facecolor': '#eeeeee',
})

In [3]:
# Read the training set, the test set and the sample submission from the
# working directory, then show their shapes.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_8wry4cB.csv',
        'test_Yix80N0.csv',
        'sample_submission_opxHi4g.csv',
    )
)
train.shape, test.shape, sample.shape

((10500, 5), (4500, 4), (4500, 2))

In [4]:
# Stack train on top of test (fresh 0..n-1 index) so every engineered
# feature is computed the same way for both.
data = pd.concat([train, test], ignore_index=True)

# Number of ';'-separated entries in each ProductList string.
data['total_products'] = data['ProductList'].map(lambda products: len(products.split(';')))

# Both timestamps use the same day/month/two-digit-year layout.
for ts_col in ('startTime', 'endTime'):
    data[ts_col] = pd.to_datetime(data[ts_col], format='%d/%m/%y %H:%M')

# Session length in minutes; the +1 in the ratio keeps zero-length sessions
# from dividing by zero.
elapsed = data['endTime'] - data['startTime']
data['duration'] = elapsed.dt.total_seconds() / 60
data['product_per_time'] = data['total_products'] / (data['duration'] + 1)

In [5]:
# Summary statistics of the `duration` column over the combined frame.
data['duration'].describe()

count    15000.000000
mean       130.356600
std       1880.782561
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max      50752.000000
Name: duration, dtype: float64

In [6]:
def session(x):
    """Return 1 when ``x`` equals 0, and 0 for any other value."""
    return 1 if x == 0 else 0
    
# Flag rows whose duration is exactly zero, then count rows per flag value.
data['session'] = data['duration'].map(session)
data['session'].value_counts()

1    7931
0    7069
Name: session, dtype: int64

In [7]:
def time_features(df):
    """Add calendar and clock features derived from ``startTime`` / ``endTime``.

    The new columns are written into ``df`` itself (the frame is modified in
    place) and the same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns ``startTime`` and ``endTime``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with these columns added, in this order: ``st_date``,
        ``st_month``, ``st_month_start``, ``st_month_end``, ``st_day``,
        ``st_doy``, ``st_week``, ``st_dow``, ``st_hour``, ``st_minute``,
        ``et_hour``, ``et_minute``.
    """
    start = df['startTime']
    end = df['endTime']

    df['st_date'] = start.dt.date

    df['st_month'] = start.dt.month
    df['st_month_start'] = start.dt.is_month_start.astype(int)
    df['st_month_end'] = start.dt.is_month_end.astype(int)

    df['st_day'] = start.dt.day

    df['st_doy'] = start.dt.dayofyear

    # `Series.dt.week` is deprecated and absent from recent pandas releases;
    # `dt.isocalendar().week` is its replacement. Keep the old accessor only
    # as a fallback for pandas versions that predate `isocalendar`.
    if hasattr(start.dt, 'isocalendar'):
        iso_week = start.dt.isocalendar().week
        # isocalendar() yields a nullable unsigned dtype; convert to plain
        # int64, or to float (NaN for missing) when NaT values are present.
        df['st_week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        df['st_week'] = start.dt.week

    df['st_dow'] = start.dt.dayofweek

    df['st_hour'] = start.dt.hour
    df['st_minute'] = start.dt.minute
    df['et_hour'] = end.dt.hour
    df['et_minute'] = end.dt.minute

    return df

In [8]:
# Add the start/end-time derived columns, then check the resulting frame size.
data = time_features(data)
data.shape

(15000, 21)

In [9]:
# For each start-time unit (day of month, week, weekday, hour), give every
# row the summed `total_products` of all rows sharing its value for that unit.
for unit in ('day', 'week', 'dow', 'hour'):
    data['tot_prod_' + unit] = data.groupby(['st_' + unit])['total_products'].transform('sum')

In [10]:
# Frame size after the aggregate columns were added.
data.shape

(15000, 25)

In [11]:
def product_list_1(x):
    """Return the first three '/'-separated fields of the first ';'-separated entry.

    ``x`` is a string such as ``'A/B/C/D/;A/B/C/D/'``. Raises ``IndexError``
    when the first entry has fewer than three '/'-separated fields.
    """
    # A string without ';' splits into a single piece (itself), so one code
    # path covers both single- and multi-entry strings.
    head = x.split(';')[0]
    fields = head.split('/')
    return (fields[0], fields[1], fields[2])

In [12]:
# First three fields of each row's leading product entry, one column per field.
pl1 = data['ProductList'].apply(product_list_1)
df1 = pd.DataFrame(pl1.to_list(), columns=['First_A', 'First_B', 'First_C'])

# Within each column, values occurring fewer than 60 times become 'other'.
df1 = df1.apply(
    lambda col: col.mask(col.map(col.value_counts()) < 60, 'other')
)
df1.head()

Unnamed: 0,First_A,First_B,First_C
0,A00002,B00003,C00006
1,A00001,B00009,other
2,A00002,B00001,C00020
3,A00002,B00004,C00018
4,A00001,B00001,C00012


In [13]:
from collections import Counter
def product_list_2(x):
    """Summarise the first three '/'-separated levels across all entries of ``x``.

    ``x`` holds one or more ';'-separated entries, each a '/'-separated path.
    Returns a 6-tuple: the number of distinct values seen at level 1, 2 and 3,
    followed by the most frequent value at level 1, 2 and 3 (ties go to the
    value that appears first). Raises ``IndexError`` when an entry has fewer
    than three '/'-separated fields.
    """
    tiers = ([], [], [])
    # A string without ';' is simply a one-entry list, so no special case is needed.
    for entry in x.split(';'):
        parts = entry.split('/')
        for tier, code in zip(tiers, (parts[0], parts[1], parts[2])):
            tier.append(code)

    distinct = tuple(len(set(tier)) for tier in tiers)
    most_frequent = tuple(max(tier, key=Counter(tier).get) for tier in tiers)
    return distinct + most_frequent

In [14]:
# Per-row level summary: three distinct-value counts plus three most-frequent codes.
level = data['ProductList'].apply(product_list_2)
df2 = pd.DataFrame(
    level.to_list(),
    columns=['level1', 'level2', 'level3', 'freq_l1', 'freq_l2', 'freq_l3'],
)

# The count columns pass through untouched; in the most-frequent-code columns,
# values occurring fewer than 30 times become 'other'.
df2 = df2.apply(
    lambda col: col
    if col.name in ['level1', 'level2', 'level3']
    else col.mask(col.map(col.value_counts()) < 30, 'other')
)
df2.head()

Unnamed: 0,level1,level2,level3,freq_l1,freq_l2,freq_l3
0,1,1,1,A00002,B00003,C00006
1,1,1,1,A00001,B00009,C00031
2,1,1,1,A00002,B00001,C00020
3,1,1,1,A00002,B00004,C00018
4,1,1,1,A00001,B00001,C00012


In [15]:
# Flatten each row's ProductList into its individual code tokens: drop the ';'
# separators, split on '/', and discard the last piece.
# NOTE(review): discarding the last piece assumes every entry ends with '/'
# (as in the displayed sample rows), which leaves a trailing empty string.
data['plist1'] = data['ProductList'].apply(lambda x: (''.join(x.split(';'))).split('/')[:-1])
plist = data['plist1']
from collections import Counter

# Occurrence count of every token over all rows (built once).
plist_counter = Counter(token for tokens in plist.tolist() for token in tokens)

# Bucket the distinct tokens by leading letter; anything not starting with
# 'A', 'B' or 'C' (including an empty token) goes to `d`.
a, b, c, d = [], [], [], []
buckets = {'A': a, 'B': b, 'C': c}
for token in plist_counter:
    buckets.get(token[:1], d).append(token)

In [16]:
# Number of entries in each of the a / b / c / d lists.
len(a), len(b), len(c), len(d)

(11, 86, 383, 21880)

In [17]:
# Turn each row's token list into a Counter (token -> occurrences in that row).
data['plist1'] = [Counter(tokens) for tokens in data['plist1']]

In [18]:
# Keep a copy of the frame as it is before the extra columns are attached.
temp = data.copy()

# Attach the df1 and df2 columns side by side (axis=1 aligns rows on the index).
data = pd.concat([data, df1, df2], axis=1).reset_index(drop=True)
data.head(3)

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,total_products,duration,product_per_time,session,st_date,st_month,st_month_start,st_month_end,st_day,st_doy,st_week,st_dow,st_hour,st_minute,et_hour,et_minute,tot_prod_day,tot_prod_week,tot_prod_dow,tot_prod_hour,plist1,First_A,First_B,First_C,level1,level2,level3,freq_l1,freq_l2,freq_l3
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,1.0,2.0,0,2014-12-15,12,0,0,15,349,51,0,18,11,18,12,2347,8154,6330,1205,"{'A00002': 4, 'B00003': 4, 'C00006': 4, 'D2843...",A00002,B00003,C00006,1,1,1,A00002,B00003,C00006
1,u10253,2014-12-16 14:35:00,2014-12-16 14:41:00,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,7,6.0,1.0,0,2014-12-16,12,0,0,16,350,51,1,14,35,14,41,2232,8154,5038,2628,"{'A00001': 7, 'B00009': 7, 'C00031': 7, 'D2940...",A00001,B00009,other,1,1,1,A00001,B00009,C00031
2,u19037,2014-12-01 15:58:00,2014-12-01 15:58:00,A00002/B00001/C00020/D16944/,female,1,0.0,1.0,1,2014-12-01,12,1,0,1,335,49,0,15,58,15,58,1202,5207,6330,2461,"{'A00002': 1, 'B00001': 1, 'C00020': 1, 'D1694...",A00002,B00001,C00020,1,1,1,A00002,B00001,C00020


In [19]:
def _codes_in_enough_rows(codes, product_lists, min_rows=30):
    """Return the codes (input order kept) contained in more than ``min_rows`` rows.

    Parameters
    ----------
    codes : iterable of str
        Candidate codes.
    product_lists : pandas.Series of str
        One product-list string per row; a code counts for a row when it
        occurs in that string as a literal substring.
    min_rows : int, default 30
        A code is kept only when strictly more than this many rows contain it.
    """
    # Sum the boolean mask to count matching rows directly.
    # `value_counts().values[1]` is the *smaller* of the True/False counts
    # (value_counts sorts by frequency) and fails when every row matches.
    return [
        code
        for code in codes
        if product_lists.str.contains(code, regex=False).sum() > min_rows
    ]


A = _codes_in_enough_rows(a, data['ProductList'])
B = _codes_in_enough_rows(b, data['ProductList'])
C = _codes_in_enough_rows(c, data['ProductList'])

# D-level codes are a hard-coded list rather than the result of a frequency
# scan over `d`.
D = ['D00205', 'D00266', 'D08940', 'D20430', 'D01034', 'D00232', 'D00297', 'D08480']


In [20]:
# One column per selected code, holding how many times that code occurs in
# the row's token Counter (0 when the code is absent).
for code in A + B + C + D:
    data[code] = data['plist1'].map(lambda counts, code=code: counts.get(code, 0))

data.shape

(15000, 185)

In [21]:
import os

# Output directory for generated files. It is created here (a no-op if it
# already exists) so writing into it later cannot fail on a missing folder.
path = 'Final/'
os.makedirs(path, exist_ok=True)

In [23]:
from sklearn.preprocessing import LabelEncoder

# Replace each session_id with its integer label code.
le = LabelEncoder()
le.fit(data['session_id'])
data['session_id'] = le.transform(data['session_id'])

In [24]:
# The combined frame holds the training rows first, then the test rows.
n_train = train.shape[0]
final_train = data.iloc[:n_train]
final_test = data.iloc[n_train:]

# Raw or non-numeric columns that are not fed to the model (includes the target).
drop_cols = ['startTime', 'endTime', 'ProductList', 'gender', 'st_date', 'plist1']

X = final_train.drop(columns=drop_cols)
# Target encoding: female -> 0, male -> 1.
y = final_train['gender'].replace(['female', 'male'], [0, 1])
test_X = final_test.drop(columns=drop_cols).reset_index(drop=True)
X.shape, test_X.shape

((10500, 179), (4500, 179))

In [25]:
# One-hot encode the remaining non-numeric columns of both frames.
X = pd.get_dummies(X)
test_X = pd.get_dummies(test_X)

# The two frames are encoded independently, so a category present in only one
# of them would give mismatched columns. Force the test frame onto the
# training columns (same set, same order); indicators missing in test are 0.
test_X = test_X.reindex(columns=X.columns, fill_value=0)
X.shape, test_X.shape

((10500, 387), (4500, 387))

In [26]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [81]:
# KFold is imported here explicitly so the cell does not depend on a star
# import or on leftover kernel state.
from sklearn.model_selection import KFold
from xgboost import XGBClassifier

# Per-fold validation metrics.
err_xgb_ac = []
err_xgb_f1 = []
err_xgb_auc = []

# Per-fold predictions on the held-out test set (hard labels / class-1 probabilities).
y_pred_tot_xgb = []
y_pred_tot_xgb_prob = []


fold = KFold(n_splits=5, shuffle=True, random_state=42)
for f, (train_index, test_index) in enumerate(fold.split(X), start=1):
    # KFold.split yields positional row numbers, so select by position
    # (.iloc) instead of by index label.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # NOTE(review): newer xgboost releases take `early_stopping_rounds` on the
    # constructor rather than in fit() - confirm against the installed version.
    # The validation fold drives early stopping and is also the fold scored
    # below, so the reported metrics are somewhat optimistic.
    m=XGBClassifier(n_estimators=1000,random_state=1994,eval_metric='auc',learning_rate=0.1)
    m.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=200,verbose=200)

    y_pred = m.predict(X_test)
    y_pred_prob = m.predict_proba(X_test)[:,1]

    test_pred = m.predict(test_X)
    test_pred_proba = m.predict_proba(test_X)[:,1]

    err_ac = accuracy_score(y_test, y_pred)
    err_f1 = f1_score(y_test, y_pred, average='weighted')
    err_auc = roc_auc_score(y_test, y_pred_prob)

    print("Fold:", f)
    print("Accuracy:", err_ac)
    print('F1 Score:', err_f1)
    print("AUC Score:", err_auc)
    print('\n')

    err_xgb_ac.append(err_ac)
    err_xgb_f1.append(err_f1)
    err_xgb_auc.append(err_auc)

    y_pred_tot_xgb.append(test_pred)
    y_pred_tot_xgb_prob.append(test_pred_proba)
    

[0]	validation_0-auc:0.782935	validation_1-auc:0.780323
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.921665	validation_1-auc:0.883618
[400]	validation_0-auc:0.946555	validation_1-auc:0.896342
[600]	validation_0-auc:0.962226	validation_1-auc:0.905578
[800]	validation_0-auc:0.974212	validation_1-auc:0.91751
[999]	validation_0-auc:0.981472	validation_1-auc:0.922071
Fold: 1
Accuracy: 0.9061904761904762
F1 Score: 0.9018255927259529
AUC Score: 0.9221237640355289


[0]	validation_0-auc:0.7818	validation_1-auc:0.775122
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.927565	validation_1-auc:0.898666
[400]	validation_0-auc:0.950426	validation_1-auc:0.909054
[600]	validation_0-auc:0.963316	validation_1-auc:0.916993
[800]	validat

In [82]:
# Mean accuracy, weighted F1 and AUC across the folds, printed in that order.
for fold_scores in (err_xgb_ac, err_xgb_f1, err_xgb_auc):
    print(np.mean(fold_scores))

0.9032380952380953
0.899361831855311
0.9191131377487451


In [83]:
# Average the per-fold 0/1 test predictions and round to the nearest class.
fold_vote = np.asarray(y_pred_tot_xgb).mean(axis=0)
sample['gender'] = fold_vote.round().astype(int)

# Map the numeric classes back to their labels and write the submission file.
sample['gender'] = sample['gender'].replace([0, 1], ['female', 'male'])
sample.to_csv(path+'Final_Sub.csv', index=False)
sample['gender'].value_counts()

female    3704
male       796
Name: gender, dtype: int64