In [54]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split,StratifiedShuffleSplit,StratifiedKFold

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import lightgbm as lgb
import os
from sklearn.preprocessing import KBinsDiscretizer
# Print the full path of every file found below the Kaggle input directory
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score        
        
import lightgbm as lgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [55]:
def miss_values_imputer(groupby_cols,col,df):
    """Fill missing values of ``col`` with its group mean and add a binned copy.

    Parameters
    ----------
    groupby_cols : list of str
        Columns whose value combinations define the groups.
    col : str
        Name of the numeric column to impute.
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The merged frame with three changes compared to ``df``:

        * ``col`` - NaNs replaced by the mean of ``col`` within the row's group
          (stays NaN when the whole group is missing),
        * ``<col>_mean`` - new column holding that group mean for every row,
        * ``<col>_cat`` - new column with the bin number from
          ``pd.qcut(..., 2, labels=False)``, i.e. 0 for the lower half and
          1 for the upper half of the imputed values.
    """
    mean_col = col+'_mean'

    # One row per group with the group's mean of `col`
    group_mean = df.groupby(groupby_cols)[col].mean().rename(mean_col).reset_index()

    # Left merge keeps every row of `df` in its original order
    df = pd.merge(df,group_mean,on=groupby_cols,how='left')

    # Vectorized replacement of the former row-by-row `df.at` loop
    df[col] = df[col].fillna(df[mean_col])

    # qcut
    df[col+'_cat'] = pd.qcut(df[col], 2, labels=False)
    return df

In [56]:
# Locations of the competition files inside the Kaggle input directory
TRAIN_CSV = '/kaggle/input/ui-preference/train.csv'
TEST_CSV = '/kaggle/input/ui-preference/test.csv'

train, test = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

In [57]:
# Preview the first rows of the training data
train.head()

In [58]:
# Missing values in test, as a percentage of the number of test rows
# (the denominator must be the test row count, not the train row count)
test.isnull().sum()*100/test.shape[0]

In [59]:
# Share of missing values per training column, in percent of the train rows
train.isna().sum() * 100 / len(train)

In [60]:
# Class balance: number of training rows for each value of the target
# column 'Preferred_Theme'
train['Preferred_Theme'].value_counts()

In [61]:
# Combined dataset: flag the origin of every row (1 = train, 0 = test),
# then stack both frames under a fresh 0..n-1 index
train = train.assign(train_or_test=1)
test = test.assign(train_or_test=0)
df = pd.concat([train, test], ignore_index=True)

# Numeric cols: names of all int64 / float64 columns of the combined frame
numeric_frame = df.select_dtypes(include=['int64','float64'])
num_cols = list(set(numeric_frame.columns))

In [62]:
# Columns whose documented range is (0,10)
num_greater = ['Men’s_Clothing',
 'Women’s_Clothing',
 'Beauty',
 'No_of_orders_placed',
 'Kid’s_Clothing',
 'Electronics',
 'Home_&_Living']

# All three fixes use Series.mask, which replaces the entries where the
# condition is True with NaN in one vectorized step. This replaces the former
# per-row `df.iloc[idx][col]` lookups and lets pandas handle the dtype change
# when NaN is written into an integer column.

# Fixing outliers having value greater than 10 when range is (0,10)
for col in num_greater:
    df[col] = df[col].mask(df[col] > 10)

# Fixing outliers having value less than 0 (applied to every numeric column)
for col in num_cols:
    df[col] = df[col].mask(df[col] < 0)

# Fixing outliers having value like ?
col = 'City'
df[col] = df[col].mask(df[col] == '?')

# Fixing datetime outliers and generating new features from date time

# Parse both date columns; values that cannot be parsed become NaT
for date_col in ('Sign_up_date', 'Last_order_placed_date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Missing / unparseable sign-up dates get a fixed fallback date
col = 'Sign_up_date'
miss_date_idx = df.index[df[col].isnull()]
df.loc[miss_date_idx, col] = pd.to_datetime('2017-06-01')

signup_dates = df['Sign_up_date']
last_order_dates = df['Last_order_placed_date']

# Calendar parts of both dates
df['Sign_up_month'] = signup_dates.dt.month
df['Sign_up_year'] = signup_dates.dt.year
df['Last_order_month'] = last_order_dates.dt.month
df['Last_order_year'] = last_order_dates.dt.year

# Duration feature which tells about the time between sign up date and last order date in unit of 6 months
duration = last_order_dates - signup_dates
df['duration'] = duration.dt.days/180

In [63]:
# Percentage of missing values per column of the combined frame
df.isna().sum() * 100 / len(df)

In [64]:
# Computing missing value of City: use the most frequent City of the same State
col = 'City'

# Series.mode() returns several values on ties and nothing for a state whose
# cities are all missing, so keep the first mode and fall back to NaN.
cityDict = {
    state: (cities.mode().iat[0] if cities.notna().any() else np.nan)
    for state, cities in df.groupby('State')[col]
}

miss_city_idx = df.index[df[col].isnull()]
# .map() leaves NaN for rows whose State is missing or unknown to cityDict,
# where a direct dictionary lookup would raise KeyError.
df.loc[miss_city_idx, col] = df.loc[miss_city_idx, 'State'].map(cityDict)

# Computing missing values of the numeric columns & addition of new features.
# Each step is (column to impute, columns used for grouping). The order
# matters: later steps group by the '<col>_cat' columns created by earlier ones.
imputation_steps = [
    ('Age', ['Gender','State']),
    ('Men’s_Clothing', ['Gender','Age_cat']),
    ('Women’s_Clothing', ['Gender','Age_cat','Men’s_Clothing_cat']),
    ('Beauty', ['Gender','Age_cat','Men’s_Clothing_cat','Women’s_Clothing_cat']),
    ('Kid’s_Clothing', ['Gender','Age_cat','Men’s_Clothing_cat','Women’s_Clothing_cat']),
    ('Electronics', ['Gender','Age_cat','State']),
    ('Home_&_Living', ['Gender','Age_cat','State']),
    ('No_of_orders_placed', ['Age_cat','Men’s_Clothing_cat','Women’s_Clothing_cat','Beauty_cat', 'Electronics_cat', 'Home_&_Living_cat','Kid’s_Clothing_cat']),
]

for col, groupby_cols in imputation_steps:
    df = miss_values_imputer(groupby_cols,col,df)

# Imputed order counts are group means; round them up to whole orders
df['No_of_orders_placed'] = np.ceil(df['No_of_orders_placed'])



In [65]:
# Missing values check: percentage of missing entries per column

df.isna().sum() * 100 / len(df)

In [66]:
# Preview the first rows of the combined frame
df.head()

In [67]:
# Train and test split 

# Columns that are never used as model features
helper_cols = ['train_or_test','Sign_up_date','Last_order_placed_date']

# Training rows keep the target but lose the customer identifier
is_train_row = df['train_or_test']==1
train = df.loc[is_train_row].drop(columns=helper_cols+['CustomerID'])

# Test rows: remember the identifiers for the submission, then drop the
# identifier and the target column
is_test_row = df['train_or_test']==0
Ids = df.loc[is_test_row,'CustomerID']
test = df.loc[is_test_row].drop(columns=helper_cols+['Preferred_Theme','CustomerID'])

In [68]:
# X & y for modelling

# Label encoding of the target and its inverse (used to decode predictions)
y_map = {'New_UI':1,'Old_UI':0}
y_inv_map = {code:label for label,code in y_map.items()}

X = train.drop(columns=['Preferred_Theme'])
Y = train['Preferred_Theme'].map(y_map)


In [69]:
# Category columns computation for catboost model:
# every object-typed column of the combined frame that is not excluded below

non_feature_cols = ['Sign_up_date','Last_order_placed_date','train_or_test','CustomerID','Preferred_Theme']
object_cols = set(df.select_dtypes(include = 'object').columns)
cat_cols = list(object_cols.difference(set(non_feature_cols)))

In [70]:
# Modeling with K fold validation, ensemble and weights for each model 
#
# For every stratified fold, three CatBoost classifiers are trained. Their
# validation AUCs are turned into blending weights, the weighted validation
# prediction is stored as out-of-fold prediction and the weighted test
# prediction is averaged over the folds.

n_splits  = 5

# (learning_rate, random_state) of each CatBoost member of the ensemble
model_params = [(0.03, 41), (0.04, 42), (0.05, 43)]

oof_pred  = np.zeros((len(train),))
y_pred_final   = np.zeros((len(test),))

kf=StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=17042022)

for i,(train_idx,val_idx) in enumerate(kf.split(X,Y)):

    X_train, y_train = X.iloc[train_idx,:], Y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx, :], Y.iloc[val_idx]

    print('\nFold: {}\n'.format(i+1))

    test_roc_score = []   # validation AUC of each model in this fold
    val_preds = []        # validation probabilities of each model
    test_preds = []       # test probabilities of each model

    for model_no,(learning_rate,seed) in enumerate(model_params, start=1):
        model = CatBoostClassifier(learning_rate = learning_rate,random_state=seed, custom_metric=['AUC'])
        model.fit(X_train,y_train,cat_features=cat_cols,eval_set=(X_val, y_val),early_stopping_rounds=80,verbose=100)

        val_preds.append(model.predict_proba(X_val)[:,1])
        test_preds.append(model.predict_proba(test)[:,1])

        test_roc_score.append(roc_auc_score(y_val, val_preds[-1]))
        print("Test ROC AUC for model %d: %.4f"%(model_no, test_roc_score[-1]))

    # Softmax over the relative AUCs. AUC is a higher-is-better metric, so the
    # exponent must be positive: a negative exponent would hand the largest
    # weight to the weakest model. Subtracting the maximum keeps exp() within
    # range and does not change the normalized weights.
    rel_score = np.asarray(test_roc_score, dtype=float)
    rel_score = rel_score/rel_score.sum()
    wghts = np.exp(1000*(rel_score-rel_score.max()))
    wghts = wghts/wghts.sum()

    val_pred   = sum(w*p for w,p in zip(wghts,val_preds))
    print('validation roc_auc_score fold-',i+1,': ',roc_auc_score(y_val, val_pred))

    oof_pred[val_idx] = val_pred
    # Each fold contributes 1/n_splits of the final test prediction
    y_pred_final += sum(w*p for w,p in zip(wghts,test_preds))/(n_splits)

    print('\n')

print('OOF ROC_AUC_Score:- ',(roc_auc_score(Y,oof_pred)))

In [71]:
# Final prediction

# Probabilities of at most 0.5 become class 0, everything else class 1
y_final = [0 if prob<=0.5 else 1 for prob in y_pred_final]

In [72]:
# Mapping target value: translate the 0/1 codes back to the theme names
y_final = [y_inv_map.get(code) for code in y_final]

In [73]:
# Submission dataframe: one row per test customer with the predicted theme

sub = pd.DataFrame({'CustomerID':Ids}).assign(Preferred_Theme=y_final)
sub.to_csv("uipref.csv", index = False)
sub