In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set_style("whitegrid")
# plt.rc("figure", figsize=(12,10))

from sklearn.metrics import accuracy_score, mean_squared_log_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, cross_val_predict
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import GradientBoostingClassifier as GBM
from sklearn.svm import SVC

In [2]:
x_train = pd.read_csv('./data/X_train.csv')
y_train = pd.read_csv('./data/y_train.csv')
test = pd.read_csv('./data/X_test.csv')

len_train = len(x_train)
len_test = len(test)

In [3]:
train = pd.merge(x_train, y_train, on='ID')
train.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,6045,A,Flight,4,3,266,5,high,F,5,1590,0
1,44,F,Ship,3,1,174,2,low,M,44,1556,1
2,7940,F,Road,4,1,154,10,high,M,10,5674,1
3,1596,F,Ship,4,3,158,3,medium,F,27,1207,1
4,4395,A,Flight,5,3,175,3,low,M,7,4833,1


In [4]:
train.rename(columns={'Reached.on.Time_Y.N':'label'}, inplace=True)

### Categorical vs Numerical

##### Categorical

In [5]:
x = pd.concat([x_train, test], axis=0)
x = x[x.columns.difference(['ID'])]

numerical_features = x.select_dtypes(exclude='object').columns
categorical_features = x.select_dtypes(include='object').columns

In [6]:
x_categorical = pd.get_dummies(x[categorical_features])
x.drop(categorical_features, axis=1, inplace=True)

x = pd.concat([x, x_categorical], axis=1)
x

Unnamed: 0,Cost_of_the_Product,Customer_rating,Discount_offered,Prior_purchases,Weight_in_gms,Customer_care_calls_$7,Customer_care_calls_2,Customer_care_calls_3,Customer_care_calls_4,Customer_care_calls_5,...,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Product_importance_high,Product_importance_low,Product_importance_medium,Warehouse_block_A,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,Warehouse_block_F
0,266,3,5,5,1590,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0
1,174,1,44,2,1556,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,1
2,154,1,10,10,5674,0,0,0,1,0,...,1,0,1,0,0,0,0,0,0,1
3,158,3,27,3,1207,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1
4,175,3,7,3,4833,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4396,157,1,31,3,1712,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
4397,139,5,7,2,5536,0,0,1,0,0,...,1,0,0,0,1,0,1,0,0,0
4398,170,1,3,10,5211,0,0,0,1,0,...,1,0,0,0,1,1,0,0,0,0
4399,244,1,1,3,5695,0,0,1,0,0,...,0,1,0,0,1,0,1,0,0,0


##### Numerical

In [7]:
x[numerical_features].describe()

Unnamed: 0,Cost_of_the_Product,Customer_rating,Discount_offered,Prior_purchases,Weight_in_gms
count,10999.0,10999.0,10999.0,10999.0,10999.0
mean,210.196836,2.990545,13.373216,3.567597,3634.016729
std,48.063272,1.413603,16.205527,1.52286,1635.377251
min,96.0,1.0,1.0,2.0,1001.0
25%,169.0,2.0,4.0,3.0,1839.5
50%,214.0,3.0,7.0,3.0,4149.0
75%,251.0,4.0,10.0,4.0,5050.0
max,310.0,5.0,65.0,10.0,7846.0


In [8]:
# Check skewness

from scipy.stats import skew
print(x[numerical_features].apply(skew).apply(abs).sort_values(ascending=False))

Discount_offered       1.798684
Prior_purchases        1.681668
Weight_in_gms          0.249713
Cost_of_the_Product    0.157096
Customer_rating        0.004359
dtype: float64


In [9]:
x[numerical_features] = x[numerical_features].apply(lambda x: np.log1p(x) if abs(skew(x)) > 0.5 else x)
print(x[numerical_features].apply(skew).apply(abs).sort_values(ascending=False))

Prior_purchases        0.666728
Discount_offered       0.532147
Weight_in_gms          0.249713
Cost_of_the_Product    0.157096
Customer_rating        0.004359
dtype: float64


In [10]:
X_train = x[:len_train]
X_test = x[len_train:]

In [24]:
scaler = MinMaxScaler()
scaler.fit(X_train)

X_all_scaled = pd.DataFrame(data=scaler.transform(x), columns = x.columns)

In [25]:
X_train = X_all_scaled[:len_train]
X_test = X_all_scaled[len_train:]

In [26]:
y_train = y_train['Reached.on.Time_Y.N']

KeyError: 'Reached.on.Time_Y.N'

In [28]:
clf = RFC()
score = cross_val_score(clf, X_train, y_train, cv=3 )
print(np.mean(score))

0.6489841801369768


In [29]:
params = {"max_depth": [5, 10,20,30,40], "min_samples_split": [5, 10, 20,30, 40]}
grid = GridSearchCV(estimator=clf, param_grid=params, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 10, 'min_samples_split': 30}
0.6814176258414317


In [30]:
clf = RFC(n_estimators=1000, n_jobs=-1, max_depth = 10, min_samples_split = 5)
score = cross_val_score(clf, X_train, y_train, cv=3)
print(np.mean(score))
clf.fit(X_train, y_train)
res = clf.predict(X_test)

0.6695971998291235


In [31]:
result = pd.DataFrame({'id' : test['ID'],
'label' : res})

y_test = pd.read_csv('./data/test_label/y_test.csv')
y = y_test['Reached.on.Time_Y.N']

print(roc_auc_score(y, res))

0.6989491863595894


In [35]:
(y - res).value_counts()

 0    2957
 1    1158
-1     286
Name: Reached.on.Time_Y.N, dtype: int64