The goal of the problem is to predict whether a passenger was delighted with their overall experience of travelling on the Shinkansen (Bullet Train).

### Problem Description
This problem concerns the Shinkansen (bullet trains) of Japan. The aim is to determine the relative importance of each parameter with regard to its contribution to the passenger travel experience. Provided is a random sample of individuals who travelled on these trains. The on-time performance of the trains, along with the passengers’ information, is published in the CSV file named ‘Traveldata_train’. These passengers were later asked to provide feedback on various parameters related to the travel, along with their overall experience. The collected details are made available in the survey report CSV labelled ‘Surveydata_train’.

In the survey, a passenger was explicitly asked whether they were delighted with their overall travel experience and that is captured in the data of the survey report under the variable labelled ‘Overall_Experience’.

The objective of this exercise is to understand which parameters play an important role in swaying passenger feedback towards a positive scale. You are provided test data containing Travel data and Survey data of passengers. Both the test data and the train data were collected at the same time and belong to the same company.


In [None]:
#Import required libraries

# import os
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# %matplotlib inline
# import seaborn as sns
# from warnings import filterwarnings 
# filterwarnings("ignore")

# from sklearn.model_selection import train_test_split, GridSearchCV

# Machine learning libraries

# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression

# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import GradientBoostingClassifier

# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# from imblearn.over_sampling import SMOTE

# from sklearn.ensemble import VotingClassifier
# from sklearn.svm import SVC

# from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix,roc_curve, roc_auc_score,accuracy_score
#Import required libraries



import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import missingno as msno
import ppscore as pps
import statistics


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, roc_auc_score, roc_curve, make_scorer, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the travel data (on-time performance and passenger information) and
# the survey responses for both the training and the test split.
train_data, train_survey_data, test_data, test_survey_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Traveldata_train.csv',
        'Surveydata_train.csv',
        'Traveldata_test.csv',
        'Surveydata_test.csv',
    )
)

In [None]:
# Left-join the survey answers onto the travel records. No join key is given,
# so pandas joins on every column name the two frames have in common.
train_df = train_data.merge(train_survey_data, how = 'left')

In [None]:
# Same left join for the test split: survey answers onto travel records,
# keyed on whatever column names the two frames share.
test_df = test_data.merge(test_survey_data, how = 'left')

In [None]:
# Keep the test IDs aside: the 'ID' column is dropped from test_df before
# modeling and is needed again when the submission file is assembled.
ID = test_df['ID']

### EDA

In [None]:
def basic_eda(df):
    """Print a quick structural summary of a DataFrame.

    The sections are printed in this order: shape, ``df.info()``,
    missing-value counts (grand total, then per column), number of fully
    duplicated rows, and ``df.describe()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    separator = '.' * 100

    print('SHAPE')
    print(df.shape)
    print(separator)
    print('\nINFO')
    # info() writes its own report and returns None, so it must not be wrapped
    # in print() -- that would append a stray "None" line to the output.
    df.info()
    print(separator)
    print('\nMISSING VALUES')
    missing_per_column = df.isna().sum()
    print('Total Missing:', missing_per_column.sum())
    print(missing_per_column)
    print(separator)
    print('\nDUPLICATES')
    print(df.duplicated().sum())
    print(separator)
    print('\nDESCRIBE')
    print(df.describe())

In [None]:
# Structural summary (shape, info, missing values, duplicates, describe) of the training frame
basic_eda(train_df)

In [None]:
# Same structural summary for the test frame
basic_eda(test_df)

In [None]:
# dropping the unnecessary ID column from the training frame

train_df = train_df.drop(columns = 'ID')

In [None]:
# train_df.dropna(axis = 0, subset = ['Online_boarding', 'Cleanliness', 'Onboard_entertainment', 'Onboardwifi_service',
#                                    'Platform_location', 'Age', 'DepartureDelay_in_Mins', 'Seat_comfort', 'Onlinebooking_Ease',
#                                    'Checkin_service', 'Gender', 'Leg_room', 'Online_support'], inplace = True)

In [None]:
print('Unique values in the dataset:\n')

# For every column: name, number of distinct values, dtype, then the full
# frequency table.
for column in train_df.columns:
    values = train_df[column]
    print('_' * 50)
    print(f'{column.upper()}:', values.nunique(), values.dtype)
    print(values.value_counts())

In [None]:
# replacing the extremely few records of the lowest level in these features
# by clubbing them with the next level up

# column -> (rare level, level it is merged into)
rare_level_merges = {
    'Platform_location': ('very inconvinient', 'Inconvinient'),
    'Onboardwifi_service': ('extremely poor', 'poor'),
    'Online_support': ('extremely poor', 'poor'),
    'Onlinebooking_Ease': ('extremely poor', 'poor'),
    'Onboard_service': ('extremely poor', 'poor'),
    'Checkin_service': ('extremely poor', 'poor'),
    'Cleanliness': ('extremely poor', 'poor'),
    'Online_boarding': ('extremely poor', 'poor'),
}

# Assign the result back instead of calling replace(inplace=True) on
# train_df[col]: mutating a column selection in place is chained assignment,
# which does not update the parent frame once pandas Copy-on-Write is active.
for col, (rare_level, merged_level) in rare_level_merges.items():
    train_df[col] = train_df[col].replace(rare_level, merged_level)

In [None]:
# separating numerical and categorical train data
# Both frames are built from train_df as it is at this point; later cells use
# them mainly as lists of column names (and for plots / counts).

num_train_df = train_df.select_dtypes(exclude = 'object')
cat_train_df = train_df.select_dtypes(include = 'object')

In [None]:
# separating numerical and categorical test data
# NOTE(review): test_df still contains 'ID' here (it is only dropped right
# before modeling), so 'ID' lands in num_test_df if it is stored as a number.

num_test_df = test_df.select_dtypes(exclude = 'object')
cat_test_df = test_df.select_dtypes(include = 'object')

In [None]:
fig, axes = plt.subplots(nrows = 4, ncols = 1, figsize = (16, 20))

# One distribution plot per subplot for the first four non-object columns.
for ax, col in zip(axes, num_train_df.columns[0:4]):
    sns.distplot(num_train_df[col], ax = ax)

plt.tight_layout()
plt.suptitle('Distribution Plot for Numeric Data', y = 1.02)

In [None]:
# Column labels of the non-object part of the training frame
num_train_df.columns

In [None]:
fig, axes = plt.subplots(nrows = 4, ncols = 1, figsize = (16, 8))

# One box plot per subplot for the first four non-object training columns.
for ax, col in zip(axes, num_train_df.columns[0:4]):
    sns.boxplot(num_train_df[col], ax = ax)

plt.tight_layout()

In [None]:
# train_df['DepartureDelay_in_Mins'].sort_values(ascending = False).head(10)

In [None]:
fig, axes = plt.subplots(figsize = (16, 8), nrows = 4, ncols = 1)

# test_df still carries 'ID' when num_test_df is built, whereas the training
# frame has already lost it. Leave the identifier out so the four subplots
# show real features instead of spending one slot on 'ID'.
# errors='ignore' keeps this a no-op when 'ID' is not among the numeric columns.
test_plot_columns = num_test_df.columns.drop('ID', errors = 'ignore')[0:4]

for ax, col in zip(axes, test_plot_columns):
    sns.boxplot(num_test_df[col], ax = ax)

plt.tight_layout()

In [None]:
# outlier analysis (Tukey fences, computed per numeric column)

Q1 = num_train_df.quantile(0.25)
Q3 = num_train_df.quantile(0.75)
IQR = Q3 - Q1

# Regular fences: 1.5 * IQR beyond the quartiles.
lower_range = (Q1 - 1.5 * IQR)
upper_range = (Q3 + 1.5 * IQR)

# Extreme fences: 3 * IQR beyond the quartiles. The upper fence is measured
# from Q3 (it was previously measured from Q1, which placed it too low).
extreme_lower_range = (Q1 - 3 * IQR)
extreme_upper_range = (Q3 + 3 * IQR)

# Number of values outside the regular fences, per column, largest first.
outlier_mask = (num_train_df < lower_range) | (num_train_df > upper_range)
pd.DataFrame(outlier_mask.sum(),
             columns = ['No. of Outliers']).sort_values(by = 'No. of Outliers', ascending = False).T

In [None]:
plt.figure(figsize = (10, 8))

# The survey columns are still strings at this point. DataFrame.corr() raises
# on non-numeric columns in pandas >= 2.0 (older versions dropped them
# silently), so restrict the frame to non-object columns explicitly.
numeric_corr = train_df.select_dtypes(exclude = 'object').corr()

sns.heatmap(numeric_corr, annot = True, annot_kws = {'size': 12}, cmap = 'GnBu')

In [None]:
# missing data: number of NaN values per column of the training frame

train_df.isna().sum()

In [None]:
# missing data imputation - median for numeric and mode for categorical

# Assign the filled column back instead of fillna(inplace=True) on
# train_df[col]: in-place mutation of a column selection is chained assignment
# and does not reach the parent frame once pandas Copy-on-Write is active.
for col in num_train_df.columns:
    train_df[col] = train_df[col].fillna(train_df[col].median())

In [None]:
# Total NaN count in the categorical snapshot (cat_train_df was taken from
# train_df before any imputation was applied)
cat_train_df.isna().sum().sum()

In [None]:
# Fill categorical gaps with each column's most frequent value. The result is
# assigned back rather than using fillna(inplace=True) on train_df[col], which
# is chained assignment and unreliable under pandas Copy-on-Write.
for col in cat_train_df.columns:
    # mode() returns the modes in sorted order; [0] picks the first one.
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])

In [None]:
# Total NaN count left in the training frame after imputation
train_df.isna().sum().sum()

In [None]:
# Number of distinct levels per categorical column, smallest first
cat_train_df.nunique().sort_values()

In [None]:
# Frequency of each 'Baggage_handling' level
train_df['Baggage_handling'].value_counts()

In [None]:
# replace = train_df[['Checkin_service', 'Leg_room', 'Onboard_service', 'Onlinebooking_Ease', 'Online_support',
#             'Onboardwifi_service', 'Cleanliness', 'Platform_location', 'Catering', 'Arrival_time_convenient',
#             'Seat_comfort', 'Onboard_entertainment', 'Online_boarding']]

In [None]:
# manual categorical data encoding

# Ordinal scale shared by the twelve six-level survey rating columns.
rating_scale = {'extremely poor': 0, 'poor': 1, 'need improvement': 2,
                'acceptable': 3, 'good': 4, 'excellent': 5}
rating_columns = ['Checkin_service', 'Leg_room', 'Onboard_service', 'Onlinebooking_Ease',
                  'Online_support', 'Onboardwifi_service', 'Cleanliness', 'Catering',
                  'Arrival_time_convenient', 'Seat_comfort', 'Onboard_entertainment',
                  'Online_boarding']

# column -> {label: code}
encodings = {col: rating_scale for col in rating_columns}
encodings.update({
    'Platform_location': {'very inconvinient': 0, 'Inconvinient': 1, 'need improvement': 2,
                          'manageable': 3, 'Convinient': 4, 'very convinient': 5},
    # Baggage_handling is coded on a five-level scale starting at 'poor' = 0,
    # unlike the six-level columns above where 'poor' = 1.
    'Baggage_handling': {'poor': 0, 'need improvement': 1, 'acceptable': 2,
                         'good': 3, 'excellent': 4},
    'Seat_Class': {'Green Car': 0, 'Ordinary': 1},
    'Travel_Class': {'Eco': 0, 'Business': 1},
    'TypeTravel': {'Business travel': 0, 'Personal Travel': 1},
    'CustomerType': {'Loyal Customer': 0, 'disloyal Customer': 1},
    'Gender': {'Female': 1, 'Male': 0},
})

# replace() (not map()) is used so that any label missing from a mapping is
# left untouched instead of becoming NaN. The result is assigned back because
# replace(inplace=True) on train_df[col] is chained assignment, which does not
# update the frame under pandas Copy-on-Write. infer_objects() turns the
# resulting object column into a numeric dtype explicitly, as newer pandas
# versions no longer downcast automatically after replace().
for col, mapping in encodings.items():
    train_df[col] = train_df[col].replace(mapping).infer_objects()

In [None]:
# Check column dtypes and non-null counts after the encoding step
train_df.info()

In [None]:
#check predictive power score
# pps.predictors scores the columns of train_df as predictors of the target
# column; only the predictor (x), target (y) and score columns are shown.

pps.predictors(train_df, "Overall_Experience")[['x', 'y', 'ppscore']]

In [None]:
# variables with high predictive power score

plt.figure(figsize = (20, 8))
predictors_df = pps.predictors(train_df, "Overall_Experience")
sns.barplot(data = predictors_df.nlargest(11, 'ppscore'), x= "x", y = "ppscore")
# Label the axes explicitly. Printing `plt.xlabel` without calling it only
# shows the function's repr and does nothing to the figure.
plt.xlabel('Feature')
plt.ylabel('Predictive power score')

In [None]:
# variables with low predictive power score

plt.figure(figsize = (20, 8))
predictors_df = pps.predictors(train_df, "Overall_Experience")

# Bar chart of the 15 weakest predictors of the target.
weakest_predictors = predictors_df.nsmallest(15, 'ppscore')
sns.barplot(data = weakest_predictors, x = "x", y = "ppscore")

In [None]:
# Names of the 12 columns with the lowest predictive power score
predictors_df.nsmallest(12, 'ppscore').x

In [None]:
# Column labels of the training frame after encoding
train_df.columns

In [None]:
# train_df['DepartureDelay_in_Mins'] = np.where(train_df['DepartureDelay_in_Mins'] > 1000, train_df['DepartureDelay_in_Mins'].median(), train_df['DepartureDelay_in_Mins'])
# train_df['ArrivalDelay_in_Mins'] = np.where(train_df['ArrivalDelay_in_Mins'] > 1000, train_df['DepartureDelay_in_Mins'].median(), train_df['ArrivalDelay_in_Mins'])

In [None]:
# train_df['Online_Score'] = train_df['Online_support'] + train_df['Onlinebooking_Ease'] + train_df['Online_boarding']
# train_df['Prior_boarding'] = train_df['Baggage_handling'] + train_df['Catering'] + train_df['Checkin_service']+ train_df['Platform_location']
# train_df['Onboard_Score'] = train_df['Onboard_entertainment'] + train_df['Onboard_service'] + train_df['Onboardwifi_service']
# train_df['Comfort_Score'] = train_df['Cleanliness'] + train_df['Leg_room'] + train_df['Seat_comfort']
# train_df['Covered'] = train_df['DepartureDelay_in_Mins'] - train_df['ArrivalDelay_in_Mins']

# Add both delay columns expressed in hours (minutes / 60) as extra features.
train_df['Dep_Delay_HRS'] = train_df['DepartureDelay_in_Mins'].div(60)
train_df['Arr_Delay_HRS'] = train_df['ArrivalDelay_in_Mins'].div(60)

In [None]:
# correlation between all columns of the encoded training frame

corr_matrix = train_df.corr()

plt.figure(figsize = (24, 24))
sns.heatmap(corr_matrix, annot = True, square = True, annot_kws = {'size': 14})

In [None]:
# Absolute correlation of every column with the target, strongest first
train_df.corr()['Overall_Experience'].abs().sort_values(ascending = False)

In [None]:
# taking same preprocessing steps as training data

# NOTE(review): the medians / modes used here are computed on the test frame
# itself, not carried over from the training frame.
# Columns are assigned back instead of using fillna(inplace=True) on
# test_df[col], which is chained assignment and does not update the frame
# under pandas Copy-on-Write.
for col in num_test_df.columns:
    test_df[col] = test_df[col].fillna(test_df[col].median())

for col in cat_test_df.columns:
    # mode() returns the modes in sorted order; [0] picks the first one.
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

test_df.isna().sum().sum()

In [None]:
# Merge the rare lowest levels into the next level up, using the same eight
# column rules as the training frame. Only three of them were applied to the
# test frame before, which could leave test rows carrying levels (and hence
# encoded values) that the models never saw during training. A rule is a
# no-op for a column in which the rare level does not occur.
rare_level_merges = {
    'Platform_location': ('very inconvinient', 'Inconvinient'),
    'Onboardwifi_service': ('extremely poor', 'poor'),
    'Online_support': ('extremely poor', 'poor'),
    'Onlinebooking_Ease': ('extremely poor', 'poor'),
    'Onboard_service': ('extremely poor', 'poor'),
    'Checkin_service': ('extremely poor', 'poor'),
    'Cleanliness': ('extremely poor', 'poor'),
    'Online_boarding': ('extremely poor', 'poor'),
}

# Assign back rather than replace(inplace=True) on test_df[col], which is
# chained assignment and unreliable under pandas Copy-on-Write.
for col, (rare_level, merged_level) in rare_level_merges.items():
    test_df[col] = test_df[col].replace(rare_level, merged_level)

In [None]:
# manual categorical data encoding for the test frame (same codes as training)

# Ordinal scale shared by the twelve six-level survey rating columns.
rating_scale = {'extremely poor': 0, 'poor': 1, 'need improvement': 2,
                'acceptable': 3, 'good': 4, 'excellent': 5}
rating_columns = ['Checkin_service', 'Leg_room', 'Onboard_service', 'Onlinebooking_Ease',
                  'Online_support', 'Onboardwifi_service', 'Cleanliness', 'Catering',
                  'Arrival_time_convenient', 'Seat_comfort', 'Onboard_entertainment',
                  'Online_boarding']

# column -> {label: code}
encodings = {col: rating_scale for col in rating_columns}
encodings.update({
    'Platform_location': {'very inconvinient': 0, 'Inconvinient': 1, 'need improvement': 2,
                          'manageable': 3, 'Convinient': 4, 'very convinient': 5},
    # Baggage_handling is coded on a five-level scale starting at 'poor' = 0,
    # unlike the six-level columns above where 'poor' = 1.
    'Baggage_handling': {'poor': 0, 'need improvement': 1, 'acceptable': 2,
                         'good': 3, 'excellent': 4},
    'Seat_Class': {'Green Car': 0, 'Ordinary': 1},
    'Travel_Class': {'Eco': 0, 'Business': 1},
    'TypeTravel': {'Business travel': 0, 'Personal Travel': 1},
    'CustomerType': {'Loyal Customer': 0, 'disloyal Customer': 1},
    'Gender': {'Female': 1, 'Male': 0},
})

# replace() (not map()) is used so that any label missing from a mapping is
# left untouched instead of becoming NaN. The result is assigned back because
# replace(inplace=True) on test_df[col] is chained assignment, which does not
# update the frame under pandas Copy-on-Write. infer_objects() turns the
# resulting object column into a numeric dtype explicitly, as newer pandas
# versions no longer downcast automatically after replace().
for col, mapping in encodings.items():
    test_df[col] = test_df[col].replace(mapping).infer_objects()

In [None]:
# test_df['DepartureDelay_in_Mins'] = np.where(test_df['DepartureDelay_in_Mins'] > 1000, test_df['DepartureDelay_in_Mins'].median(), test_df['DepartureDelay_in_Mins'])
# test_df['ArrivalDelay_in_Mins'] = np.where(test_df['ArrivalDelay_in_Mins'] > 1000, test_df['DepartureDelay_in_Mins'].median(), test_df['ArrivalDelay_in_Mins'])

In [None]:
# test_df['Online_Score'] = test_df['Online_support'] + test_df['Onlinebooking_Ease'] + test_df['Online_boarding']
# test_df['Prior_boarding'] = test_df['Baggage_handling'] + test_df['Catering'] + test_df['Checkin_service']+ test_df['Platform_location']
# test_df['Onboard_Score'] = test_df['Onboard_entertainment'] + test_df['Onboard_service'] + test_df['Onboardwifi_service']
# test_df['Comfort_Score'] = test_df['Cleanliness'] + test_df['Leg_room'] + test_df['Seat_comfort']
# test_df['Covered'] = test_df['DepartureDelay_in_Mins'] - test_df['ArrivalDelay_in_Mins']

# Add both delay columns expressed in hours (minutes / 60) as extra features.
test_df['Dep_Delay_HRS'] = test_df['DepartureDelay_in_Mins'].div(60)
test_df['Arr_Delay_HRS'] = test_df['ArrivalDelay_in_Mins'].div(60)

In [None]:
# Remove the identifier and 'Seat_Class', which is also left out of the
# training features.
test_df = test_df.drop(columns = ['ID', 'Seat_Class'])

### Modeling

In [None]:
# Target: the overall-experience label.
y = train_df['Overall_Experience']

# Features: everything except the target and 'Seat_Class'.
X = train_df.drop(columns = ['Overall_Experience', 'Seat_Class'])

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 1)

# X_train.shape, y_train.shape, X_val.shape, y_val.shape

### MAX VOTING ENSEMBLE - [TEST ACC: 0.9580922]

In [None]:
# NOTE: some of the hyperparameters below may have been altered since the best
# run, so they are not necessarily the set that gave the best test score.
# A train accuracy score of 0.9886376934943348 gave the best test score of .9580922


def _majority_vote(*member_predictions):
    """Return the most common label per row across the ensemble members.

    Each argument is one model's prediction array; all are expected to have
    the same length. The result is a float array with one entry per row.
    """
    # Build the list once and convert it in a single step; growing the array
    # with np.append inside a loop copies it on every iteration.
    return np.array([statistics.mode(votes) for votes in zip(*member_predictions)],
                    dtype = float)


model1 = CatBoostClassifier(random_state = 1, verbose = False,
                              iterations = 1000,
                              depth = 6,
                              random_strength = 1e-5)

model2 = lgb.LGBMClassifier(random_state = 1,
                              boosting_type = 'gbdt',
                              num_leaves = 31,
                              max_depth = 12,
                              n_estimators = 950,
                              learning_rate = 0.1,
                              colsample_bytree = 0.6,
                              n_jobs = 4)

model3 = XGBClassifier(random_state = 1,
                         n_jobs = 4,
                         n_estimators = 800,
                         max_depth = 15,
                         colsample_bylevel = 0.7,
                         learning_rate = 0.0891)


# All three members are fitted on the full training set (no hold-out split).
model1.fit(X, y)
model2.fit(X, y)
model3.fit(X, y)


# The train and test frames are prepared by separate code paths, so align the
# test columns to the training feature order before predicting.
test_features = test_df[X.columns]

pred1 = model1.predict(test_features)
pred2 = model2.predict(test_features)
pred3 = model3.predict(test_features)


# final test predictions

final_pred = _majority_vote(pred1, pred2, pred3)


# scoring on train data
preda = model1.predict(X)
predb = model2.predict(X)
predc = model3.predict(X)

check_pred = _majority_vote(preda, predb, predc)


# Accuracy on the data the models were trained on, so it is an optimistic estimate.
accuracy_score(y, check_pred)

In [None]:
#feature importance
# Importances reported by model3 (the XGBoost member) only, sorted ascending.

pd.DataFrame(model3.feature_importances_, index = X.columns, columns = ['Feature Imp']).sort_values('Feature Imp')

In [None]:
# Confusion matrix of the ensemble vote on the training data itself (not a held-out set)
print(confusion_matrix(y, check_pred))

In [None]:
# Pair the saved test IDs with the ensemble's predictions and write them out.
Overall_Experience = pd.DataFrame({'Overall_Experience': final_pred})
submission117 = pd.concat([ID, Overall_Experience], axis = 1)

submission_path = 'submission00117.csv'
submission117.to_csv(submission_path, index = False)

# Read the file back and show its last rows as a sanity check.
sub117 = pd.read_csv(submission_path)
sub117.tail()