The goal of this problem is to predict whether a passenger was delighted with his/her overall experience of travelling on the Shinkansen (Bullet Train).

### Problem Description
This problem concerns the Shinkansen (Bullet Trains) of Japan. The aim is to determine the relative importance of each parameter with regard to its contribution to the passenger travel experience. Provided is a random sample of individuals who travelled on these trains. The on-time performance of the trains, along with the passengers’ information, is published in the CSV file named ‘Traveldata_train’. These passengers were later asked to provide feedback on various parameters related to the travel, along with their overall experience. The collected details are made available in the survey report CSV labelled ‘Surveydata_train’.

In the survey, a passenger was explicitly asked whether they were delighted with their overall travel experience and that is captured in the data of the survey report under the variable labelled ‘Overall_Experience’.

The objective of this exercise is to understand which parameters play an important role in swaying passenger feedback towards a positive scale. You are provided test data containing Travel data and Survey data of passengers. Both the test data and the train data were collected at the same time and belong to the same company.

In [1]:
#Import required libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from warnings import filterwarnings 
filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV

# Machine learning libraries

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix,roc_curve, roc_auc_score,accuracy_score

In [None]:
# Load the two training CSVs: survey responses into df1, travel records into df2.
survey_csv, travel_csv = 'Surveydata_train.csv', 'Traveldata_train.csv'
df1 = pd.read_csv(survey_csv)
df2 = pd.read_csv(travel_csv)

In [None]:
# Preview the first rows of the survey data (df1)
df1.head()

In [None]:
# Preview the last rows of the survey data (df1)
df1.tail()

In [None]:
# Preview the first rows of the travel data (df2)
df2.head()

In [None]:
# Preview the last rows of the travel data (df2)
df2.tail()

In [None]:
# (rows, columns) of the survey data
df1.shape

In [None]:
# (rows, columns) of the travel data
df2.shape

In [None]:
# Join survey and travel records on the shared 'ID' column.
# The outer join keeps IDs that appear in either table.
dfm = df1.merge(df2, how='outer', on='ID')

In [None]:
# Preview the first rows of the merged data
dfm.head()

In [None]:
# Preview the last rows of the merged data
dfm.tail()

In [None]:
# (rows, columns) of the merged dataset
dfm.shape

In [None]:
# Descriptive statistics of the merged data, transposed so each feature is a row
dfm.describe().T

In [None]:
# Column dtypes and non-null counts of the merged data
dfm.info()

In [None]:
# Descriptive statistics of the object-dtype (categorical) features of the
# merged data, transposed so each feature is a row
dfm.describe(include = ['object']).T

### Checking for duplicate records

In [None]:
# Flag rows that duplicate another row, then report how many were flagged.
dups = dfm.duplicated()
dup_total = dups.sum()
print('Number of duplicate row = %d' % dup_total)

In [None]:
# Split the merged frame by dtype: float64/int64 columns vs. object columns.
numeric_dtypes = ['float64', 'int64']
dfm_num = dfm.select_dtypes(include=numeric_dtypes)
dfm_cat = dfm.select_dtypes(include=['object'])

### Checking for null values

In [None]:
# Checking null values for numeric features
# Number of missing values in each numeric column
dfm_num.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill missing numeric values with each column's median.
# NOTE(review): dfm_num still contains 'Overall_Experience' (it is read back
# from dfm_num when the frames are recombined), so any missing target values
# are imputed as well - confirm that this is intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imr = imputer.fit(dfm_num)
# transform() returns a bare ndarray; restore both the column labels and the
# original row index so the later index-aligned concat with dfm_cat cannot
# silently misalign or drop rows.
dfm_num = pd.DataFrame(
    imr.transform(dfm_num),
    columns=dfm_num.columns,
    index=dfm_num.index,
)

In [None]:
# Re-count missing values per numeric column after the median imputation
dfm_num.isnull().sum()

In [None]:
# Checking null values for categorical features
# Number of missing values in each categorical (object) column
dfm_cat.isnull().sum()

In [None]:
def _fill_with_most_frequent(column):
    """Return `column` with missing entries replaced by its most frequent value.

    A column with no non-null values has no most frequent value and is
    returned unchanged instead of raising IndexError.
    """
    counts = column.value_counts()  # sorted by frequency, NaN excluded
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


# Impute every categorical column with its own most frequent value.
dfm_cat = dfm_cat.apply(_fill_with_most_frequent)

In [None]:
# Re-count missing values per categorical column after the most-frequent fill
dfm_cat.isnull().sum()

In [None]:
# Preview the first rows of the categorical features
dfm_cat.head()

In [None]:
# Preview the first rows of the numeric features
dfm_num.head()

In [None]:
from scipy.stats import zscore

# Standardise the four continuous features, applying zscore to each column.
continuous_cols = [
    'Age',
    'Travel_Distance',
    'DepartureDelay_in_Mins',
    'ArrivalDelay_in_Mins',
]
dfm_num_scaled = dfm_num[continuous_cols].apply(zscore)

In [None]:
# Preview the first rows of the z-scored numeric features
dfm_num_scaled.head()

In [None]:
# Rebuild the modelling frame column-wise: target first, then the scaled
# numeric features, then the categorical features. join='inner' keeps only
# the row labels present in all three pieces.
pieces = [dfm_num['Overall_Experience'], dfm_num_scaled, dfm_cat]
dfm = pd.concat(pieces, axis=1, join='inner')

In [None]:
# Preview the first rows of the recombined frame
dfm.head()

In [None]:
# (rows, columns) of the recombined frame
dfm.shape

In [None]:
# Store the target as int64 again.
dfm = dfm.astype({'Overall_Experience': 'int64'})

In [None]:
# Column dtypes and non-null counts of the recombined frame
dfm.info()

In [None]:
# Print the frequency of every level of each object-dtype column in dfm.
# Iterates dfm.columns: the original looped over `df_train`, which is not
# defined in this notebook and raised NameError.
for feature in dfm.columns:
    if dfm[feature].dtype == 'object':
        print(feature)
        print(dfm[feature].value_counts())
        print('\n')

In [None]:
# Column labels of the recombined frame
dfm.columns

In [None]:
# Replace each Seat_comfort rating label with its position in this list,
# stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Seat_comfort'] = np.where(dfm['Seat_comfort'] == level, str(rank), dfm['Seat_comfort'])

In [None]:
# Binary-encode Seat_Class as strings: 'Ordinary' -> '0', 'Green Car' -> '1'.
for label, code in {'Ordinary': '0', 'Green Car': '1'}.items():
    dfm['Seat_Class'] = np.where(dfm['Seat_Class'] == label, code, dfm['Seat_Class'])

In [None]:
# Replace each Arrival_time_convenient rating label with its position in this
# list, stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Arrival_time_convenient'] = np.where(
        dfm['Arrival_time_convenient'] == level, str(rank), dfm['Arrival_time_convenient']
    )

In [None]:
# Replace each Catering rating label with its position in this list, stored
# as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Catering'] = np.where(dfm['Catering'] == level, str(rank), dfm['Catering'])

In [None]:
# Replace each Platform_location label with its position in this list, stored
# as a string ('0' for 'very inconvinient' ... '5' for 'very convinient').
# The labels are matched exactly as written here, misspellings and
# capitalisation included. Values that match no label are left unchanged.
platform_levels = [
    'very inconvinient',
    'Inconvinient',
    'need improvement',
    'manageable',
    'Convinient',
    'very convinient',
]
for rank, level in enumerate(platform_levels):
    dfm['Platform_location'] = np.where(
        dfm['Platform_location'] == level, str(rank), dfm['Platform_location']
    )

In [None]:
# Replace each Onboardwifi_service rating label with its position in this
# list, stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Onboardwifi_service'] = np.where(
        dfm['Onboardwifi_service'] == level, str(rank), dfm['Onboardwifi_service']
    )

In [None]:
# Replace each Onboard_entertainment rating label with its position in this
# list, stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Onboard_entertainment'] = np.where(
        dfm['Onboard_entertainment'] == level, str(rank), dfm['Onboard_entertainment']
    )

In [None]:
# Replace each Online_support rating label with its position in this list,
# stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Online_support'] = np.where(dfm['Online_support'] == level, str(rank), dfm['Online_support'])

In [None]:
# Replace each Onlinebooking_Ease rating label with its position in this
# list, stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Onlinebooking_Ease'] = np.where(
        dfm['Onlinebooking_Ease'] == level, str(rank), dfm['Onlinebooking_Ease']
    )

In [None]:
# Replace each Onboard_service rating label with its position in this list,
# stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Onboard_service'] = np.where(dfm['Onboard_service'] == level, str(rank), dfm['Onboard_service'])

In [None]:
# Replace each Leg_room rating label with its position in this list, stored
# as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Leg_room'] = np.where(dfm['Leg_room'] == level, str(rank), dfm['Leg_room'])

In [None]:
# Replace each Baggage_handling rating label with its position in this list,
# stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
# Every comparison reads from dfm: the original 'poor' step compared against
# the undefined name `dfmn` and raised NameError.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Baggage_handling'] = np.where(
        dfm['Baggage_handling'] == level, str(rank), dfm['Baggage_handling']
    )

In [None]:
# Replace each Checkin_service rating label with its position in this list,
# stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Checkin_service'] = np.where(dfm['Checkin_service'] == level, str(rank), dfm['Checkin_service'])

In [None]:
# Replace each Cleanliness rating label with its position in this list,
# stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Cleanliness'] = np.where(dfm['Cleanliness'] == level, str(rank), dfm['Cleanliness'])

In [None]:
# Replace each Online_boarding rating label with its position in this list,
# stored as a string ('0' for 'extremely poor' ... '5' for 'excellent').
# Values that match no label are left unchanged.
rating_levels = ['extremely poor', 'poor', 'need improvement', 'acceptable', 'good', 'excellent']
for rank, level in enumerate(rating_levels):
    dfm['Online_boarding'] = np.where(dfm['Online_boarding'] == level, str(rank), dfm['Online_boarding'])

In [None]:
# Binary-encode CustomerType as strings:
# 'disloyal Customer' -> '0', 'Loyal Customer' -> '1'.
for label, code in {'disloyal Customer': '0', 'Loyal Customer': '1'}.items():
    dfm['CustomerType'] = np.where(dfm['CustomerType'] == label, code, dfm['CustomerType'])

In [None]:
# Binary-encode Travel_Class as strings: 'Eco' -> '0', 'Business' -> '1'.
for label, code in {'Eco': '0', 'Business': '1'}.items():
    dfm['Travel_Class'] = np.where(dfm['Travel_Class'] == label, code, dfm['Travel_Class'])

In [None]:
# Binary-encode TypeTravel as strings:
# 'Personal Travel' -> '0', 'Business travel' -> '1'.
for label, code in {'Personal Travel': '0', 'Business travel': '1'}.items():
    dfm['TypeTravel'] = np.where(dfm['TypeTravel'] == label, code, dfm['TypeTravel'])

In [None]:
# Binary-encode Gender as strings: 'Female' -> '0', 'Male' -> '1'.
for label, code in {'Female': '0', 'Male': '1'}.items():
    dfm['Gender'] = np.where(dfm['Gender'] == label, code, dfm['Gender'])

In [None]:
# Cast every encoded column to int64, one column at a time in the listed order.
encoded_columns = [
    'Seat_comfort',
    'Seat_Class',
    'Arrival_time_convenient',
    'Catering',
    'Platform_location',
    'Onboardwifi_service',
    'Onboard_entertainment',
    'Online_support',
    'Onlinebooking_Ease',
    'Onboard_service',
    'Leg_room',
    'Baggage_handling',
    'Checkin_service',
    'Cleanliness',
    'Online_boarding',
    'Gender',
    'CustomerType',
    'TypeTravel',
    'Travel_Class',
]
for column in encoded_columns:
    dfm[column] = dfm[column].astype('int64')

In [None]:
# (rows, columns) of the frame after encoding
dfm.shape

In [None]:
# Column dtypes and non-null counts after the int64 casts.
# Uses dfm: the original referenced the undefined name `dm` and raised NameError.
dfm.info()

In [None]:
# Number of missing values in each column of the encoded frame
dfm.isnull().sum()

In [None]:
# Preview the first rows of the encoded frame
dfm.head()

In [None]:
# Preview the last rows of the encoded frame
dfm.tail()

In [None]:
# Descriptive statistics of the encoded frame, transposed so each feature is a row
dfm.describe().T