# Data Source: https://www.kaggle.com/sjleshrac/airlines-customer-satisfaction

***

# Initial Data Setup and Filtering

In [1]:
# imports
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import *
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.feature_selection import RFE, RFECV
from sklearn.decomposition import PCA
from sklearn.utils import resample
from xgboost import XGBClassifier
import eli5
from eli5.sklearn import explain_weights, permutation_importance
import datetime as dt
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from imblearn.datasets import make_imbalance
# Record a reference timestamp (presumably reused by later cells to report
# run time -- confirm) and print the time elapsed since it; printed right
# after the timestamp is taken, this first reading is effectively zero.
start = dt.datetime.now()
print('Elapsed time: ', dt.datetime.now() - start)

Elapsed time:  0:00:00.000059


In [2]:
# load data
# The path is relative, so the CSV must sit in the notebook's working directory.
csv_path = 'Invistico_Airline.csv'
df = pd.read_csv(csv_path)

In [3]:
# preview data: (rows, columns) first, then the leading five rows
print(df.shape)
df.head(n=5)

(129880, 23)


Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0


In [4]:
# Inspect each column's dtype and non-null count; a non-null count below the
# total number of entries identifies a column that contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 23 columns):
satisfaction                         129880 non-null object
Gender                               129880 non-null object
Customer Type                        129880 non-null object
Age                                  129880 non-null int64
Type of Travel                       129880 non-null object
Class                                129880 non-null object
Flight Distance                      129880 non-null int64
Seat comfort                         129880 non-null int64
Departure/Arrival time convenient    129880 non-null int64
Food and drink                       129880 non-null int64
Gate location                        129880 non-null int64
Inflight wifi service                129880 non-null int64
Inflight entertainment               129880 non-null int64
Online support                       129880 non-null int64
Ease of Online booking               129880 non-null int64

In [36]:
# rename columns for easier coding
# Replace every separator (' ', '-', '/') with '_' in a single pass.
# Issuing one rename per separator, each keyed on the ORIGINAL label, does not
# work: once the first rename has changed a label, the following renames look
# up a key that no longer exists and are silently ignored, so labels such as
# 'On-board service' kept their '-' unless the cell was executed repeatedly.
# Chaining the replacements makes the cell correct on the first run and a
# no-op on any re-run.
df.rename(
    columns=lambda label: label.replace(' ', '_').replace('-', '_').replace('/', '_'),
    inplace=True,
)

In [37]:
# Confirm the new column names (along with dtypes and non-null counts).
# NOTE(review): the recorded output lists `satisfaction` as int64, which
# suggests the label-to-integer mapping cell (In [5]) was executed before this
# one even though it appears further down -- confirm the intended cell order.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129880 entries, 0 to 129879
Data columns (total 23 columns):
satisfaction                         129880 non-null int64
Gender                               129880 non-null object
Customer_Type                        129880 non-null object
Age                                  129880 non-null int64
Type_of_Travel                       129880 non-null object
Class                                129880 non-null object
Flight_Distance                      129880 non-null int64
Seat_comfort                         129880 non-null int64
Departure_Arrival_time_convenient    129880 non-null int64
Food_and_drink                       129880 non-null int64
Gate_location                        129880 non-null int64
Inflight_wifi_service                129880 non-null int64
Inflight_entertainment               129880 non-null int64
Online_support                       129880 non-null int64
Ease_of_Online_booking               129880 non-null int64


In [5]:
# Encode satisfaction as a binary integer: 'satisfied' -> 1, 'dissatisfied' -> 0.
# Any other value raises KeyError, which includes re-running this cell once the
# column already holds the integer codes.
satisfaction_codes = {'satisfied': 1, 'dissatisfied': 0}
df['satisfaction'] = df['satisfaction'].map(lambda label: satisfaction_codes[label])

In [21]:
# What share of rows would be lost if every row containing at least one null
# value were dropped?
rows_total = len(df)
rows_with_nulls = rows_total - len(df.dropna())
f'data lost: {rows_with_nulls / rows_total:.2%} of total data'

'data lost: 0.30% of total data'

In [38]:
# Keep the rows whose Arrival_Delay_in_Minutes is missing in their own data
# frame so they can be investigated further after being removed from df.
arrival_delay_missing = df['Arrival_Delay_in_Minutes'].isna()
unknown_arrival_delay = df.loc[arrival_delay_missing]

In [40]:
# Remove, in place, every row that has a null value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [41]:
# reset index to account for dropped values
# drop=True discards the old index outright instead of first inserting it as an
# 'index' column and deleting that column afterwards. Besides being a single
# step, this cannot delete a genuine column that happens to be named 'index'
# (in that situation reset_index would insert the old index as 'level_0' and
# the follow-up drop would remove the real data column instead).
df.reset_index(drop=True, inplace=True)

In [43]:
# Preview the first rows again after the renaming, null removal and index reset.
df.head()

Unnamed: 0,satisfaction,Gender,Customer_Type,Age,Type_of_Travel,Class,Flight_Distance,Seat_comfort,Departure_Arrival_time_convenient,Food_and_drink,...,Online_support,Ease_of_Online_booking,On_board_service,Leg_room_service,Baggage_handling,Checkin_service,Cleanliness,Online_boarding,Departure_Delay_in_Minutes,Arrival_Delay_in_Minutes
0,1,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,1,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,1,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,1,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,1,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0
