In [168]:
#Imports

import numpy as np
import pandas as pd

from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor

## Data preparation and data cleaning

In [169]:
#Loading Dataset

df_initial = pd.read_csv('Invistico_Airline.csv')

In [170]:
df_initial.columns

Index(['satisfaction', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Seat comfort',
       'Departure/Arrival time convenient', 'Food and drink', 'Gate location',
       'Inflight wifi service', 'Inflight entertainment', 'Online support',
       'Ease of Online booking', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes'],
      dtype='object')

In [171]:
# Normalise column names: lower-case them, then replace spaces with
# underscores so every column is addressable as a consistent snake-ish name.

lowered_names = df_initial.columns.str.lower()
df_initial.columns = lowered_names.str.replace(' ', '_')

In [172]:
# Unify categorical values
#
# Text columns are the ones whose dtype is 'object'; their values get the
# same treatment as the column names (lower-case, spaces -> underscores).

categorical_columns = [
    name for name, dtype in df_initial.dtypes.items() if dtype == 'object'
]

for column_name in categorical_columns:
    lowered = df_initial[column_name].str.lower()
    df_initial[column_name] = lowered.str.replace(' ', '_')

In [173]:
# Dealing with Nan values

df_initial.isnull().sum()

satisfaction                           0
gender                                 0
customer_type                          0
age                                    0
type_of_travel                         0
class                                  0
flight_distance                        0
seat_comfort                           0
departure/arrival_time_convenient      0
food_and_drink                         0
gate_location                          0
inflight_wifi_service                  0
inflight_entertainment                 0
online_support                         0
ease_of_online_booking                 0
on-board_service                       0
leg_room_service                       0
baggage_handling                       0
checkin_service                        0
cleanliness                            0
online_boarding                        0
departure_delay_in_minutes             0
arrival_delay_in_minutes             393
dtype: int64

There are 393 NaN values in "arrival_delay_in_minutes".

In [174]:
# Fill NaN values with mean
#
# The filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on the selection df_initial[col]: that form is
# chained assignment, which emits a warning in pandas 2.x and leaves
# df_initial unchanged once Copy-on-Write is enabled.

arrival_delay_mean = df_initial['arrival_delay_in_minutes'].mean()
df_initial['arrival_delay_in_minutes'] = (
    df_initial['arrival_delay_in_minutes'].fillna(arrival_delay_mean)
)

In [175]:
# Lets check types of values

df_initial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   satisfaction                       129880 non-null  object 
 1   gender                             129880 non-null  object 
 2   customer_type                      129880 non-null  object 
 3   age                                129880 non-null  int64  
 4   type_of_travel                     129880 non-null  object 
 5   class                              129880 non-null  object 
 6   flight_distance                    129880 non-null  int64  
 7   seat_comfort                       129880 non-null  int64  
 8   departure/arrival_time_convenient  129880 non-null  int64  
 9   food_and_drink                     129880 non-null  int64  
 10  gate_location                      129880 non-null  int64  
 11  inflight_wifi_service              1298

In [176]:
# Convert "departure_delay_in_minutes" to float
# NOTE(review): presumably done so both delay columns share a float dtype
# (the arrival delay column contained NaN values) — confirm.

df_initial['departure_delay_in_minutes']=df_initial['departure_delay_in_minutes'].astype('float')

## Exploratory data analysis

In [177]:
df_initial.shape

(129880, 23)

We have 129880 survey results, 22 features and 1 target variable

In [178]:
df_initial.head().T

Unnamed: 0,0,1,2,3,4
satisfaction,satisfied,satisfied,satisfied,satisfied,satisfied
gender,female,male,female,female,female
customer_type,loyal_customer,loyal_customer,loyal_customer,loyal_customer,loyal_customer
age,65,47,15,60,70
type_of_travel,personal_travel,personal_travel,personal_travel,personal_travel,personal_travel
class,eco,business,eco,eco,eco
flight_distance,265,2464,2138,623,354
seat_comfort,0,0,0,0,0
departure/arrival_time_convenient,0,0,0,0,0
food_and_drink,0,0,0,0,0


In [179]:
# For every column print: its name, up to ten distinct values, the number of
# distinct values, and a blank separator line.
for col in df_initial.columns:
    column_values = df_initial[col]
    print(col, column_values.unique()[:10], column_values.nunique(), '', sep='\n')

satisfaction
['satisfied' 'dissatisfied']
2

gender
['female' 'male']
2

customer_type
['loyal_customer' 'disloyal_customer']
2

age
[65 47 15 60 70 30 66 10 56 22]
75

type_of_travel
['personal_travel' 'business_travel']
2

class
['eco' 'business' 'eco_plus']
3

flight_distance
[ 265 2464 2138  623  354 1894  227 1812   73 1556]
5398

seat_comfort
[0 1 4 5 2 3]
6

departure/arrival_time_convenient
[0 1 2 3 4 5]
6

food_and_drink
[0 1 2 3 4 5]
6

gate_location
[2 3 4 1 5 0]
6

inflight_wifi_service
[2 0 3 4 5 1]
6

inflight_entertainment
[4 2 0 3 5 1]
6

online_support
[2 3 4 5 1 0]
6

ease_of_online_booking
[3 2 1 5 4 0]
6

on-board_service
[3 4 1 2 5 0]
6

leg_room_service
[0 4 3 2 5 1]
6

baggage_handling
[3 4 1 2 5]
5

checkin_service
[5 2 4 3 1 0]
6

cleanliness
[3 4 1 2 5 0]
6

online_boarding
[2 3 5 4 1 0]
6

departure_delay_in_minutes
[  0. 310.  17.  30.  47.  40.   5.   2.  34.   4.]
466

arrival_delay_in_minutes
[  0. 305.  15.  26.  48.  23.  19.   2. 440.   5.]
473



Target variable has only two values: 'satisfied' and 'dissatisfied'.  
"baggage_handling" has no 0 value

In [180]:
# Encode the target variable as an integer flag:
# 'satisfied' -> 1, any other value (here only 'dissatisfied') -> 0

df_initial.satisfaction = (df_initial.satisfaction == 'satisfied').astype(int)

In [181]:
df_initial.satisfaction.tail(10)

129870    1
129871    1
129872    1
129873    1
129874    1
129875    1
129876    0
129877    0
129878    0
129879    0
Name: satisfaction, dtype: int32

In [182]:
# Define numerical and categorical columns
#
# `numerical` holds the continuous features; it is used below for the
# correlation checks and the per-feature ROC AUC.

numerical = ['age', 'flight_distance', 'departure_delay_in_minutes', 'arrival_delay_in_minutes']

In [183]:
# Features treated as categorical. Besides the four text columns this also
# includes the survey rating columns, which take only a handful of integer
# values (0-5 in the value listing above), so they are handled as discrete
# labels for the mutual-information analysis.
categorical = [
    'gender',
    'customer_type',
    'type_of_travel',
    'class',
    'seat_comfort',
    'departure/arrival_time_convenient',
    'food_and_drink',
    'gate_location',
    'inflight_wifi_service',
    'inflight_entertainment',
    'online_support',
    'ease_of_online_booking',
    'on-board_service',
    'leg_room_service',
    'baggage_handling',
    'checkin_service',
    'cleanliness',
    'online_boarding'
]

#### Mutual information

In [184]:
def mutual_info_satisfaction_score(series):
    """Return the mutual information between `series` and the target.

    The target is read from the notebook-level `df_initial.satisfaction`
    column, so `series` is expected to hold one label per row of
    `df_initial`.
    """
    target = df_initial.satisfaction
    return mutual_info_score(series, target)

In [185]:
# Mutual information between each categorical feature and the target,
# ranked from most to least informative.
#
# The scores are computed in one pass over the categorical columns; the
# previous single-iteration loop over `[categorical]` added nothing.
df_mi = (
    df_initial[categorical]
    .apply(mutual_info_satisfaction_score)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)

In [186]:
df_mi

Unnamed: 0,MI
inflight_entertainment,0.231364
seat_comfort,0.138918
ease_of_online_booking,0.107244
online_support,0.096864
on-board_service,0.067389
online_boarding,0.063066
leg_room_service,0.0574
class,0.04974
baggage_handling,0.049322
cleanliness,0.047556


Main categorical features which influence the 'satisfaction' are: inflight_entertainment, seat_comfort, ease_of_online_booking.  
Main categorical features which do not affect 'satisfaction' are: departure/arrival_time_convenient, type_of_travel, gate_location, gender.  
This looks plausible. In-flight entertainment and seat comfort significantly influence satisfaction with the flight, while departure/arrival time convenience and gate location matter little: a passenger planning a flight either adapts to its parameters and arrives at the airport in advance, or plans the departure ahead of time so that it is as convenient as possible.

#### Correlation

In [187]:
df_num = df_initial[numerical]

df_num.corr()

Unnamed: 0,age,flight_distance,departure_delay_in_minutes,arrival_delay_in_minutes
age,1.0,-0.249625,-0.009041,-0.011229
flight_distance,-0.249625,1.0,0.112555,0.109912
departure_delay_in_minutes,-0.009041,0.112555,1.0,0.960329
arrival_delay_in_minutes,-0.011229,0.109912,0.960329,1.0


The correlation between the numerical variables shows a strong dependence between departure delay and arrival delay (correlation ≈ 0.96).

In [188]:
df_initial[numerical].corrwith(df_initial.satisfaction)

age                           0.117971
flight_distance              -0.039224
departure_delay_in_minutes   -0.073909
arrival_delay_in_minutes     -0.080567
dtype: float64

In [189]:
df_initial.departure_delay_in_minutes.max()

1592.0

In [190]:
df_initial[df_initial.departure_delay_in_minutes > 0].satisfaction.mean()

0.5149140188238625

In [191]:
df_initial[df_initial.departure_delay_in_minutes > 300].satisfaction.mean()

0.39403973509933776

In [192]:
df_initial.arrival_delay_in_minutes.max()

1584.0

In [193]:
df_initial[df_initial.arrival_delay_in_minutes > 0].satisfaction.mean()

0.49720797521312166

In [194]:
df_initial[df_initial.arrival_delay_in_minutes > 300].satisfaction.mean()

0.3987138263665595

As you can see there is a relationship between departure / arrival delay and 'satisfaction'. The longer the delay, the worse 'satisfaction' parameter

#### ROC AUC feature importance

In [195]:
# ROC AUC of each numerical feature used on its own as a ranking score.
#
# This section runs before the modelling split, so `df_train` / `y_train` do
# not exist yet on a fresh kernel. Build the training part here, with the
# same parameters as the notebook's train/validation/test split, under
# section-local names so no later cell depends on this one.
fi_full_train, _ = train_test_split(df_initial, test_size=0.2, random_state=42)
fi_train, _ = train_test_split(fi_full_train, test_size=0.25, random_state=42)
fi_target = fi_train.satisfaction.values

for elem in numerical:
    auc = roc_auc_score(fi_target, fi_train[elem])
    if auc < 0.5:
        # The feature ranks the classes in reverse; negate it so the
        # reported AUC measures the strength of the relation.
        auc = roc_auc_score(fi_target, -fi_train[elem])
    print(elem, np.round(auc, 3))

age 0.576
flight_distance 0.528
departure_delay_in_minutes 0.539
arrival_delay_in_minutes 0.555


ROC AUC feature importance analysis confirmed previous observations

### Perform the train/validation/test split with Scikit-Learn

In [196]:
# 60/20/20 split: first hold out 20% of the data as the test set, then take
# 25% of the remaining 80% (i.e. 20% of the total) as the validation set.
# random_state is fixed so the split is reproducible.
df_full_train, df_test = train_test_split(df_initial, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [197]:
len(df_train), len(df_val), len(df_test)

(77928, 25976, 25976)

## Model training

### Logistic Regression

Logistic Regression + Hyperparameter Tuning

In [198]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the given data.

    Parameters
    ----------
    df_train : DataFrame of features (the target must already be removed).
    y_train : target values, one per row of `df_train`.
    C : inverse regularisation strength passed to LogisticRegression.

    Returns
    -------
    (dv, model) : the fitted DictVectorizer and the fitted model.
    """
    records = df_train.to_dict(orient='records')

    # One-hot encode string-valued features, pass numeric ones through.
    dv = DictVectorizer(sparse=False)
    feature_matrix = dv.fit_transform(records)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(feature_matrix, y_train)

    return dv, model

In [199]:
def predict(df_val, dv, model):
    """Return the positive-class probability for every row of `df_val`.

    Parameters
    ----------
    df_val : DataFrame of features to score.
    dv : fitted vectorizer exposing `transform(list_of_dicts)`.
    model : fitted classifier exposing `predict_proba(X)`.

    Returns
    -------
    1-D array holding column 1 of `model.predict_proba`.
    """
    records = df_val.to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [200]:
# 5-fold cross-validation over the regularisation strength C.
#
# Each fold works on fold-local frames, so the notebook-level
# df_train / df_val / y_train / y_val from the hold-out split are not
# overwritten with the last CV fold.
C = [0.01, 0.1, 0.5, 1, 2, 5, 10]

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for elem in C:
    scores = []

    for train_idx, val_idx in kfold.split(df_full_train):
        # reset_index returns new frames, so popping the target below does
        # not touch df_full_train.
        fold_train = df_full_train.iloc[train_idx].reset_index(drop=True)
        fold_val = df_full_train.iloc[val_idx].reset_index(drop=True)

        fold_y_train = fold_train.pop('satisfaction').values
        fold_y_val = fold_val.pop('satisfaction').values

        dv, model = train(fold_train, fold_y_train, C=elem)
        y_pred = predict(fold_val, dv, model)

        scores.append(roc_auc_score(fold_y_val, y_pred))

    # Mean and spread of the validation AUC across the five folds.
    print('C=%4s, %.3f +- %.3f' % (elem, np.mean(scores), np.std(scores)))

C=0.01, 0.909 +- 0.001
C= 0.1, 0.909 +- 0.001
C= 0.5, 0.909 +- 0.001
C=   1, 0.909 +- 0.001
C=   2, 0.909 +- 0.001
C=   5, 0.909 +- 0.001
C=  10, 0.909 +- 0.001


The choice of C makes no difference to the cross-validated AUC, so the default C = 1.0 is used for the final model.

In [203]:
# Build the final logistic-regression matrices: fit the vectorizer on the
# full training part (train + validation) and apply it to the test part.
# The `ll_` frames are fresh copies, so df_full_train / df_test keep their
# 'satisfaction' column.
ll_full_train = df_full_train.reset_index(drop=True)
ll_y_full_train = ll_full_train.pop('satisfaction').values

ll_test = df_test.reset_index(drop=True)
ll_y_test = ll_test.pop('satisfaction').values

dv = DictVectorizer(sparse=False)

train_dict = ll_full_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

test_dict = ll_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

In [204]:
# Final logistic regression, fitted on the full training part.
# C=1.0 because the cross-validation above showed the same AUC for every
# C value that was tried.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, ll_y_full_train)

In [205]:
# Column 1 of predict_proba: the predicted probability of class 1 ('satisfied').
ll_y_pred = model.predict_proba(X_test)[:, 1]

In [207]:
# Test-set ROC AUC of the logistic regression, rounded to 3 decimals.
ll_auc = np.round(roc_auc_score(ll_y_test, ll_y_pred),3)
ll_auc

0.909

### Random Forest

In [210]:
# Re-create the 60/20/20 split (same parameters as before) and separate the
# target from the features of each part. df_full_train keeps its
# 'satisfaction' column for the final refit.
df_full_train, df_test = train_test_split(df_initial, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
y_train = df_train.pop('satisfaction').values

df_val = df_val.reset_index(drop=True)
y_val = df_val.pop('satisfaction').values

df_test = df_test.reset_index(drop=True)
y_test = df_test.pop('satisfaction').values

In [211]:
# One dict per row ({column: value}) — the input format DictVectorizer expects.
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

In [212]:
# Fit the vectorizer on the training part only, then reuse the learned
# feature mapping for the validation part.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

Random forest + Hyperparameter Tuning

In [220]:
# Grid search over tree depth and forest size; every combination is scored
# by validation ROC AUC (rounded to 3 decimals).
# NOTE(review): a regressor is fitted on the 0/1 target and its continuous
# predictions are used as ranking scores for the AUC — consider whether a
# classifier with predict_proba was intended.
depth_grid = [10, 15, 20, 25]
tree_count_grid = range(10, 201, 10)

scores = []

for depth in depth_grid:
    for n_trees in tree_count_grid:
        rf = RandomForestRegressor(
            n_estimators=n_trees,
            max_depth=depth,
            random_state=42,
            n_jobs=-1,
        )
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        score = np.round(roc_auc_score(y_val, y_pred), 3)

        scores.append([depth, n_trees, score])

df_scores = pd.DataFrame(scores, columns=['max_depth', 'n_estimators', 'auc'])

In [223]:
df_scores.sort_values('auc', ascending=False).head(20)

Unnamed: 0,max_depth,n_estimators,auc
79,25,200,0.993
59,20,200,0.993
55,20,160,0.993
54,20,150,0.993
53,20,140,0.993
52,20,130,0.993
51,20,120,0.993
58,20,190,0.993
57,20,180,0.993
56,20,170,0.993


Best auc is 0.993  
Based on the hyperparameter tuning, we will use max_depth = 20 and n_estimators = 120.

### Random Forest model gives better results with auc = 0.993

In [224]:
# Prepare the full training part (train + validation) for the final refit:
# renumber the rows and move the target out of the feature frame.
df_full_train = df_full_train.reset_index(drop=True)
y_full_train = df_full_train.pop('satisfaction').values

In [225]:
# Row-wise dicts for the vectorizer; the target column has already been
# removed from both frames at this point.
full_train_dicts = df_full_train.to_dict(orient='records')
test_dicts = df_test.to_dict(orient='records')

In [226]:
# Refit the vectorizer on the full training part and apply the same
# feature mapping to the test part.
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(full_train_dicts)
X_test = dv.transform(test_dicts)

In [227]:
# Final random forest with the hyperparameters chosen in the tuning step
# (max_depth=20, n_estimators=120), fitted on the full training part.
rf = RandomForestRegressor(n_estimators=120, max_depth=20, random_state=42, n_jobs=-1)
rf.fit(X_full_train, y_full_train)

In [229]:
# Score the held-out test set; the regressor's continuous predictions are
# used as ranking scores for the ROC AUC (rounded to 3 decimals).
y_pred = rf.predict(X_test)
score = np.round(roc_auc_score(y_test, y_pred),3)
score

0.994

The random forest model performs slightly better on the test dataset (AUC = 0.994) than on the validation dataset (AUC = 0.993).