In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [2]:
train_data = pd.read_csv('./airline_passenger_satisfaction_TRAIN.csv')
test_data = pd.read_csv('./airline_passenger_satisfaction_TEST.csv')
train_data = train_data.drop('Unnamed: 0', axis=1) #reading in this csv includes the row-number as a column so here we are dropping it
test_data = test_data.drop('Unnamed: 0', axis=1)

x_train = train_data.iloc[:,:-1]
y_train = train_data.iloc[:,-1]
x_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:,-1]

# Data preprocessing
The first thing we see about the data is that some values, such as 'Customer Type', 'Type of Travel', and 'Class' are categorical rather than continuous

In [3]:
x_train.head(5)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,5,4,3,4,4,5,5,25,18.0
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,1,5,3,1,4,1,1,6.0
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,5,4,3,4,4,4,5,0,0.0
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,2,5,3,1,4,2,11,9.0
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,5,3,3,4,4,3,3,3,0,0.0


We can change the categorical data to be numerical using Scikit-learns Ordinal Encoder, and LabelEncoder to binarize the two classes

In [4]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

encoder = OrdinalEncoder()
labelEnc = LabelEncoder()

#encoding categorical features
categorical_features = x_train.select_dtypes(include='object').columns
x_train[categorical_features] = encoder.fit_transform(x_train[categorical_features])
categorical_features = x_test.select_dtypes(include='object').columns
x_test[categorical_features] = encoder.fit_transform(x_test[categorical_features])

#Binarizing the target labels
y_train = labelEnc.fit_transform(y_train)
y_test = labelEnc.fit_transform(y_test)

Now we can see that the data has been ordinalized

In [5]:
x_train.head(5)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,70172,1.0,0.0,13,1.0,2.0,460,3,4,3,...,5,5,4,3,4,4,5,5,25,18.0
1,5047,1.0,1.0,25,0.0,0.0,235,3,2,3,...,1,1,1,5,3,1,4,1,1,6.0
2,110028,0.0,0.0,26,0.0,0.0,1142,2,2,2,...,5,5,4,3,4,4,4,5,0,0.0
3,24026,0.0,0.0,25,0.0,0.0,562,2,5,5,...,2,2,2,5,3,1,4,2,11,9.0
4,119299,1.0,0.0,61,0.0,0.0,214,3,3,3,...,5,3,3,4,4,3,3,3,0,0.0


## Imputing Missing Values
Now we make sure there are no  missing values

In [6]:
data_is_none = x_train.isnull().sum()
missing_train_data = pd.DataFrame({"Number of missing values": data_is_none}).sort_values(by=['Number of missing values'], ascending=False)[:]

missing_train_data.head(3)

Unnamed: 0,Number of missing values
Arrival Delay in Minutes,310
Online boarding,0
Departure Delay in Minutes,0


We can see that the only missing values are in the flight delays column. We were able to verify that the same holds for the test data.
Since flight delays are clearly independent from the other attributes, we can just use the average flight delay time

In [7]:
x_train['Arrival Delay in Minutes'] = x_train['Arrival Delay in Minutes'].fillna(x_train['Arrival Delay in Minutes'].mean())
x_test['Arrival Delay in Minutes'] = x_test['Arrival Delay in Minutes'].fillna(x_test['Arrival Delay in Minutes'].mean())

## Normalizing the data

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

y_train = y_train.ravel()

# Training the model
We first try predicting using logistic regression

In [9]:
lr = LogisticRegression()

In [10]:
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

0.8723821989528796

Now we try support vector machines (SVM's)

In [11]:
from sklearn.svm import LinearSVC

In [12]:
y_train = y_train * 2 - 1 #Since support vector machines use -1 and 1's as the data labels, we are converting our data labels from 0 and 1 to -1 and 1
y_test = y_test * 2 -1

svc = LinearSVC(dual='auto')


svc.fit(x_train, y_train)
print(svc.score(x_test, y_test))

0.8720742223591007
