# Airline Passenger Satisfaction

## Imporing required libs

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.impute import KNNImputer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from statistics import mode

## Getting dataset

In [3]:
df = pd.read_csv("data/proj72/train.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,2,2,2,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,4,5,5,3,3,4,4,3,3,3,0,0.0,satisfied


## Cleaning dataset

In [4]:
df.isna().sum()

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction    

In [7]:
def dataCleaner(df, method):
    if method == "mean":
        for column in df.keys():
            if pd.api.types.is_object_dtype(df[column]) == False:
                df[column].fillna(np.mean(df[column]), inplace=True)
    
    if method == "median":
        for column in df.keys():
            if pd.api.types.is_object_dtype(df[column]) == False:
                df[column].fillna(np.median(df[column]), inplace=True)
    
    if method == "mode":
        for column in df.keys():
            if pd.api.types.is_object_dtype(df[column]) == False:
                df[column].fillna(mode(df[column]), inplace=True)


In [9]:
dataCleaner(df, method="mean")

## Pre-Processing data

In [11]:
def preprocessData(df):
    classes = {}
    for column in df.keys():
        if pd.api.types.is_object_dtype(df[column]):
            le = LabelEncoder()
            df[column] = le.fit_transform(df[column])
            classes[column] = le.classes_
    
    return classes

In [12]:
labels = preprocessData(df)
labels

{'Gender': array(['Female', 'Male'], dtype=object),
 'Customer Type': array(['Loyal Customer', 'disloyal Customer'], dtype=object),
 'Type of Travel': array(['Business travel', 'Personal Travel'], dtype=object),
 'Class': array(['Business', 'Eco', 'Eco Plus'], dtype=object),
 'satisfaction': array(['neutral or dissatisfied', 'satisfied'], dtype=object)}

In [15]:
X = df.drop("satisfaction", axis=1)
y = df["satisfaction"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Training data

In [16]:
rclf = RandomForestClassifier()
rclf.fit(X_train, y_train)

calibrated_clf = CalibratedClassifierCV(rclf, cv=5, method='sigmoid')
calibrated_clf.fit(X_train, y_train)
predictions = calibrated_clf.predict_proba(X_test)

In [19]:
calibrated_clf.score(X_test, y_test)

0.9638612193830903

In [20]:
xclf = XGBClassifier()
xclf.fit(X_train.values, y_train.values)
xclf.score(X_test.values, y_test.values)

0.9657379336894278