# Imports & Configs

In [1]:
%pip install numpy pandas seaborn matplotlib optuna sklearn xgboost catboost lightgbm > /dev/null 2>&1

In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import warnings
import optuna
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree  import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import BernoulliNB
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from tabulate import tabulate
import os
# Silence Python warnings and keep Optuna's logging at WARNING level.
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Data Preprocessing

In [25]:
# Read the passenger-satisfaction CSV into a DataFrame and display it.
df = pd.read_csv(
    '/kaggle/input/airline-passenger-satisfaction/airline_passenger_satisfaction.csv'
)
df

In [26]:
# Column names, dtypes and non-null counts.
df.info()

In [27]:
# Preview the first rows of the raw data.
df.head()

In [28]:
# Summary statistics for the numeric columns.
df.describe()

In [29]:
# Summary statistics for the object (text) columns.
df.describe(include='object')

## Missing Data

In [30]:
# Report, for every column that has nulls, the null count and its share of all rows.
total = df.shape[0]
null_counts = df.isnull().sum()
missing_columns = [col for col in df.columns if null_counts[col] > 0]
for col in missing_columns:
    null_count = null_counts[col]
    per = (null_count / total) * 100
    print(f"{col}: {null_count} ({round(per, 3)}%)")

Very few values are missing, so we can simply fill them with the column median.

In [31]:
# Fill missing arrival delays with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the
# df['Arrival Delay'] selection: the in-place call acts on an intermediate
# object and is not guaranteed to update df (it does not under copy-on-write).
df['Arrival Delay'] = df['Arrival Delay'].fillna(df['Arrival Delay'].median())

## Duplicates 

In [32]:
# Count rows that are exact duplicates of an earlier row.
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

Great! No duplicates

## Outliers 

In [33]:
# For each continuous column, draw a box plot (top) and a scatter plot against
# Satisfaction (bottom) to eyeball outliers.
continuous_cols = ['Flight Distance', 'Departure Delay', 'Arrival Delay']
for col in [name for name in df if name in continuous_cols]:
    print('-'*30)
    print(f"{col}:")
    fig, ax = plt.subplots(2, 1, figsize=(12, 8))
    top_ax, bottom_ax = ax[0], ax[1]
    sns.boxplot(x=df[col], ax=top_ax)
    sns.scatterplot(data=df, x=df[col], y=df['Satisfaction'], ax=bottom_ax)
    plt.show()

In [34]:
# Exclusive upper bounds; rows at or above a bound are treated as outliers.
valid_data = {'Flight Distance': 4000, 'Departure Delay': 500, 'Arrival Delay': 500}

# Report the outlier counts on the unfiltered frame first.
for k in valid_data:
    print(f'Number of outliers for {k} is {df[df[k] >= valid_data[k]].shape[0]}')

# Keep only rows below every bound.
for k in valid_data:
    df = df[df[k] < valid_data[k]]

# Later cells assign columns on this frame; take an explicit copy so those
# writes target an independent DataFrame rather than a slice of the original.
df = df.copy()

# EDA

In [13]:
# Violin plot of every numeric column among positions 1-23, filling a 9x2 grid row by row.
f, axes = plt.subplots(9, 2, figsize=(15,30), sharex=False)
panel = 0
for i in range(1, 24):
    column = df.iloc[:, i]
    if not is_numeric_dtype(column):
        continue
    sns.violinplot(x=column, color="skyblue", ax=axes.flat[panel])
    panel += 1
plt.show()

In [14]:
# Histogram + KDE of every numeric column among positions 1-23, on a 9x2 grid.
f, axes = plt.subplots(9, 2, figsize=(15,30), sharex=False)
idx, idy = 0, 0
for i in range(1, 24):
    if is_numeric_dtype(df.iloc[:,i]):
        # sns.distplot is deprecated; histplot with kde=True and stat="density"
        # is its documented replacement (density-normalised bars plus a KDE curve).
        plot = sns.histplot(x=df.iloc[:,i], kde=True, stat="density", color="skyblue", ax=axes[idx,idy])
        plot.set(xlabel=df.columns[i])
        # Walk the grid left-to-right, then down one row.
        idx += idy
        idy = int(not idy)
plt.show()

In [15]:
# Bar plot of each numeric column among positions 7-23 against Satisfaction,
# filling a 6x3 grid row by row.
f, axes = plt.subplots(6, 3, figsize=(15,30), sharex=False)

panel = 0
for i in range(7, 24):
    if not is_numeric_dtype(df.iloc[:, i]):
        continue
    sns.barplot(x="Satisfaction", y=df.columns[i], data=df, ax=axes.flat[panel])
    panel += 1

plt.show()

In [16]:
plt.figure(figsize=(40,30))
# The frame still holds text columns at this point; numeric_only=True restricts
# the correlation to numeric columns instead of raising on the strings.
sns.heatmap(df.corr(numeric_only=True), annot=True)

# import plotly.express as px
# fig = px.imshow(df.corr(), text_auto=True, aspect="auto")
# fig.show()

## Observations

- Neutral or dissatisfied responses become more common as Departure Delay increases
- Demand for Business class, and the satisfaction of Business class passengers, both increase with flight distance
- Customer satisfaction increases with the Check-in Service rating
- Neutral or dissatisfied responses become more common as Arrival Delay increases
- Customer satisfaction increases with the Ease of Online Booking rating
- Customer satisfaction increases with the Seat Comfort rating
- Customer satisfaction increases with the Online Boarding rating
- Customer satisfaction increases with the Leg Room Service rating
- Customer satisfaction increases with the Cleanliness rating
- Customer satisfaction increases with the In-flight Wifi Service rating
- Customer satisfaction increases with the On-board Service rating
- Customer satisfaction increases with the In-flight Service rating
- Customer satisfaction increases with the Food and Drink rating
- Customer satisfaction increases with the In-flight Entertainment rating
- Customer satisfaction increases with the Baggage Handling rating

# Label Encoding

In [35]:
# Replace every object-dtype column with integer codes from a fresh LabelEncoder.
object_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in object_columns:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])

In [1]:
# Remove the ID column before modelling.
# errors='ignore' makes the cell safe to re-run: once 'ID' is gone, a second
# run is a no-op instead of a KeyError.
df = df.drop(columns=['ID'], errors='ignore')
df.head()

NameError: name 'df' is not defined

# Split and scale data

In [37]:
# Target is the Satisfaction column; every other column is a feature.
y = df['Satisfaction']
x = df.drop(columns=['Satisfaction'])

In [38]:
# Hold out 25% of the rows as the test set (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(
    x.values,
    y.values,
    test_size=0.25,
    random_state=42,
)

In [39]:
# Standardise features. The scaler learns its mean/variance from the training
# data only; the test data is transformed with those same statistics so that
# no information from the test set leaks into preprocessing.
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

# K Nearest Neighbors (KNN) classification model

In [2]:
def objective(trial):
    """Optuna objective for KNN: validation accuracy for a sampled ``n_neighbors``.

    The score is computed on a validation split carved out of the training
    data, so the held-out test set is never used to choose hyperparameters.

    Args:
        trial: Optuna trial used to sample ``KNN_n_neighbors`` in [2, 16].

    Returns:
        Accuracy of the fitted classifier on the validation split.
    """
    n_neighbors = trial.suggest_int('KNN_n_neighbors', 2, 16, log=False)
    # Fixed seed so every trial is compared on the same split.
    x_fit, x_val, y_fit, y_val = train_test_split(
        x_train, y_train, test_size=0.2, random_state=42
    )
    classifier_obj = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier_obj.fit(x_fit, y_fit)
    accuracy = classifier_obj.score(x_val, y_val)
    return accuracy

In [3]:
# Search n_neighbors with Optuna. The sampler is seeded so the 30-trial search
# gives the same result on every Restart & Run All.
study_KNN = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_KNN.optimize(objective, n_trials=30)
print(study_KNN.best_trial)

NameError: name 'optuna' is not defined

In [40]:
# Refit KNN on the full training set with the best n_neighbors found by the study.
best_n_neighbors = study_KNN.best_trial.params['KNN_n_neighbors']
KNN_model = KNeighborsClassifier(n_neighbors=best_n_neighbors)
KNN_model.fit(x_train, y_train)

# Accuracy on the training and test sets.
KNN_train = KNN_model.score(x_train, y_train)
KNN_test = KNN_model.score(x_test, y_test)

print(f"Train Score: {KNN_train}")
print(f"Test Score: {KNN_test}")

# Logistic Regression Model

In [31]:
# Logistic regression with default hyperparameters and a fixed seed.
lg_model = LogisticRegression(random_state = 42)
lg_model.fit(x_train, y_train)

In [32]:
# Accuracy of the logistic regression on the training and test sets.
lg_train = lg_model.score(x_train, y_train)
lg_test = lg_model.score(x_test, y_test)

print(f"Training Score: {lg_train}")
print(f"Test Score: {lg_test}")

# Decision Tree Classifier

In [35]:
def objective(trial):
    """Optuna objective for the decision tree: validation accuracy for sampled depth/features.

    The score is computed on a validation split carved out of the training
    data, so the held-out test set is never used to choose hyperparameters.
    The tree is seeded so a trial's score is repeatable.

    Args:
        trial: Optuna trial used to sample ``dt_max_depth`` in [2, 32] and
            ``dt_max_features`` in [2, 22].

    Returns:
        Accuracy of the fitted tree on the validation split.
    """
    dt_max_depth = trial.suggest_int('dt_max_depth', 2, 32, log=False)
    dt_max_features = trial.suggest_int('dt_max_features', 2, 22, log=False)
    # Fixed seed so every trial is compared on the same split.
    x_fit, x_val, y_fit, y_val = train_test_split(
        x_train, y_train, test_size=0.2, random_state=42
    )
    classifier_obj = DecisionTreeClassifier(
        max_features=dt_max_features, max_depth=dt_max_depth, random_state=42
    )
    classifier_obj.fit(x_fit, y_fit)
    accuracy = classifier_obj.score(x_val, y_val)
    return accuracy

In [36]:
# Search the tree hyperparameters with Optuna. The sampler is seeded so the
# 30-trial search gives the same result on every Restart & Run All.
study_dt = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_dt.optimize(objective, n_trials=30)
print(study_dt.best_trial)

In [37]:
# Refit the decision tree on the full training set with the best parameters
# from the study. random_state is fixed because max_features < n_features makes
# split selection random; without it the scores change from run to run.
dt_best_params = study_dt.best_trial.params
dt = DecisionTreeClassifier(
    max_features=dt_best_params['dt_max_features'],
    max_depth=dt_best_params['dt_max_depth'],
    random_state=42,
)
dt.fit(x_train, y_train)

dt_train, dt_test = dt.score(x_train, y_train), dt.score(x_test, y_test)

print(f"Train Score: {dt_train}")
print(f"Test Score: {dt_test}")

In [38]:
# Draw the fitted decision tree with class-coloured nodes on a wide canvas.
fig = plt.figure(figsize=(30, 12))
tree.plot_tree(dt, filled=True)
plt.show()

In [39]:
from matplotlib import pyplot as plt

def f_importance(coef, names, top=-1, title='feature importance for dt'):
    """Draw a horizontal bar chart of the largest feature importances.

    Args:
        coef: Importance value for each feature.
        names: Feature names, aligned with ``coef``.
        top: Number of features to show. A negative value (the default ``-1``)
            or a value larger than the number of features shows all of them.
        title: Chart title.
    """
    # Pair each importance with its name and order from largest to smallest.
    ranked = sorted(zip(coef, names), reverse=True)

    # Clamp the request so the bar positions always match the values plotted.
    if top < 0 or top > len(ranked):
        top = len(ranked)
    shown = ranked[:top]

    values = [value for value, _ in shown]
    labels = [label for _, label in shown]
    positions = range(len(shown))

    plt.barh(positions, values, align='center')
    plt.yticks(positions, labels)
    plt.title(title)
    plt.show()

# Feature labels come from the columns of the feature frame.
features_names = x.columns

# Chart the seven largest decision-tree importances.
# abs() is kept from the original call; drop it if signed contributions matter.
dt_importances = abs(dt.feature_importances_)
f_importance(dt_importances, features_names, top=7)

# Random Forest Classifier

In [41]:
def objective(trial):
    """Optuna objective for the random forest: validation accuracy for sampled settings.

    The score is computed on a validation split carved out of the training
    data, so the held-out test set is never used to choose hyperparameters.
    The forest is seeded so a trial's score is repeatable.

    Args:
        trial: Optuna trial used to sample ``rf_max_depth`` in [2, 32],
            ``rf_max_features`` in [2, 22] and ``rf_n_estimators`` in [3, 20].

    Returns:
        Accuracy of the fitted forest on the validation split.
    """
    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32, log=False)
    rf_max_features = trial.suggest_int('rf_max_features', 2, 22, log=False)
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 3, 20, log=False)
    # Fixed seed so every trial is compared on the same split.
    x_fit, x_val, y_fit, y_val = train_test_split(
        x_train, y_train, test_size=0.2, random_state=42
    )
    classifier_obj = RandomForestClassifier(
        max_features=rf_max_features,
        max_depth=rf_max_depth,
        n_estimators=rf_n_estimators,
        random_state=42,
    )
    classifier_obj.fit(x_fit, y_fit)
    accuracy = classifier_obj.score(x_val, y_val)
    return accuracy

In [42]:
# Search the forest hyperparameters with Optuna. The sampler is seeded so the
# 30-trial search gives the same result on every Restart & Run All.
study_rf = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_rf.optimize(objective, n_trials=30)
print(study_rf.best_trial)

In [46]:
# Refit the random forest on the full training set with the best parameters
# from the study. random_state is fixed because bootstrap sampling and feature
# sub-sampling are random; without it the scores change from run to run.
rf_best_params = study_rf.best_trial.params
rf = RandomForestClassifier(
    max_features=rf_best_params['rf_max_features'],
    max_depth=rf_best_params['rf_max_depth'],
    n_estimators=rf_best_params['rf_n_estimators'],
    random_state=42,
)
rf.fit(x_train, y_train)

rf_train, rf_test = rf.score(x_train, y_train), rf.score(x_test, y_test)

print(f"Train Score: {rf_train}")
print(f"Test Score: {rf_test}")

In [47]:
from matplotlib import pyplot as plt

def f_importance(coef, names, top=-1, title='feature importance for dt'):
    """Draw a horizontal bar chart of the largest feature importances.

    Args:
        coef: Importance value for each feature.
        names: Feature names, aligned with ``coef``.
        top: Number of features to show. A negative value (the default ``-1``)
            or a value larger than the number of features shows all of them.
        title: Chart title.
    """
    # Pair each importance with its name and order from largest to smallest.
    ranked = sorted(zip(coef, names), reverse=True)

    # Clamp the request so the bar positions always match the values plotted.
    if top < 0 or top > len(ranked):
        top = len(ranked)
    shown = ranked[:top]

    values = [value for value, _ in shown]
    labels = [label for _, label in shown]
    positions = range(len(shown))

    plt.barh(positions, values, align='center')
    plt.yticks(positions, labels)
    plt.title(title)
    plt.show()

# Feature labels come from the columns of the feature frame.
features_names = x.columns

# Chart the seven largest random-forest importances, with a title that names
# the forest (the chart was previously labelled as the decision tree's).
# abs() is kept from the original call; drop it if signed contributions matter.
f_importance(abs(rf.feature_importances_), features_names, top=7, title='feature importance for rf')

# SKLearn Gradient Boosting Model

In [48]:
# scikit-learn gradient boosting with default hyperparameters and a fixed seed.
SKGB = GradientBoostingClassifier(random_state=42)
SKGB.fit(x_train, y_train)

In [51]:
# Accuracy of the gradient boosting model on the training and test sets.
SKGB_train = SKGB.score(x_train, y_train)
SKGB_test = SKGB.score(x_test, y_test)

print(f"Training Score: {SKGB_train}")
print(f"Test Score: {SKGB_test}")

# XGBoost Gradient Boosting Model

In [52]:
# XGBoost classifier with a binary logistic objective and a fixed seed.
xgb_model = XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(x_train, y_train)

In [53]:
# Accuracy of the XGBoost model on the training and test sets.
xgb_train = xgb_model.score(x_train, y_train)
xgb_test = xgb_model.score(x_test, y_test)

print(f"Training Score: {xgb_train}")
print(f"Test Score: {xgb_test}")

# Light Gradient Boosting Model

In [54]:
# LightGBM classifier with default hyperparameters and a fixed seed.
lgb_model = LGBMClassifier(random_state=42)
lgb_model.fit(x_train, y_train)

In [55]:
# Accuracy of the LightGBM model on the training and test sets.
lgb_train = lgb_model.score(x_train, y_train)
lgb_test = lgb_model.score(x_test, y_test)

print(f"Training Score: {lgb_train}")
print(f"Test Score: {lgb_test}")

# SKLearn AdaBoost Model

In [56]:
# AdaBoost classifier with default hyperparameters and a fixed seed.
ab_model = AdaBoostClassifier(random_state=42)

In [57]:
# Fit AdaBoost on the scaled training data.
ab_model.fit(x_train, y_train)

In [58]:
# Accuracy of the AdaBoost model on the training and test sets.
ab_train = ab_model.score(x_train, y_train)
ab_test = ab_model.score(x_test, y_test)

print(f"Training Score: {ab_train}")
print(f"Test Score: {ab_test}")

# CatBoost Classifier Model

In [59]:
# CatBoost classifier with default hyperparameters; verbose=0 silences training logs.
cb_model = CatBoostClassifier(verbose=0)

In [60]:
# Fit CatBoost on the scaled training data.
cb_model.fit(x_train, y_train)

In [61]:
# Accuracy of the CatBoost model on the training and test sets.
cb_train = cb_model.score(x_train, y_train)
cb_test = cb_model.score(x_test, y_test)

print(f"Training Score: {cb_train}")
print(f"Test Score: {cb_test}")

# Naive Bayes Model

In [40]:
# Bernoulli naive Bayes with default settings, fitted on the scaled training data.
BNB_model = BernoulliNB()
BNB_model.fit(x_train, y_train)

In [41]:
# Accuracy of the naive Bayes model on the training and test sets.
BNB_train = BNB_model.score(x_train, y_train)
BNB_test = BNB_model.score(x_test, y_test)

print(f"Training Score: {BNB_train}")
print(f"Test Score: {BNB_test}")

# Voting Model 

In [70]:
# Hard-voting ensemble: each listed model casts one vote per sample.
# xgb_model is listed once only; it previously appeared under two names
# ("XGBClassifier" and "XGBoost"), which gave XGBoost two votes.
v_clf = VotingClassifier(
    estimators=[
        ('KNeighborsClassifier', KNN_model),
        ("XGBClassifier", xgb_model),
        ("RandomForestClassifier", rf),
        ("DecisionTree", dt),
        ("LightGB", lgb_model),
        ("AdaBoost", ab_model),
        ("Catboost", cb_model),
    ],
    voting="hard",
)

In [73]:
# Fit the voting ensemble on the scaled training data.
v_clf.fit(x_train, y_train)

In [74]:
# Accuracy of the voting ensemble on the training and test sets.
voting_train = v_clf.score(x_train, y_train)
voting_test = v_clf.score(x_test, y_test)

print(f"Training Score: {voting_train}")
print(f"Test Score: {voting_test}")

# SVM Model

Smaller training and testing sets for hyperparameter tuning

In [19]:
# Take the first 40% of each split so the SVM search runs in reasonable time.
# The test subset is sized from the test split itself: slicing it with the
# train-derived size (40% of the larger training set) never shortened it.
n_size = int(len(x_train)*0.4)
n_test_size = int(len(x_test)*0.4)
x_train_sm, y_train_sm = x_train[:n_size], y_train[:n_size]
x_test_sm, y_test_sm = x_test[:n_test_size], y_test[:n_test_size]
print(f"Small training set: {len(x_train_sm)}")
print(f"Small test set: {len(x_test_sm)}")

In [None]:
def objective(trial):
    """Optuna objective for the SVM family: validation accuracy for a sampled kernel and C.

    The model is fitted and scored on splits of the reduced training subset,
    so the held-out test set is never used to choose hyperparameters.

    Args:
        trial: Optuna trial used to sample ``kernel`` (``'linear'``, ``'rbf'``,
            ``'poly'`` or ``'linearSVC'``), ``c`` in [0.02, 1.0] and, for the
            polynomial kernel only, ``degree`` in [2, 10].

    Returns:
        Accuracy of the fitted classifier on the validation split.
    """
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'linearSVC'])
    c = trial.suggest_float('c', 0.02, 1.0, step=0.02)

    if kernel == 'linearSVC':
        classifier_obj = LinearSVC(C=c)
    elif kernel == 'poly':
        degree = trial.suggest_int('degree', 2, 10)
        classifier_obj = SVC(kernel=kernel, C=c, degree=degree)
    else:
        # 'linear' and 'rbf' need no extra parameters.
        classifier_obj = SVC(kernel=kernel, C=c)

    # Fixed seed so every trial is compared on the same split.
    x_fit, x_val, y_fit, y_val = train_test_split(
        x_train_sm, y_train_sm, test_size=0.2, random_state=42
    )
    classifier_obj.fit(x_fit, y_fit)
    accuracy = classifier_obj.score(x_val, y_val)
    return accuracy

In [None]:
# Search the SVM kernel and C with Optuna. The sampler is seeded so the
# 30-trial search gives the same result on every Restart & Run All.
study_svm = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_svm.optimize(objective, n_trials=30)
print(study_svm.best_trial)

In [None]:
# Rebuild the best SVM variant found by the study and fit it on the full training set.
# `kernel` is read from the study here: it was previously referenced in the
# elif branches without ever being defined at cell scope (NameError).
svm_best_params = study_svm.best_trial.params
kernel = svm_best_params['kernel']

if kernel in ['linear', 'rbf']:
    SVM_model = SVC(kernel=kernel, C=svm_best_params['c'])
elif kernel == 'linearSVC':
    SVM_model = LinearSVC(C=svm_best_params['c'])
elif kernel == 'poly':
    SVM_model = SVC(kernel=kernel, C=svm_best_params['c'], degree=svm_best_params['degree'])
else:
    # Fail loudly rather than fitting a stale or undefined model.
    raise ValueError(f"Unexpected kernel from study: {kernel!r}")

SVM_model.fit(x_train, y_train)

In [None]:
# Accuracy of the SVM model on the training and test sets.
SVM_train = SVM_model.score(x_train, y_train)
SVM_test = SVM_model.score(x_test, y_test)

print(f"Training Score: {SVM_train}")
print(f"Test Score: {SVM_test}")

# Summary

In [75]:
# Collect every model's (train, test) accuracy, keyed by its display label.
model_scores = {
    "KNN": (KNN_train, KNN_test),
    "Logistic Regression": (lg_train, lg_test),
    "Decision Tree": (dt_train, dt_test),
    "Random Forest": (rf_train, rf_test),
    "GBM": (SKGB_train, SKGB_test),
    "XGBM": (xgb_train, xgb_test),
    "Adaboost": (ab_train, ab_test),
    "light GBM": (lgb_train, lgb_test),
    "CatBoost": (cb_train, cb_test),
    "Naive Baye Model": (BNB_train, BNB_test),
    "Voting": (voting_train, voting_test),
    "SVM": (SVM_train, SVM_test),
}

# One table row per model, in the order listed above.
data = [[label, train, test] for label, (train, test) in model_scores.items()]
col_names = ["Model", "Train Score", "Test Score"]
print(tabulate(data, headers=col_names, tablefmt="fancy_grid"))