In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports & Configs

In [None]:
%pip install numpy pandas seaborn matplotlib optuna sklearn xgboost catboost lightgbm > /dev/null 2>&1

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import warnings
import optuna
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree  import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Data Preprocessing

In [None]:
# Read the passenger-satisfaction CSV from the Kaggle input mount and preview it.
csv_path = '/kaggle/input/airline-passenger-satisfaction/airline_passenger_satisfaction.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# First five rows.
df.head()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Summary (count / unique / top / freq) of the string columns.
df.describe(include='object')

## Missing Data

In [None]:
# Report each column that contains nulls, with its absolute count and its
# share of all rows.
total = df.shape[0]
null_counts = df.isnull().sum()
missing_columns = [col for col in df.columns if null_counts[col] > 0]
for col in missing_columns:
    null_count = null_counts[col]
    per = (null_count / total) * 100
    print(f"{col}: {null_count} ({round(per, 3)}%)")

Missing values are very few, so we can simply fill them with the column median.

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected Series: the chained in-place form is deprecated in recent pandas and
# does not update `df` when Copy-on-Write is enabled.
df['Arrival Delay'] = df['Arrival Delay'].fillna(df['Arrival Delay'].median())

## Duplicates 

In [None]:
# Count rows that are exact duplicates of an earlier row.
duplicate_rows = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

Great! No duplicates

## Outliers 

In [None]:
# For each of the three continuous columns inspected for outliers, draw a box
# plot (top) and a scatter of the value against Satisfaction (bottom).
outlier_candidates = ('Flight Distance', 'Departure Delay', 'Arrival Delay')
for col in (name for name in df.columns if name in outlier_candidates):
    print('-' * 30)
    print(f"{col}:")
    fig, (box_ax, scatter_ax) = plt.subplots(2, 1, figsize=(12, 8))
    sns.boxplot(x=df[col], ax=box_ax)
    sns.scatterplot(data=df, x=df[col], y=df['Satisfaction'], ax=scatter_ax)
    plt.show()

In [None]:
# Exclusive upper bound per column; rows at or above a bound count as outliers.
valid_data = {'Flight Distance': 4000, 'Departure Delay': 500, 'Arrival Delay': 500}

for k, limit in valid_data.items():
    print(f'Number of outliers for {k} is {(df[k] >= limit).sum()}')

# Build one combined mask instead of re-slicing df once per column, and take an
# explicit copy so that later column assignments write to an independent frame
# rather than to a slice of the original (avoids SettingWithCopyWarning).
within_limits = pd.Series(True, index=df.index)
for k, limit in valid_data.items():
    within_limits &= df[k] < limit
df = df[within_limits].copy()

# EDA

In [None]:
# 9x2 grid of violin plots: one panel per numeric column found at positions
# 1-23, filled left-to-right, top-to-bottom.
f, axes = plt.subplots(9, 2, figsize=(15, 30), sharex=False)
numeric_positions = [pos for pos in range(1, 24) if is_numeric_dtype(df.iloc[:, pos])]
for slot, i in enumerate(numeric_positions):
    idx, idy = divmod(slot, 2)
    sns.violinplot(x=df.iloc[:, i], color="skyblue", ax=axes[idx, idy])
plt.show()

In [None]:
# 9x2 grid of histogram + KDE plots: one panel per numeric column found at
# positions 1-23, filled left-to-right, top-to-bottom.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True / stat="density" is the replacement given in seaborn's migration
# guide (alpha, edgecolor and cut=3 reproduce distplot's look).
f, axes = plt.subplots(9, 2, figsize=(15, 30), sharex=False)
numeric_positions = [pos for pos in range(1, 24) if is_numeric_dtype(df.iloc[:, pos])]
for slot, i in enumerate(numeric_positions):
    idx, idy = divmod(slot, 2)
    plot = sns.histplot(
        x=df.iloc[:, i],
        kde=True,
        stat="density",
        kde_kws=dict(cut=3),
        alpha=0.4,
        edgecolor=(1, 1, 1, 0.4),
        color="skyblue",
        ax=axes[idx, idy],
    )
    plot.set(xlabel=df.columns[i])
plt.show()

In [None]:
# 6x3 grid of bar plots: mean of each numeric column (positions 7-23) per
# Satisfaction class, filled left-to-right, top-to-bottom.
f, axes = plt.subplots(6, 3, figsize=(15, 30), sharex=False)

numeric_columns = [df.columns[pos] for pos in range(7, 24) if is_numeric_dtype(df.iloc[:, pos])]
for slot, column in enumerate(numeric_columns):
    idx, idy = divmod(slot, 3)
    sns.barplot(x="Satisfaction", y=column, data=df, ax=axes[idx, idy])

plt.show()

In [None]:
# Correlation is only defined for numeric columns. The string columns have not
# been label-encoded yet at this point, and pandas >= 2.0 raises if they are
# passed to DataFrame.corr(), so select the numeric columns explicitly.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(40, 30))
sns.heatmap(numeric_corr, annot=True)
plt.show()

# Interactive alternative (requires plotly):
# import plotly.express as px
# fig = px.imshow(numeric_corr, text_auto=True, aspect="auto")
# fig.show()

## Observations

- The share of neutral or dissatisfied customers increased with the increase in Departure Delay 
- Both the demand for Business class and the satisfaction of Business class passengers increased with the increase in flight distance 
- The customer satisfaction increased with the increase of Check-in Service 
- The share of neutral or dissatisfied customers increased with the increase in Arrival Delay 
- The customer satisfaction increased with the increase of Ease of Online Booking 
- The customer satisfaction increased with the increase of Seat Comfort 
- The customer satisfaction increased with the increase of Online Boarding 
- The customer satisfaction increased with the increase of Leg Room Service 
- The customer satisfaction increased with the increase of Cleanliness 
- The customer satisfaction increased with the increase of In-flight Wifi Service 
- The customer satisfaction increased with the increase of On-board Service 
- The customer satisfaction increased with the increase of In-flight Service 
- The customer satisfaction increased with the increase of Food and Drink 
- The customer satisfaction increased with the increase of In-flight Entertainment 
- The customer satisfaction increased with the increase of Baggage Handling 

# Label Encoding

In [None]:
# Integer-encode every object-dtype column (presumably this includes the
# target, `Satisfaction` -- confirm against df.info()).
# Each fitted encoder is kept so the integer codes can be mapped back to the
# original labels via label_encoders[col].classes_ or .inverse_transform(...).
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col])
        label_encoders[col] = label_encoder

In [None]:
# Drop `ID` so it is not used as a feature (x is later built from every column
# except the target). Reassigning with errors='ignore' keeps the cell safe to
# re-run, whereas the in-place drop raised KeyError on a second run.
df = df.drop(columns=['ID'], errors='ignore')
df.head()

# Split and scale data

In [None]:
# Target is the Satisfaction column; features are every other column.
y = df['Satisfaction']
x = df.drop(columns=['Satisfaction'])

In [None]:
# 75/25 train/test split on the underlying NumPy arrays, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x.values,
    y.values,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fit the scaler on the training data only, then apply those same training
# statistics to the test data. Calling fit_transform on the test set would
# re-estimate mean/variance from test data, leaking test information and
# putting train and test on different scales.
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

# Modeling

### K Nearest Neighbors (KNN) classification model

In [None]:
# Hyper-parameters are selected on a validation slice carved out of the training
# data, so the test set stays untouched until the final evaluation.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42
)


def objective(trial):
    """Optuna objective for KNN: validation accuracy for a sampled n_neighbors.

    Args:
        trial: Optuna trial used to sample `KNN_n_neighbors` in [2, 16].

    Returns:
        Accuracy of the fitted classifier on the validation slice.
    """
    n_neighbors = trial.suggest_int('KNN_n_neighbors', 2, 16, log=False)
    classifier_obj = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier_obj.fit(x_fit, y_fit)
    return classifier_obj.score(x_val, y_val)

In [None]:
# Seed the sampler so the sampled n_neighbors (and therefore the "best" trial)
# is the same on every Restart & Run All.
# NOTE(review): n_trials=1 evaluates a single sampled value, so no real search
# happens here; raise it if runtime allows.
study_KNN = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_KNN.optimize(objective, n_trials=1)
print(study_KNN.best_trial)

In [None]:
# Refit KNN on the full training set with the best n_neighbors from the study
# and report accuracy on both splits.
best_k = study_KNN.best_trial.params['KNN_n_neighbors']
KNN_model = KNeighborsClassifier(n_neighbors=best_k).fit(x_train, y_train)

for split_name, features, target in (("Train", x_train, y_train), ("Test", x_test, y_test)):
    print(f"{split_name} Score: {KNN_model.score(features, target)}")

### Logistic Regression Model

In [None]:
# Logistic regression baseline with default hyper-parameters and a fixed seed.
lg_model = LogisticRegression(random_state = 42)
lg_model.fit(x_train, y_train)

In [None]:
# Accuracy of the logistic regression model on the training and test splits.
for split_name, features, target in (("Training", x_train, y_train), ("Test", x_test, y_test)):
    print(f"{split_name} Score: {lg_model.score(features, target)}")

### Decision Tree

In [None]:
# Hyper-parameters are selected on a validation slice carved out of the training
# data, so the test set stays untouched until the final evaluation.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42
)


def objective(trial):
    """Optuna objective for the decision tree: validation accuracy.

    Args:
        trial: Optuna trial used to sample `dt_max_depth` in [2, 32] and
            `dt_max_features` in [2, number of features].

    Returns:
        Accuracy of the fitted tree on the validation slice.
    """
    dt_max_depth = trial.suggest_int('dt_max_depth', 2, 32, log=False)
    # Upper bound follows the data instead of a hard-coded feature count.
    dt_max_features = trial.suggest_int('dt_max_features', 2, x_fit.shape[1], log=False)
    # Fixed seed: with max_features < n_features the tree is randomised, and an
    # unseeded tree would make trial scores irreproducible.
    classifier_obj = DecisionTreeClassifier(
        max_features=dt_max_features,
        max_depth=dt_max_depth,
        random_state=42,
    )
    classifier_obj.fit(x_fit, y_fit)
    return classifier_obj.score(x_val, y_val)

In [None]:
# Seed the sampler so the 30-trial search proposes the same hyper-parameters on
# every Restart & Run All.
study_dt = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_dt.optimize(objective, n_trials=30)
print(study_dt.best_trial)

In [None]:
# Refit the decision tree on the full training set with the best parameters
# from the study. random_state is pinned, like the other models in this
# notebook, so the reported scores are reproducible.
best_dt_params = study_dt.best_trial.params
dt = DecisionTreeClassifier(
    max_features=best_dt_params['dt_max_features'],
    max_depth=best_dt_params['dt_max_depth'],
    random_state=42,
)
dt.fit(x_train, y_train)
print(f"Train Score: {dt.score(x_train, y_train)}")
print(f"Test Score: {dt.score(x_test, y_test)}")

In [None]:
# Render the fitted decision tree with filled (class-coloured) nodes on a wide
# canvas.
fig = plt.figure(figsize=(30, 12))
tree.plot_tree(dt, filled=True, ax=fig.gca());
plt.show()

In [None]:
from matplotlib import pyplot as plt

def f_importance(coef, names, top=-1, title='feature importance for dt'):
    """Draw a horizontal bar chart of the highest feature importances.

    Args:
        coef: Importance score per feature.
        names: Feature names, aligned with ``coef``.
        top: Number of highest-scoring features to show; ``-1`` shows all.
            Values larger than the number of features are clamped.
        title: Chart title.
    """
    # Highest score first; ties fall back to name order.
    ranked = sorted(zip(coef, names), reverse=True)

    # Show all features, and never ask for more bars than there are features
    # (barh/yticks would otherwise receive mismatched lengths).
    if top == -1 or top > len(ranked):
        top = len(ranked)
    ranked = ranked[:top]

    scores = [score for score, _ in ranked]
    labels = [name for _, name in ranked]
    plt.barh(range(len(ranked)), scores, align='center')
    plt.yticks(range(len(ranked)), labels)
    plt.title(title)
    plt.show()

# Feature names in the same order as the columns the model was trained on.
features_names = x.columns

# Visualize the 7 most important features of the decision tree.
# abs() is kept from the generic recipe; drop it if signed contributions are
# of interest.
f_importance(abs(dt.feature_importances_), features_names, top=7)

### Random Forest Classifier

In [None]:
# Hyper-parameters are selected on a validation slice carved out of the training
# data, so the test set stays untouched until the final evaluation.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42
)


def objective(trial):
    """Optuna objective for the random forest: validation accuracy.

    Args:
        trial: Optuna trial used to sample `rf_max_depth` in [2, 32],
            `rf_max_features` in [2, number of features] and
            `rf_n_estimators` in [3, 20].

    Returns:
        Accuracy of the fitted forest on the validation slice.
    """
    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32, log=False)
    # Upper bound follows the data instead of a hard-coded feature count.
    rf_max_features = trial.suggest_int('rf_max_features', 2, x_fit.shape[1], log=False)
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 3, 20, log=False)
    # Fixed seed so a trial's score depends only on its hyper-parameters.
    classifier_obj = RandomForestClassifier(
        max_features=rf_max_features,
        max_depth=rf_max_depth,
        n_estimators=rf_n_estimators,
        random_state=42,
    )
    classifier_obj.fit(x_fit, y_fit)
    return classifier_obj.score(x_val, y_val)

In [None]:
# Seed the sampler so the 30-trial search proposes the same hyper-parameters on
# every Restart & Run All.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective, n_trials=30)
print(study_rf.best_trial)

In [None]:
# Refit the random forest on the full training set with the best parameters
# from the study. random_state is pinned, like the other models in this
# notebook, so the reported scores are reproducible.
best_rf_params = study_rf.best_trial.params
rf = RandomForestClassifier(
    max_features=best_rf_params['rf_max_features'],
    max_depth=best_rf_params['rf_max_depth'],
    n_estimators=best_rf_params['rf_n_estimators'],
    random_state=42,
)
rf.fit(x_train, y_train)

print(f"Train Score: {rf.score(x_train, y_train)}")
print(f"Test Score: {rf.score(x_test, y_test)}")

In [None]:
from matplotlib import pyplot as plt

def f_importance(coef, names, top=-1, title='feature importance for dt'):
    """Draw a horizontal bar chart of the highest feature importances.

    Args:
        coef: Importance score per feature.
        names: Feature names, aligned with ``coef``.
        top: Number of highest-scoring features to show; ``-1`` shows all.
            Values larger than the number of features are clamped.
        title: Chart title. The default is kept for backward compatibility.
    """
    # Highest score first; ties fall back to name order.
    ranked = sorted(zip(coef, names), reverse=True)

    # Show all features, and never ask for more bars than there are features
    # (barh/yticks would otherwise receive mismatched lengths).
    if top == -1 or top > len(ranked):
        top = len(ranked)
    ranked = ranked[:top]

    scores = [score for score, _ in ranked]
    labels = [name for _, name in ranked]
    plt.barh(range(len(ranked)), scores, align='center')
    plt.yticks(range(len(ranked)), labels)
    plt.title(title)
    plt.show()

# Feature names in the same order as the columns the model was trained on.
features_names = x.columns

# Visualize the 7 most important features of the random forest. The title is
# passed explicitly; previously this chart was mislabelled as the decision
# tree's ("... for dt").
f_importance(abs(rf.feature_importances_), features_names, top=7, title='feature importance for rf')

### SKLearn Gradient Boosting

In [None]:
# scikit-learn gradient boosting with default hyper-parameters and a fixed seed.
clf = GradientBoostingClassifier(random_state=42)
clf.fit(x_train, y_train)

In [None]:
# Accuracy of the gradient boosting model on the training and test splits.
for split_name, features, target in (("Training", x_train, y_train), ("Test", x_test, y_test)):
    print(f"{split_name} Score: {clf.score(features, target)}")

### XGBoost Gradient Boosting

In [None]:
# XGBoost binary classifier (logistic objective) with a fixed seed.
xgb_model = XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(x_train, y_train)

In [None]:
# Accuracy of the XGBoost model on the training and test splits.
for split_name, features, target in (("Training", x_train, y_train), ("Test", x_test, y_test)):
    print(f"{split_name} Score: {xgb_model.score(features, target)}")

### Light Gradient Boosting

In [None]:
# LightGBM classifier with default hyper-parameters and a fixed seed.
lgb_model = LGBMClassifier(random_state=42)
lgb_model.fit(x_train, y_train)

In [None]:
# Accuracy of the LightGBM model on the training and test splits.
for split_name, features, target in (("Training", x_train, y_train), ("Test", x_test, y_test)):
    print(f"{split_name} Score: {lgb_model.score(features, target)}")

### SKLearn AdaBoost

In [None]:
# AdaBoost classifier with default hyper-parameters and a fixed seed.
ab_model = AdaBoostClassifier(random_state=42)

In [None]:
# Train AdaBoost on the scaled training split.
ab_model.fit(x_train, y_train)

In [None]:
# Accuracy of the AdaBoost model on the training and test splits.
for split_name, features, target in (("Training", x_train, y_train), ("Test", x_test, y_test)):
    print(f"{split_name} Score: {ab_model.score(features, target)}")

### CatBoost Classifier Model

In [None]:
# CatBoost classifier with training log output silenced (verbose=0).
cb_model = CatBoostClassifier(verbose=0)

In [None]:
# Train CatBoost on the scaled training split.
cb_model.fit(x_train, y_train)

In [None]:
# Accuracy of the CatBoost model on the training and test splits.
for split_name, features, target in (("Training", x_train, y_train), ("Test", x_test, y_test)):
    print(f"{split_name} Score: {cb_model.score(features, target)}")

### Voting Model 

In [None]:
# Base estimators for the voting ensemble. KNN and random forest reuse the best
# hyper-parameters found by their Optuna studies. The random forest now pins
# random_state like clf1/clf3 so the ensemble is reproducible.
rf_best_params = study_rf.best_trial.params
clf1 = LogisticRegression(random_state=42)
clf2 = KNeighborsClassifier(n_neighbors=study_KNN.best_trial.params['KNN_n_neighbors'])
clf3 = XGBClassifier(objective='binary:logistic', random_state=42)
clf4 = RandomForestClassifier(
    max_features=rf_best_params['rf_max_features'],
    max_depth=rf_best_params['rf_max_depth'],
    n_estimators=rf_best_params['rf_n_estimators'],
    random_state=42,
)

In [None]:
# Hard-voting ensemble: each base estimator casts one vote per sample and the
# majority class wins.
voting_members = [
    ("LogisticRegression", clf1),
    ('KNeighborsClassifier', clf2),
    ("XGBClassifier", clf3),
    ("RandomForestClassifier", clf4),
]
v_clf = VotingClassifier(estimators=voting_members, voting="hard")

In [None]:
# Train every base estimator of the voting ensemble on the training split.
v_clf.fit(x_train, y_train)

In [None]:
# Report the voting ensemble's accuracy. The original cell scored `cb_model`
# here, so it repeated the CatBoost numbers instead of evaluating `v_clf`.
print(f"Training Score: {v_clf.score(x_train , y_train)}")
print(f"Test Score: {v_clf.score(x_test , y_test)}")