In [1]:
# Standard operational package imports
import numpy as np
import pandas as pd

# Important imports for modeling and evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import sklearn.metrics as metrics

# Visualization package imports
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_original = pd.read_csv("Invistico_Airline.csv")

In [3]:
df_original.shape

(129880, 23)

In [4]:
df_original.head(n = 10)

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0
5,satisfied,Male,Loyal Customer,30,Personal Travel,Eco,1894,0,0,0,...,2,2,5,4,5,5,4,2,0,0.0
6,satisfied,Female,Loyal Customer,66,Personal Travel,Eco,227,0,0,0,...,5,5,5,0,5,5,5,3,17,15.0
7,satisfied,Male,Loyal Customer,10,Personal Travel,Eco,1812,0,0,0,...,2,2,3,3,4,5,4,2,0,0.0
8,satisfied,Female,Loyal Customer,56,Personal Travel,Business,73,0,0,0,...,5,4,4,0,1,5,4,4,0,0.0
9,satisfied,Male,Loyal Customer,22,Personal Travel,Eco,1556,0,0,0,...,2,2,2,4,5,3,4,2,30,26.0


In [5]:
df_original.dtypes

satisfaction                          object
Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival De

In [6]:
df_original["Class"].unique()

array(['Eco', 'Business', 'Eco Plus'], dtype=object)

In [7]:
df_original['satisfaction'].value_counts(dropna = False)

satisfaction
satisfied       71087
dissatisfied    58793
Name: count, dtype: int64

In [8]:
df_original.isnull().sum()

satisfaction                           0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Seat comfort                           0
Departure/Arrival time convenient      0
Food and drink                         0
Gate location                          0
Inflight wifi service                  0
Inflight entertainment                 0
Online support                         0
Ease of Online booking                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Cleanliness                            0
Online boarding                        0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
dtype: int64

In [9]:
df_subset = df_original.dropna(axis=0).reset_index(drop = True)

In [10]:
df_subset.isna().sum()

satisfaction                         0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Seat comfort                         0
Departure/Arrival time convenient    0
Food and drink                       0
Gate location                        0
Inflight wifi service                0
Inflight entertainment               0
Online support                       0
Ease of Online booking               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Cleanliness                          0
Online boarding                      0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
dtype: int64

In [11]:
df_subset.dtypes

satisfaction                          object
Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival De

In [12]:
df_subset['Class'] = df_subset['Class'].map({"Business": 3, "Eco Plus": 2, "Eco": 1}) 
df_subset['satisfaction'] = df_subset['satisfaction'].map({"satisfied": 1, "dissatisfied": 0})
df_subset = pd.get_dummies(df_subset, drop_first = True)

In [13]:
df_subset.dtypes

satisfaction                           int64
Age                                    int64
Class                                  int64
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
Gender_Male                             bool
Customer Type_disloyal Customer         bool
Type of Tr

In [14]:
y = df_subset["satisfaction"]

X = df_subset.copy()
X = X.drop("satisfaction", axis = 1)

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=100)

In [17]:
from sklearn.linear_model import LogisticRegression
sgd_clf = LogisticRegression(solver='saga',max_iter=3,C=100)
sgd_clf.fit(X_train, y_train)
y_pred = sgd_clf.predict(X_test)



In [18]:
from sklearn.metrics import accuracy_score
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 83.42%


In [19]:
X_sl, _ , y_sl, _ = train_test_split(X, y, train_size=450, stratify=y, random_state=42)

In [20]:
X_sl.shape

(450, 22)

In [21]:
X_train,X_test,y_train,y_test=train_test_split(X_sl,y_sl,test_size=0.2,stratify=y_sl,random_state=90)
X_train1=pd.DataFrame(X_train)
X_train1.to_csv('X_train1.csv', index=False)
X_test1=pd.DataFrame(X_test)
X_test1.to_csv('X_test1.csv', index=False)
y_train1=pd.DataFrame(y_train)
y_train1.to_csv('y_train1.csv', index=False)
y_test1=pd.DataFrame(y_test)
y_test1.to_csv('y_test1.csv', index=False)

In [22]:
X_train.shape

(360, 22)

In [23]:
sgd_clf2 = LogisticRegression(solver='saga',max_iter=3,C=100)
sgd_clf2.fit(X_train, y_train)
sgd_clf2.coef_[0]



array([ 0.13922645,  0.20102318, -0.15402319,  0.19230113, -0.42576862,
       -0.18782872,  0.08231997, -0.24123479,  0.73666786, -0.04578996,
        0.39831605,  0.38825462,  0.30858021,  0.07640786,  0.13300578,
        0.17870093,  0.09953229, -0.1220103 , -0.14077694, -0.58155478,
       -0.74801002, -0.24749691])

In [24]:
y_pred = sgd_clf2.predict(X_test)

In [25]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 81.11%


In [26]:
X_sl2, _ , y_sl2, _ = train_test_split(X, y, train_size=850, stratify=y, random_state=42)

In [27]:
X_sl2.shape

(850, 22)

In [28]:
X_train,X_test,y_train,y_test=train_test_split(X_sl2,y_sl2,test_size=0.2,stratify=y_sl2,random_state=90)
X_train2=pd.DataFrame(X_train)
X_train2.to_csv('X_train2.csv', index=False)
X_test2=pd.DataFrame(X_test)
X_test2.to_csv('X_test2.csv', index=False)
y_train2=pd.DataFrame(y_train)
y_train2.to_csv('y_train2.csv', index=False)
y_test2=pd.DataFrame(y_test)
y_test2.to_csv('y_test2.csv', index=False)
# X_train2.to_csv('X_train2.csv', index=False)
# X_test.to_csv('X_test2.csv', index=False)
# y_train.to_csv('y_train2.csv', index=False)
# y_test.to_csv('y_test2.csv', index=False)

In [29]:
X_train.shape

(680, 22)

In [30]:
sgd_clf = LogisticRegression(solver='saga',max_iter=3,C=100)
sgd_clf.fit(X_train, y_train)
sgd_clf.coef_[0]



array([ 0.05599235,  0.20078314,  0.00493653,  0.21589936, -0.11669145,
        0.07315779,  0.01100077,  0.03407647,  0.39999399,  0.16768783,
        0.25583593,  0.30492755,  0.18004541,  0.08425074,  0.18415194,
        0.11867163,  0.11202014, -0.05487999, -0.06896227, -0.27170596,
       -0.35520953, -0.09312152])

In [31]:
y_pred = sgd_clf.predict(X_test)

In [32]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 81.76%


In [33]:
X_sl3, _ , y_sl3, _ = train_test_split(X, y, train_size=1350, stratify=y, random_state=42)

In [34]:
X_sl3.shape

(1350, 22)

In [35]:
X_train,X_test,y_train,y_test=train_test_split(X_sl3,y_sl3,test_size=0.2,stratify=y_sl3,random_state=90)
X_train3=pd.DataFrame(X_train)
X_train3.to_csv('X_train3.csv', index=False)
X_test3=pd.DataFrame(X_test)
X_test3.to_csv('X_test3.csv', index=False)
y_train3=pd.DataFrame(y_train)
y_train3.to_csv('y_train3.csv', index=False)
y_test3=pd.DataFrame(y_test)
y_test3.to_csv('y_test3.csv', index=False)
# X_train.to_csv('X_train3.csv', index=False)
# X_test.to_csv('X_test3.csv', index=False)
# y_train.to_csv('y_train3.csv', index=False)
# y_test.to_csv('y_test3.csv', index=False)

In [36]:
X_train.shape

(1080, 22)

In [37]:
sgd_clf = LogisticRegression(solver='saga',max_iter=3,C=100)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
sgd_clf.fit(X_train, y_train)
sgd_clf.coef_[0]



array([ 0.05086933,  0.33370901, -0.05408693,  0.230389  , -0.25393931,
       -0.03450433,  0.01979671, -0.01801603,  0.59996078,  0.14341444,
        0.27083355,  0.38814898,  0.17471565,  0.10495869,  0.24950877,
        0.1637835 ,  0.19294404, -0.11600009, -0.14051105, -0.40352234,
       -0.46054809, -0.19935565])

In [38]:
y_pred = sgd_clf.predict(X_test)

In [39]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 81.11%
