### Importing Libraries

In [67]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datasist.structdata import detect_outliers
from sklearn.pipeline import Pipeline

In [16]:
! pip install datasist

Collecting datasist
  Downloading datasist-1.5.3-py3-none-any.whl (33 kB)


Installing collected packages: datasist
Successfully installed datasist-1.5.3


### Loading Data

In [6]:
raw_data=pd.read_csv('../data/interim/train_airline.csv',index_col='Unnamed: 0')
df=raw_data.copy()

In [7]:
df

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,94171,Female,disloyal Customer,23,Business travel,Eco,192,2,1,2,...,2,3,1,4,2,3,2,3,0.0,neutral or dissatisfied
103900,73097,Male,Loyal Customer,49,Business travel,Business,2347,4,4,4,...,5,5,5,5,5,5,4,0,0.0,satisfied
103901,68825,Male,disloyal Customer,30,Business travel,Business,1995,1,1,1,...,4,3,2,4,5,5,4,7,14.0,neutral or dissatisfied
103902,54173,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,1,...,1,4,5,1,5,4,1,0,0.0,neutral or dissatisfied


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   Gender                             103904 non-null  object 
 2   Customer Type                      103904 non-null  object 
 3   Age                                103904 non-null  int64  
 4   Type of Travel                     103904 non-null  object 
 5   Class                              103904 non-null  object 
 6   Flight Distance                    103904 non-null  int64  
 7   Inflight wifi service              103904 non-null  int64  
 8   Departure/Arrival time convenient  103904 non-null  int64  
 9   Ease of Online booking             103904 non-null  int64  
 10  Gate location                      103904 non-null  int64  
 11  Food and drink                     1039

In [9]:
df.id.nunique()

103904

In [10]:
drop_columns=['id']

In [12]:
df.Gender.unique()

array(['Male', 'Female'], dtype=object)

In [13]:
df.columns=df.columns.str.lower().str.replace(' ','_')
df.columns

Index(['id', 'gender', 'customer_type', 'age', 'type_of_travel', 'class',
       'flight_distance', 'inflight_wifi_service',
       'departure/arrival_time_convenient', 'ease_of_online_booking',
       'gate_location', 'food_and_drink', 'online_boarding', 'seat_comfort',
       'inflight_entertainment', 'on-board_service', 'leg_room_service',
       'baggage_handling', 'checkin_service', 'inflight_service',
       'cleanliness', 'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'satisfaction'],
      dtype='object')

In [14]:
df.customer_type.unique()

array(['Loyal Customer', 'disloyal Customer'], dtype=object)

In [15]:
df.describe()

Unnamed: 0,id,age,flight_distance,inflight_wifi_service,departure/arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,seat_comfort,inflight_entertainment,on-board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes
count,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103594.0
mean,64924.210502,39.379706,1189.448375,2.729683,3.060296,2.756901,2.976883,3.202129,3.250375,3.439396,3.358158,3.382363,3.351055,3.631833,3.30429,3.640428,3.286351,14.815618,15.178678
std,37463.812252,15.114964,997.147281,1.327829,1.525075,1.398929,1.277621,1.329533,1.349509,1.319088,1.332991,1.288354,1.315605,1.180903,1.265396,1.175663,1.312273,38.230901,38.698682
min,1.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,32533.75,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,64856.5,40.0,843.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
75%,97368.25,51.0,1743.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
max,129880.0,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


In [18]:
px.box(df,'flight_distance')

In [20]:
indexes=detect_outliers(df,0,['flight_distance'])
len(indexes)

2291

In [21]:
df_fligh_distance_outliers=df[df.index.isin(indexes)]

In [22]:
df_fligh_distance_outliers

Unnamed: 0,id,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure/arrival_time_convenient,ease_of_online_booking,...,inflight_entertainment,on-board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
80,73302,Male,Loyal Customer,26,Business travel,Business,3960,1,1,1,...,4,4,2,5,4,4,4,45,48.0,satisfied
173,101275,Male,Loyal Customer,52,Business travel,Business,3747,5,5,5,...,4,4,4,4,5,4,5,24,20.0,satisfied
201,66800,Female,Loyal Customer,43,Business travel,Business,3854,5,5,5,...,5,5,5,5,5,5,3,0,0.0,satisfied
215,23328,Female,Loyal Customer,38,Business travel,Business,3753,2,2,2,...,4,4,4,4,4,4,1,0,0.0,satisfied
379,85109,Male,Loyal Customer,46,Business travel,Business,3995,4,4,4,...,5,5,5,5,5,5,4,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103565,68522,Female,Loyal Customer,59,Business travel,Business,3881,2,2,2,...,2,2,2,2,5,2,3,0,0.0,satisfied
103648,38750,Male,Loyal Customer,43,Business travel,Business,3890,4,4,2,...,5,5,5,5,2,5,2,0,0.0,satisfied
103727,35286,Female,Loyal Customer,25,Business travel,Business,3990,3,3,2,...,3,1,4,2,4,3,3,0,12.0,neutral or dissatisfied
103865,46017,Male,Loyal Customer,35,Business travel,Business,3795,5,5,5,...,2,2,2,2,2,2,1,0,0.0,satisfied


In [23]:
df_fligh_distance_outliers.flight_distance.describe()

count    2291.000000
mean     3886.820602
std       152.824681
min      3737.000000
25%      3802.000000
50%      3871.000000
75%      3937.000000
max      4983.000000
Name: flight_distance, dtype: float64

In [25]:
df.drop(indexes,axis=0,inplace=True)

In [26]:
df

Unnamed: 0,id,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure/arrival_time_convenient,ease_of_online_booking,...,inflight_entertainment,on-board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,94171,Female,disloyal Customer,23,Business travel,Eco,192,2,1,2,...,2,3,1,4,2,3,2,3,0.0,neutral or dissatisfied
103900,73097,Male,Loyal Customer,49,Business travel,Business,2347,4,4,4,...,5,5,5,5,5,5,4,0,0.0,satisfied
103901,68825,Male,disloyal Customer,30,Business travel,Business,1995,1,1,1,...,4,3,2,4,5,5,4,7,14.0,neutral or dissatisfied
103902,54173,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,1,...,1,4,5,1,5,4,1,0,0.0,neutral or dissatisfied


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101613 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 101613 non-null  int64  
 1   gender                             101613 non-null  object 
 2   customer_type                      101613 non-null  object 
 3   age                                101613 non-null  int64  
 4   type_of_travel                     101613 non-null  object 
 5   class                              101613 non-null  object 
 6   flight_distance                    101613 non-null  int64  
 7   inflight_wifi_service              101613 non-null  int64  
 8   departure/arrival_time_convenient  101613 non-null  int64  
 9   ease_of_online_booking             101613 non-null  int64  
 10  gate_location                      101613 non-null  int64  
 11  food_and_drink                     1016

In [29]:
df.satisfaction.unique()

array(['neutral or dissatisfied', 'satisfied'], dtype=object)

In [36]:
px.histogram(df,'age')

In [37]:
px.histogram(df,'age',color='satisfaction')

In [38]:
df.satisfaction.value_counts()

neutral or dissatisfied    58345
satisfied                  43268
Name: satisfaction, dtype: int64

In [39]:
df.columns

Index(['id', 'gender', 'customer_type', 'age', 'type_of_travel', 'class',
       'flight_distance', 'inflight_wifi_service',
       'departure/arrival_time_convenient', 'ease_of_online_booking',
       'gate_location', 'food_and_drink', 'online_boarding', 'seat_comfort',
       'inflight_entertainment', 'on-board_service', 'leg_room_service',
       'baggage_handling', 'checkin_service', 'inflight_service',
       'cleanliness', 'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'satisfaction'],
      dtype='object')

In [40]:
columns=['inflight_wifi_service',
       'departure/arrival_time_convenient', 'ease_of_online_booking',
       'gate_location', 'food_and_drink', 'online_boarding', 'seat_comfort',
       'inflight_entertainment', 'on-board_service', 'leg_room_service',
       'baggage_handling', 'checkin_service', 'inflight_service',
       'cleanliness']

df['overall_scoring']=round(df[columns].sum(axis=1)/len(columns)).astype(int)
df

Unnamed: 0,id,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure/arrival_time_convenient,ease_of_online_booking,...,on-board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction,overall_scoring
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,4,3,4,4,5,5,25,18.0,neutral or dissatisfied,4
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,5,3,1,4,1,1,6.0,neutral or dissatisfied,2
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,4,3,4,4,4,5,0,0.0,satisfied,4
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,5,3,1,4,2,11,9.0,neutral or dissatisfied,3
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,4,4,3,3,3,0,0.0,satisfied,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,94171,Female,disloyal Customer,23,Business travel,Eco,192,2,1,2,...,3,1,4,2,3,2,3,0.0,neutral or dissatisfied,2
103900,73097,Male,Loyal Customer,49,Business travel,Business,2347,4,4,4,...,5,5,5,5,5,4,0,0.0,satisfied,4
103901,68825,Male,disloyal Customer,30,Business travel,Business,1995,1,1,1,...,3,2,4,5,5,4,7,14.0,neutral or dissatisfied,3
103902,54173,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,1,...,4,5,1,5,4,1,0,0.0,neutral or dissatisfied,2


In [41]:
df.type_of_travel.unique()

array(['Personal Travel', 'Business travel'], dtype=object)

In [44]:
df['class'].unique()

array(['Eco Plus', 'Business', 'Eco'], dtype=object)

In [46]:
cat_columns=df.select_dtypes(exclude=np.number).columns.to_list()

In [51]:
cat_columns

['gender', 'customer_type', 'type_of_travel', 'class', 'satisfaction']

In [48]:
df=pd.get_dummies(df,columns=cat_columns,drop_first=True)

In [50]:
df.columns

Index(['id', 'age', 'flight_distance', 'inflight_wifi_service',
       'departure/arrival_time_convenient', 'ease_of_online_booking',
       'gate_location', 'food_and_drink', 'online_boarding', 'seat_comfort',
       'inflight_entertainment', 'on-board_service', 'leg_room_service',
       'baggage_handling', 'checkin_service', 'inflight_service',
       'cleanliness', 'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'overall_scoring', 'gender_Male', 'customer_type_disloyal Customer',
       'type_of_travel_Personal Travel', 'class_Eco', 'class_Eco Plus',
       'satisfaction_satisfied'],
      dtype='object')

In [52]:
df

Unnamed: 0,id,age,flight_distance,inflight_wifi_service,departure/arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,seat_comfort,...,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,overall_scoring,gender_Male,customer_type_disloyal Customer,type_of_travel_Personal Travel,class_Eco,class_Eco Plus,satisfaction_satisfied
0,70172,13,460,3,4,3,1,5,3,5,...,5,25,18.0,4,1,0,1,0,1,0
1,5047,25,235,3,2,3,3,1,3,1,...,1,1,6.0,2,1,1,0,0,0,0
2,110028,26,1142,2,2,2,2,5,5,5,...,5,0,0.0,4,0,0,0,0,0,1
3,24026,25,562,2,5,5,5,2,2,2,...,2,11,9.0,3,0,0,0,0,0,0
4,119299,61,214,3,3,3,3,4,5,5,...,3,0,0.0,4,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,94171,23,192,2,1,2,3,2,2,2,...,2,3,0.0,2,0,1,0,1,0,0
103900,73097,49,2347,4,4,4,4,2,4,5,...,4,0,0.0,4,1,0,0,0,0,1
103901,68825,30,1995,1,1,1,3,4,1,5,...,4,7,14.0,3,1,1,0,0,0,0
103902,54173,22,1000,1,1,1,5,1,1,1,...,1,0,0.0,2,0,1,0,1,0,0


In [56]:
drop_columns.extend(columns)

In [59]:
drop_columns.remove('id')
drop_columns

['inflight_wifi_service',
 'departure/arrival_time_convenient',
 'ease_of_online_booking',
 'gate_location',
 'food_and_drink',
 'online_boarding',
 'seat_comfort',
 'inflight_entertainment',
 'on-board_service',
 'leg_room_service',
 'baggage_handling',
 'checkin_service',
 'inflight_service',
 'cleanliness']

In [60]:
df.drop(drop_columns,axis=1,inplace=True)

In [61]:
df

Unnamed: 0,age,flight_distance,departure_delay_in_minutes,arrival_delay_in_minutes,overall_scoring,gender_Male,customer_type_disloyal Customer,type_of_travel_Personal Travel,class_Eco,class_Eco Plus,satisfaction_satisfied
0,13,460,25,18.0,4,1,0,1,0,1,0
1,25,235,1,6.0,2,1,1,0,0,0,0
2,26,1142,0,0.0,4,0,0,0,0,0,1
3,25,562,11,9.0,3,0,0,0,0,0,0
4,61,214,0,0.0,4,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
103899,23,192,3,0.0,2,0,1,0,1,0,0
103900,49,2347,0,0.0,4,1,0,0,0,0,1
103901,30,1995,7,14.0,3,1,1,0,0,0,0
103902,22,1000,0,0.0,2,0,1,0,1,0,0


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101613 entries, 0 to 103903
Data columns (total 11 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   age                              101613 non-null  int64  
 1   flight_distance                  101613 non-null  int64  
 2   departure_delay_in_minutes       101613 non-null  int64  
 3   arrival_delay_in_minutes         101307 non-null  float64
 4   overall_scoring                  101613 non-null  int64  
 5   gender_Male                      101613 non-null  uint8  
 6   customer_type_disloyal Customer  101613 non-null  uint8  
 7   type_of_travel_Personal Travel   101613 non-null  uint8  
 8   class_Eco                        101613 non-null  uint8  
 9   class_Eco Plus                   101613 non-null  uint8  
 10  satisfaction_satisfied           101613 non-null  uint8  
dtypes: float64(1), int64(4), uint8(6)
memory usage: 5.2 MB


In [74]:
df['arrival_delay_in_minutes'].fillna(df['arrival_delay_in_minutes'].median(),inplace=True)

In [75]:
x=df.drop('satisfaction_satisfied',axis=1)
y=df['satisfaction_satisfied']

In [76]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y)

In [77]:
pipeline=Pipeline([('std_scaler',StandardScaler())])
x_train_scaled=pipeline.fit_transform(x_train)
x_test_scaled=pipeline.transform(x_test)

In [81]:
models={
    'LR':LogisticRegression(),
    'NB':GaussianNB(),
    'DT':DecisionTreeClassifier(),
    'KNN':KNeighborsClassifier(n_neighbors=5),
    'RF':RandomForestClassifier(n_estimators=25)
}

In [82]:
for name,model in models.items():
    print(f'using{name}')
    model.fit(x_train_scaled,y_train)
    y_pred=model.predict(x_train_scaled)
    y_test_pred=model.predict(x_test_scaled)
    print('training accuracy: ',accuracy_score(y_train,y_pred))
    print('testing accuracy: ',accuracy_score(y_test,y_test_pred))
    print('--'*20)

usingLR
training accuracy:  0.8373108623446919
testing accuracy:  0.838606504945136
----------------------------------------
usingNB
training accuracy:  0.8017468323286997
testing accuracy:  0.8047532352507012
----------------------------------------
usingDT
training accuracy:  0.9980932464017714
testing accuracy:  0.7784775869704276
----------------------------------------
usingKNN
training accuracy:  0.8709927420346906
testing accuracy:  0.8296511341829454
----------------------------------------
usingRF
training accuracy:  0.9941198179357854
testing accuracy:  0.828765438173498
----------------------------------------


In [85]:
model=RandomForestClassifier(n_estimators=15)
model.fit(x_train_scaled,y_train)

In [88]:
model.predict(x_test_scaled)[0]

0

In [94]:
x_test.iloc[5].values

array([3.40e+01, 1.94e+03, 9.00e+01, 9.00e+01, 3.00e+00, 1.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00])

In [95]:
data=[3.40e+01, 1.94e+03, 9.00e+01, 9.00e+01, 3.00e+00, 1.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00]

In [96]:
data_scaled=scaler.transform([data])


X does not have valid feature names, but StandardScaler was fitted with feature names



In [97]:
data_scaled

array([[-0.35145072,  0.87800551,  1.95371937,  1.92372001, -0.32049474,
         1.01592204, -0.47751421, -0.68303321, -0.92331479, -0.28247522]])

In [99]:
model.predict(data_scaled)

array([1], dtype=uint8)

In [100]:
import joblib

In [102]:
joblib.dump(model,'../models/model.h5')

['../models/model.h5']

In [103]:
external_model=joblib.load('../models/model.h5')

In [109]:
data=[3.40e+01, 1.94e+03, 9.00e+01, 9.00e+01, 3.00e+00, 1.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00]
if external_model.predict(scaler.transform([data]))[0]==1:
    print('sss')
else:
    print('oooo')

sss



X does not have valid feature names, but StandardScaler was fitted with feature names

