In [1]:
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder

from xgboost import XGBClassifier
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Columns removed from both splits before modelling. Keeping the list in one
# place guarantees train and test are always trimmed identically.
UNUSED_COLUMNS = ['Unnamed: 0', 'Gate location', 'Leg room service', 'Baggage handling']

# Read each split (indexed by "id") and drop the unused columns in a single
# expression, so no DataFrame is mutated in place after it has been created.
df_train = pd.read_csv("data/train.csv", index_col="id").drop(columns=UNUSED_COLUMNS)
df_test = pd.read_csv("data/test.csv", index_col="id").drop(columns=UNUSED_COLUMNS)

In [3]:
# Integer encoding of the target: 0 = neutral or dissatisfied, 1 = satisfied.
SATISFACTION_CODES = {'neutral or dissatisfied': 0, 'satisfied': 1}

# Features are every column except the target.
X_train = df_train.drop(columns="satisfaction")
X_test = df_test.drop(columns="satisfaction")

# Series.map yields an integer Series directly, avoiding the deprecated
# silent-downcasting path of Series.replace on string data. Labels missing
# from the mapping become NaN instead of being passed through unchanged.
y_train = df_train["satisfaction"].map(SATISFACTION_CODES)
y_test = df_test["satisfaction"].map(SATISFACTION_CODES)

# Fail loudly here rather than inside the classifier if a label was not mapped.
for split_name, encoded_target in (("train", y_train), ("test", y_test)):
    if encoded_target.isna().any():
        raise ValueError(f"Unmapped or missing satisfaction label in the {split_name} split")

In [4]:
# Allowed levels for a single rating column, lowest to highest.
# NOTE(review): the levels are spelled as strings, while the sample row built
# later in this notebook passes integer ratings -- confirm that relying on the
# encoder's dtype coercion is intended.
RATING_LEVELS = ['0', '1', '2', '3', '4', '5']

# OrdinalEncoder needs one category list per input column; the ColumnTransformer
# routes 11 rating columns to this pipeline.
N_RATING_COLUMNS = 11

# Numeric features: fill gaps with the column median.
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='median')),
])

# Nominal features: fill gaps with the most frequent value, then one-hot encode.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder()),
])

# Rating features: fill gaps with the most frequent value, then encode with the
# fixed level ordering above.
ordinal_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("ordinal", OrdinalEncoder(categories=[RATING_LEVELS] * N_RATING_COLUMNS)),
])

In [5]:
# Column groups, one per preprocessing sub-pipeline.
numeric_features = [
    "Age",
    "Flight Distance",
    "Departure Delay in Minutes",
    "Arrival Delay in Minutes",
]
categorical_features = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
rating_features = [
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]

# Send each column group through its matching pipeline; columns are selected
# by name, so their position in the input frame does not matter.
preprocessor = ColumnTransformer(transformers=[
    ("numeric", numerical_pipeline, numeric_features),
    ("categoric", categorical_pipeline, categorical_features),
    ("ordinal", ordinal_pipeline, rating_features),
])

In [7]:
# Fixed XGBoost hyperparameters.
# NOTE(review): presumably the result of an earlier random search (the search
# utilities are imported but not used in the visible cells) -- confirm origin.
parameters = dict(
    learning_rate=0.17070059179090855,
    max_depth=10,
    gamma=3.941904591403078,
    colsample_bytree=0.7109662235514159,
    reg_alpha=0.7194571167899805,
    reg_lambda=5.521019924961057,
)

# Build the estimator on its own so the pipeline definition stays short.
classifier = XGBClassifier(n_jobs=-1, random_state=42, **parameters)

# Preprocessing and model in one object, so the same transformations are
# applied at fit time and at prediction time.
pipeline = Pipeline(steps=[
    ("prep", preprocessor),
    ("algo", classifier),
])

pipeline.fit(X_train, y_train)

# Report the pipeline's score on both splits as percentages.
print(f"Training: {pipeline.score(X_train, y_train)*100:.2f}% | Testing: {pipeline.score(X_test, y_test)*100:.2f}%")

Training: 96.65% | Testing: 96.18%


In [8]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
model_path = Path("model/final_model_airlance.pkl")

# open() does not create missing directories, so make sure "model/" exists;
# exist_ok keeps the cell safe to re-run.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open("wb") as file:
    pickle.dump(pipeline, file)

In [11]:
# Restore the pipeline from the pickle file, replacing the in-memory `pipeline`.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("model/final_model_airlance.pkl", "rb") as model_file:
    pipeline = pickle.load(model_file)

In [16]:
# Column names used to label a hand-built prediction row, in the order the
# row's values are listed.
columns = [
    # passenger and trip attributes
    'Gender',
    'Customer Type',
    'Age',
    'Type of Travel',
    'Class',
    'Flight Distance',
    # columns handled by the ordinal (0-5) encoder
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
    # delays
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]

In [26]:
# One hand-written passenger record used to smoke-test the saved pipeline.
# Each value is keyed by its column name, so it cannot be silently paired with
# the wrong feature and the cell does not depend on a list defined elsewhere.
sample_passenger = {
    'Gender': "Male",
    'Customer Type': "disloyal Customer",
    'Age': 9,
    'Type of Travel': "Personal Travel",
    'Class': "Eco",
    'Flight Distance': 500,
    'Inflight wifi service': 3,
    'Departure/Arrival time convenient': 2,
    'Ease of Online booking': 4,
    'Food and drink': 2,
    'Online boarding': 3,
    'Seat comfort': 4,
    'Inflight entertainment': 1,
    'On-board service': 0,
    'Checkin service': 3,
    'Inflight service': 2,
    'Cleanliness': 5,
    'Departure Delay in Minutes': 30,
    'Arrival Delay in Minutes': 120,
}

# A single-row frame; dict insertion order gives the column order.
predict = pd.DataFrame([sample_passenger])
predict

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,Male,disloyal Customer,9,Personal Travel,Eco,500,3,2,4,2,3,4,1,0,3,2,5,30,120


In [27]:
# pipeline.predict returns one label per input row (0 = neutral or
# dissatisfied, 1 = satisfied, per the target encoding used for training).
# Iterating reports every row; comparing the whole array to 0 inside `if`
# only works when the frame has exactly one row.
for predicted_label in pipeline.predict(predict):
    if predicted_label != 0:
        print("Customer Puas")
    else:
        print("Customer Tidak Puas")

Customer Tidak Puas
