In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.metrics import accuracy_score

In [54]:
url = 'https://raw.githubusercontent.com/remijul/dataset/master/Airline%20Passenger%20Satisfaction.csv'
df = pd.read_csv(url, sep=';')

df.head()

Unnamed: 0,id,Satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,11112,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,110278,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,103199,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,47462,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,120011,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,...,4,2,2,0,2,4,2,5,0,0.0


In [55]:
df.dropna(inplace = True) 
df.shape

(129487, 24)

In [56]:
X = df.drop(columns='Satisfaction', axis=1)
y = df['Satisfaction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
X_cat = X.select_dtypes(include=[object])
X_num = X.select_dtypes(exclude=[object])

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 129487 entries, 0 to 129879
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 129487 non-null  int64  
 1   Satisfaction                       129487 non-null  object 
 2   Gender                             129487 non-null  object 
 3   Customer Type                      129487 non-null  object 
 4   Age                                129487 non-null  int64  
 5   Type of Travel                     129487 non-null  object 
 6   Class                              129487 non-null  object 
 7   Flight Distance                    129487 non-null  int64  
 8   Seat comfort                       129487 non-null  int64  
 9   Departure/Arrival time convenient  129487 non-null  int64  
 10  Food and drink                     129487 non-null  int64  
 11  Gate location                      129487 no

In [59]:
# Preprocessing

# create ColumnTransformer, and pass the column names to transform in each step
cols_trans = ColumnTransformer([
    ('txt', OneHotEncoder(), ['Gender', 'Customer Type','Type of Travel','Class']),
    ('num', StandardScaler(), ['Age', 'Flight Distance', 'Seat comfort', 'Departure/Arrival time convenient',
                               'Food and drink','Gate location', 'Inflight wifi service', 'Inflight entertainment', 
                               'Online support', 'Ease of Online booking', 'On-board service', 'Leg room service', 
                               'Baggage handling', 'Checkin service'])
])

In [60]:
# create the pipeline
pipe_lr = Pipeline([
    ('transform', cols_trans),
    ('clf1', LogisticRegression())
])
pipe_abc = Pipeline([
    ('transform', cols_trans),
    ('clf2', AdaBoostClassifier())
])
pipe_svc = Pipeline([
    ('transform', cols_trans),
    ('clf3', SVC())
])

In [61]:
from sklearn import set_config

# with display='diagram', simply use display() to see the diagram
set_config(display='diagram')

display(pipe_lr)
set_config(display='text')

In [62]:
pipe_lr.fit(X_train, y_train)

Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('txt', OneHotEncoder(),
                                                  ['Gender', 'Customer Type',
                                                   'Type of Travel', 'Class']),
                                                 ('num', StandardScaler(),
                                                  ['Age', 'Flight Distance',
                                                   'Seat comfort',
                                                   'Departure/Arrival time '
                                                   'convenient',
                                                   'Food and drink',
                                                   'Gate location',
                                                   'Inflight wifi service',
                                                   'Inflight entertainment',
                                                   'Online support',
                  

In [63]:
y_pred = pipe_lr.predict(X_test)

# evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Validation accuracy: {accuracy:.4f}")

Validation accuracy: 0.8351


In [64]:
# Savz the LR model
joblib.dump(pipe_lr, 'model_lr.joblib')

['model_lr.joblib']

In [65]:
pipe_abc.fit(X_train, y_train)

Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('txt', OneHotEncoder(),
                                                  ['Gender', 'Customer Type',
                                                   'Type of Travel', 'Class']),
                                                 ('num', StandardScaler(),
                                                  ['Age', 'Flight Distance',
                                                   'Seat comfort',
                                                   'Departure/Arrival time '
                                                   'convenient',
                                                   'Food and drink',
                                                   'Gate location',
                                                   'Inflight wifi service',
                                                   'Inflight entertainment',
                                                   'Online support',
                  

In [67]:
y_pred = pipe_abc.predict(X_test)

# evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Validation accuracy: {accuracy:.4f}")

Validation accuracy: 0.8980


In [68]:
# Savz the LR model
joblib.dump(pipe_abc, 'model_abc.joblib')

['model_abc.joblib']

In [71]:
# Define the parameter grid for grid search
param_grid_lr = {
    'clf1__C': [0.1, 1, 10],
    'transform__num__with_mean': [True, False],
    'transform__num__with_std': [True, False]
}

# Perform grid search
grid_search_lr = GridSearchCV(pipe_lr, param_grid_lr, cv=5)
grid_search_lr.fit(X_train, y_train)

# Print the best parameters
print('Best parameters:', grid_search_lr.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters: {'clf1__C': 10, 'transform__num__with_mean': True, 'transform__num__with_std': True}


In [73]:
# grid search for AdaBoostClassifier
params_abc = {
    'clf2__n_estimators': [50, 100, 200],
    'clf2__learning_rate': [0.1, 0.5, 1]
}

grid_abc = GridSearchCV(pipe_abc, params_abc, cv=5, n_jobs=-1)
grid_abc.fit(X_train, y_train)

print("Best parameters:", grid_abc.best_params_)

Best parameters: {'clf2__learning_rate': 1, 'clf2__n_estimators': 200}
