## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.impute import SimpleImputer
# Modelling
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
import warnings

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


#### Import the CSV Data as Pandas DataFrame

In [3]:
df = pd.read_csv('data/raw_data.csv')

#### Show Top 5 Records

In [4]:
df.head()

Unnamed: 0,id,gender,customer_type,age,type_of_travel,Class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,...,inflight_entertainment,on_board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [5]:
df.isnull().sum()

id                                     0
gender                                 0
customer_type                          0
age                                    0
type_of_travel                         0
Class                                  0
flight_distance                        0
inflight_wifi_service                  0
departure_arrival_time_convenient      0
ease_of_online_booking                 0
gate_location                          0
food_and_drink                         0
online_boarding                        0
seat_comfort                           0
inflight_entertainment                 0
on_board_service                       0
leg_room_service                       0
baggage_handling                       0
checkin_service                        0
inflight_service                       0
cleanliness                            0
departure_delay_in_minutes             0
arrival_delay_in_minutes             310
satisfaction                           0
dtype: int64

In [6]:
imputer = SimpleImputer(strategy='median')
df['arrival_delay_in_minutes']=imputer.fit_transform(df[['arrival_delay_in_minutes']])

In [7]:
df.isnull().sum()

id                                   0
gender                               0
customer_type                        0
age                                  0
type_of_travel                       0
Class                                0
flight_distance                      0
inflight_wifi_service                0
departure_arrival_time_convenient    0
ease_of_online_booking               0
gate_location                        0
food_and_drink                       0
online_boarding                      0
seat_comfort                         0
inflight_entertainment               0
on_board_service                     0
leg_room_service                     0
baggage_handling                     0
checkin_service                      0
inflight_service                     0
cleanliness                          0
departure_delay_in_minutes           0
arrival_delay_in_minutes             0
satisfaction                         0
dtype: int64

#### Preparing X and Y variables

In [8]:
X = df.drop(columns=['satisfaction'],axis=1)

In [9]:
X.head()

Unnamed: 0,id,gender,customer_type,age,type_of_travel,Class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,...,seat_comfort,inflight_entertainment,on_board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,5,4,3,4,4,5,5,25,18.0
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,1,5,3,1,4,1,1,6.0
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,5,4,3,4,4,4,5,0,0.0
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,2,5,3,1,4,2,11,9.0
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,5,3,3,4,4,3,3,3,0,0.0


In [10]:
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Unique values for column '{column}':")
    print(unique_values)
    print()

Unique values for column 'id':
[ 70172   5047 110028 ...  68825  54173  62567]

Unique values for column 'gender':
['Male' 'Female']

Unique values for column 'customer_type':
['Loyal Customer' 'disloyal Customer']

Unique values for column 'age':
[13 25 26 61 47 52 41 20 24 12 53 33 45 38  9 17 43 58 23 57 49 36 22 31
 15 35 67 37 40 34 39 50 29 54 21 28 27 69 60 48 59 46 30 66 64 44 51 32
 19 42 16 11 62  8 56 68 55 18 65 72 70 63 10  7 14 80 74 71 85 73 76 77
 75 79 78]

Unique values for column 'type_of_travel':
['Personal Travel' 'Business travel']

Unique values for column 'Class':
['Eco Plus' 'Business' 'Eco']

Unique values for column 'flight_distance':
[ 460  235 1142 ...  974 1479  400]

Unique values for column 'inflight_wifi_service':
[3 2 4 1 5 0]

Unique values for column 'departure_arrival_time_convenient':
[4 2 5 3 1 0]

Unique values for column 'ease_of_online_booking':
[3 2 5 4 1 0]

Unique values for column 'gate_location':
[1 3 2 5 4 0]

Unique values for column 'fo

In [11]:
y = df['satisfaction']

In [12]:
y

0         neutral or dissatisfied
1         neutral or dissatisfied
2                       satisfied
3         neutral or dissatisfied
4                       satisfied
                   ...           
103899    neutral or dissatisfied
103900                  satisfied
103901    neutral or dissatisfied
103902    neutral or dissatisfied
103903    neutral or dissatisfied
Name: satisfaction, Length: 103904, dtype: object

In [13]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [14]:
X = preprocessor.fit_transform(X)

In [15]:
X

array([[ 0.        ,  1.        ,  1.        , ...,  1.30586973,
         0.26639265,  0.07416916],
       [ 0.        ,  1.        ,  0.        , ..., -1.74229153,
        -0.36137482, -0.23631279],
       [ 1.        ,  0.        ,  1.        , ...,  1.30586973,
        -0.3875318 , -0.39155376],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.54382941,
        -0.20443295, -0.02932482],
       [ 1.        ,  0.        ,  0.        , ..., -1.74229153,
        -0.3875318 , -0.39155376],
       [ 0.        ,  1.        ,  1.        , ..., -1.74229153,
        -0.3875318 , -0.39155376]])

In [16]:
X.shape

(103904, 28)

In [17]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((83123, 28), (20781, 28))

#### Create an Evaluate Function to give all metrics after model Training

In [18]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, labels=['satisfied'], average='weighted')
    recall = recall_score(true, predicted, labels=['satisfied'], average='weighted')
    f1 = f1_score(true, predicted, labels=['satisfied'], average='weighted')
    return accuracy, precision, recall, f1

In [19]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Support Vector Classifier":SVC(),
    # "XGB Classifier": XGBClassifier(), 
    "GradientBoosting Classifier": GradientBoostingClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_acc , model_train_precision, model_train_recall, model_train_f1_score = evaluate_model(y_train, y_train_pred)

    model_test_acc , model_test_precision, model_test_recall, model_test_f1_score = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy Score: {:.4f}".format(model_train_acc))
    print("- Precision Score: {:.4f}".format(model_train_precision))
    print("- Recall Score: {:.4f}".format(model_train_recall))
    print("- F1 Score: {:.4f}".format(model_train_f1_score))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Accuracy Score: {:.4f}".format(model_test_acc))
    print("- Precision Score: {:.4f}".format(model_test_precision))
    print("- Recall Score: {:.4f}".format(model_test_recall))
    print("- F1 Score: {:.4f}".format(model_test_f1_score))
    r2_list.append(model_test_acc)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy Score: 0.8756
- Precision Score: 0.8708
- Recall Score: 0.8364
- F1 Score: 0.8533
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8786
- Precision Score: 0.8757
- Recall Score: 0.8413
- F1 Score: 0.8582


K-Neighbors Classifier
Model performance for Training set
- Accuracy Score: 0.9485
- Precision Score: 0.9624
- Recall Score: 0.9168
- F1 Score: 0.9390
----------------------------------
Model performance for Test set
- Accuracy Score: 0.9291
- Precision Score: 0.9461
- Recall Score: 0.8881
- F1 Score: 0.9162


Decision Tree Classifier
Model performance for Training set
- Accuracy Score: 1.0000
- Precision Score: 1.0000
- Recall Score: 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy Score: 0.9480
- Precision Score: 0.9394
- Recall Score: 0.9417
- F1 Score: 0.9405


Random Forest Classifier
Model performance for Training set
- Acc



AdaBoost Classifier
Model performance for Training set
- Accuracy Score: 0.9283
- Precision Score: 0.9214
- Recall Score: 0.9120
- F1 Score: 0.9167
----------------------------------
Model performance for Test set
- Accuracy Score: 0.9294
- Precision Score: 0.9229
- Recall Score: 0.9146
- F1 Score: 0.9187


