In [208]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder ## Encoder
from sklearn.preprocessing import  StandardScaler, MaxAbsScaler ## scaling
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [209]:
df = pd.read_csv('https://raw.githubusercontent.com/ManonYa09/MachineLearningT3/refs/heads/main/Dataset/Invistico_Airline.csv')

### EDA

In [210]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   satisfaction                       129880 non-null  object 
 1   Customer Type                      129880 non-null  object 
 2   Age                                129880 non-null  int64  
 3   Type of Travel                     129880 non-null  object 
 4   Class                              129880 non-null  object 
 5   Flight Distance                    129880 non-null  int64  
 6   Seat comfort                       129880 non-null  int64  
 7   Departure/Arrival time convenient  129880 non-null  int64  
 8   Food and drink                     129880 non-null  int64  
 9   Gate location                      129880 non-null  int64  
 10  Inflight wifi service              129880 non-null  int64  
 11  Inflight entertainment             1298

In [211]:
df.columns

Index(['satisfaction', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Seat comfort', 'Departure/Arrival time convenient',
       'Food and drink', 'Gate location', 'Inflight wifi service',
       'Inflight entertainment', 'Online support', 'Ease of Online booking',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Cleanliness', 'Online boarding',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes'],
      dtype='object')

In [212]:
df.shape

(129880, 22)

In [213]:
df.nunique()

satisfaction                            2
Customer Type                           2
Age                                    75
Type of Travel                          2
Class                                   3
Flight Distance                      5398
Seat comfort                            6
Departure/Arrival time convenient       6
Food and drink                          6
Gate location                           6
Inflight wifi service                   6
Inflight entertainment                  6
Online support                          6
Ease of Online booking                  6
On-board service                        6
Leg room service                        6
Baggage handling                        5
Checkin service                         6
Cleanliness                             6
Online boarding                         6
Departure Delay in Minutes            466
Arrival Delay in Minutes              472
dtype: int64

In [214]:
df['Customer Type'].value_counts()

Customer Type
Loyal Customer       106100
disloyal Customer     23780
Name: count, dtype: int64

In [215]:
df.groupby('Customer Type')['satisfaction'].value_counts()

Customer Type      satisfaction
Loyal Customer     satisfied       65387
                   dissatisfied    40713
disloyal Customer  dissatisfied    18080
                   satisfied        5700
Name: count, dtype: int64

In [216]:
df[df['Arrival Delay in Minutes'].isnull()]['satisfaction'].value_counts()

satisfaction
satisfied       205
dissatisfied    188
Name: count, dtype: int64

In [217]:
# df[1:2]

In [218]:
df.isnull().sum()

satisfaction                           0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Seat comfort                           0
Departure/Arrival time convenient      0
Food and drink                         0
Gate location                          0
Inflight wifi service                  0
Inflight entertainment                 0
Online support                         0
Ease of Online booking                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Cleanliness                            0
Online boarding                        0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
dtype: int64

In [219]:
df.dropna(inplace=True)

In [220]:
df['satisfaction'].value_counts()

satisfaction
satisfied       70882
dissatisfied    58605
Name: count, dtype: int64

### preprocessing

In [221]:
df = df.dropna().reset_index(drop=True)
target = 'satisfaction'
x = df.drop(target , axis = 1)
y = df[target]

cat_col = []
num_col = []

for i in df.columns:
    if i in [target]:
        continue
    elif df[i].dtype =='O':
        cat_col.append(i)
    else: 
        num_col.append(i)

In [222]:
# X = df.drop(columns = target)
# y = df[target]
# x_train, x_test, y_train, y_test = train_test_split(X, y)

In [223]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [224]:
#Define transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),#most common value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#Create preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_col),
    ('cat', categorical_transformer, cat_col)
])

### Logistics Regression

In [225]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [226]:
Model_pipeline = Pipeline([
    # ('drop_columns', dropper),
    ('preprocessing',preprocessor), ## encoder and scaling
    ('model', LogisticRegression()) ### OLS estimate : paramter : a1, a2, a3..., a4
])

In [227]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [228]:
Model_pipeline.fit(x_train, y_train)

In [229]:
df['satisfaction'].value_counts()

satisfaction
satisfied       70882
dissatisfied    58605
Name: count, dtype: int64

In [230]:
y_pred_logistics  = Model_pipeline.predict(x_test)

In [231]:
confusion_matrix(y_test, y_pred_logistics)

array([[11891,  2825],
       [ 2692, 14964]])

In [232]:
accuracy_score(y_test,y_pred_logistics)

0.8295749413073026

In [233]:
print(classification_report(y_test, y_pred_logistics))

              precision    recall  f1-score   support

dissatisfied       0.82      0.81      0.81     14716
   satisfied       0.84      0.85      0.84     17656

    accuracy                           0.83     32372
   macro avg       0.83      0.83      0.83     32372
weighted avg       0.83      0.83      0.83     32372



### Support Vector Machine

In [234]:
from sklearn.svm import SVC ## classification, Regression

In [235]:
Model_SVC = Pipeline([
    # ('drop_columns', dropper),
    ('preprocessing',preprocessor), ## encoder and scaling
    ('model', SVC()) ### OLS estimate : paramter : a1, a2, a3..., a4
])

In [236]:
Model_SVC.fit(x_train, y_train)

In [237]:
y_pred_SVM  = Model_SVC.predict(x_test)

In [238]:
print(classification_report(y_test, y_pred_SVM))

              precision    recall  f1-score   support

dissatisfied       0.93      0.94      0.93     14716
   satisfied       0.95      0.94      0.94     17656

    accuracy                           0.94     32372
   macro avg       0.94      0.94      0.94     32372
weighted avg       0.94      0.94      0.94     32372



In [239]:
confusion_matrix(y_test, y_pred_SVM)

array([[13881,   835],
       [ 1107, 16549]])

In [240]:
accuracy_score(y_test,y_pred_SVM)

0.9400098850858767

### Decision Tree

In [241]:
from sklearn.tree import DecisionTreeClassifier

In [242]:
#Create training pipeline
training_pipeline_Des_Tree = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier())
])

In [243]:
training_pipeline_Des_Tree.fit(x_train, y_train)

In [244]:
y_prediction = training_pipeline_Des_Tree.predict(x_test)

In [245]:
confusion_matrix(y_test, y_prediction)

array([[13681,  1035],
       [ 1051, 16605]])

In [246]:
accuracy_score(y_test,y_prediction)

0.935561596441369

In [247]:
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

dissatisfied       0.93      0.93      0.93     14716
   satisfied       0.94      0.94      0.94     17656

    accuracy                           0.94     32372
   macro avg       0.93      0.94      0.94     32372
weighted avg       0.94      0.94      0.94     32372



### Ranforest

In [248]:
from sklearn.ensemble import RandomForestClassifier

In [249]:
training_pipeline_Ran_Forest = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])

In [250]:
training_pipeline_Ran_Forest.fit(x_train, y_train)

In [251]:
y_prediction = training_pipeline_Ran_Forest.predict(x_test)

In [252]:
confusion_matrix(y_test, y_prediction)

array([[14190,   526],
       [  900, 16756]])

In [253]:
print(x_test.shape)
print(len(y_prediction), len(y_test))


(32372, 21)
32372 32372


In [254]:
print(x.shape, y.shape) 

(129487, 21) (129487,)


In [255]:
accuracy_score(y_test,y_prediction)

0.955949586062029

In [256]:
# LogisticsRegression = Model_pipeline.named_steps['model']

In [257]:
# LogisticsRegression.coef_

### SAVE Logistic

In [258]:
import joblib

In [259]:
filename = 'logistic_airline.joblib'
joblib.dump(Model_pipeline, filename)

['logistic_airline.joblib']

### SAVE SVM

In [260]:
filename = 'SVM_airline.joblib'
joblib.dump(Model_SVC, filename)

['SVM_airline.joblib']

### SAVE Decision Tree

In [261]:
filename = 'DT_airline.joblib'
joblib.dump(training_pipeline_Des_Tree, filename)

['DT_airline.joblib']

### SAVE Ranforest

In [274]:
filename = 'RF_airline.joblib'
joblib.dump(training_pipeline_Ran_Forest, filename)

['RF_airline.joblib']

### LOAD MODEL

In [263]:
df = pd.read_csv('https://raw.githubusercontent.com/ManonYa09/MachineLearningT3/refs/heads/main/Dataset/Invistico_Airline.csv')

In [264]:
df1 = df.copy()

In [265]:
df1 = df1.drop('satisfaction', axis = 1)

In [266]:
def prediction(df1):
    # Load saved model
    loaded_model = joblib.load('RF_airline.joblib')
    
    # Make predictions (text labels)
    y_pred = loaded_model.predict(df1)
    
    # Map text labels to numbers
    mapping = {"satisfied": 1, "dissatisfied": 0}
    y_pred_numeric = pd.Series(y_pred).map(mapping)
    
    # Add to dataframe
    df1['satisfaction Prediction'] = y_pred_numeric
    return df1


In [267]:
loaded_model = joblib.load('RF_airline.joblib')

In [268]:
print(type(loaded_model))

<class 'sklearn.pipeline.Pipeline'>


In [269]:
# print(loaded_model)


In [270]:
loaded_model.predict(df1)

array(['satisfied', 'satisfied', 'satisfied', ..., 'dissatisfied',
       'dissatisfied', 'dissatisfied'], dtype=object)

In [271]:
prediction(df1)

Unnamed: 0,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,...,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction Prediction
0,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,2,...,3,3,0,3,5,3,2,0,0.0,1
1,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,0,...,3,4,4,4,2,3,2,310,305.0,1
2,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,2,...,2,3,3,4,4,4,2,0,0.0,1
3,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,3,...,1,1,0,1,4,1,3,0,0.0,1
4,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,4,...,2,2,0,2,4,2,5,0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,disloyal Customer,29,Personal Travel,Eco,1731,5,5,5,3,2,...,2,3,3,4,4,4,2,0,0.0,1
129876,disloyal Customer,63,Personal Travel,Business,2087,2,3,2,4,2,...,3,2,3,3,1,2,1,174,172.0,0
129877,disloyal Customer,69,Personal Travel,Eco,2320,3,0,3,3,3,...,4,4,3,4,2,3,2,155,163.0,0
129878,disloyal Customer,66,Personal Travel,Eco,2450,3,2,3,2,3,...,3,3,2,3,2,1,2,193,205.0,0


In [272]:
df1.columns

Index(['Customer Type', 'Age', 'Type of Travel', 'Class', 'Flight Distance',
       'Seat comfort', 'Departure/Arrival time convenient', 'Food and drink',
       'Gate location', 'Inflight wifi service', 'Inflight entertainment',
       'Online support', 'Ease of Online booking', 'On-board service',
       'Leg room service', 'Baggage handling', 'Checkin service',
       'Cleanliness', 'Online boarding', 'Departure Delay in Minutes',
       'Arrival Delay in Minutes', 'satisfaction Prediction'],
      dtype='object')

### MLFLOW

In [273]:
import os
import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# ==============================
# 1. Load and preprocess dataset
# ==============================
df = pd.read_csv("https://raw.githubusercontent.com/ManonYa09/MachineLearningT3/refs/heads/main/Dataset/Invistico_Airline.csv")

# Target
target = "satisfaction"
X = df.drop(target, axis=1)
y = df[target]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==============================
# 2. Define model files (your saved models)
# ==============================
model_files = {
    "DecisionTree": "/Users/chhunhoir/Desktop/project_pandas/Machine_Learning/DT_airline.joblib",
    "LogisticRegression": "/Users/chhunhoir/Desktop/project_pandas/Machine_Learning/logistic_airline.joblib",
    "RandomForest": "/Users/chhunhoir/Desktop/project_pandas/Machine_Learning/RF_airline.joblib",
    "SVM": "/Users/chhunhoir/Desktop/project_pandas/Machine_Learning/SVM_airline.joblib"
}

# ==============================
# 3. MLflow tracking setup
# ==============================
mlflow.set_tracking_uri("file:///Users/chhunhoir/Desktop/project_pandas/mlruns")
mlflow.set_experiment("airline_satisfaction_experiment")

# ==============================
# 4. Evaluate and log models
# ==============================
for model_name, path in model_files.items():
    if not os.path.exists(path):
        print(f"⚠️ File not found: {path}. Skipping {model_name}.")
        continue

    with mlflow.start_run(run_name=model_name):
        # Load model
        model = joblib.load(path)

        # Predict
        try:
            y_pred = model.predict(X_test)
        except ValueError as e:
            print(f"⚠️ Could not predict with {model_name}: {e}")
            continue

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")

        # Log metrics
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)

        # Log hyperparameters (if available)
        if hasattr(model, "get_params"):
            mlflow.log_params(model.get_params())

        # Log model with input example
        mlflow.sklearn.log_model(model, name=model_name, input_example=X_test.iloc[:5])

        print(f"✅ {model_name} logged: accuracy={acc:.4f}, f1_score={f1:.4f}")

print("All models logged. Run:")
print("mlflow ui --backend-store-uri /Users/chhunhoir/Desktop/project_pandas/mlruns")
print("to compare them.")




✅ DecisionTree logged: accuracy=0.9841, f1_score=0.9841




✅ LogisticRegression logged: accuracy=0.8290, f1_score=0.8290




✅ RandomForest logged: accuracy=0.9886, f1_score=0.9886




✅ SVM logged: accuracy=0.9461, f1_score=0.9462
All models logged. Run:
mlflow ui --backend-store-uri /Users/chhunhoir/Desktop/project_pandas/mlruns
to compare them.


### use Pickle for save model


In [275]:
import pickle
import pandas as pd

In [276]:
with open("airline_randomforest.pkl", "wb") as f:
    pickle.dump(training_pipeline_Ran_Forest, f)