![Description of Image](Airline_satisfaction_cover.png)

### ❓ Problem Statement
- Description

Analyzing Airline Passenger Satisfaction Data to discover factors affecting satisfaction and to build a predictive model for it.

- Objective

To predict passenger satisfaction (Satisfied vs. Neutral/Dissatisfied) using machine learning models and to gain insights into the most influential features affecting satisfaction.

- Approach

    - Load and clean the data

    - Handle missing/duplicate/mixed-type values

    - Explore the data through visualizations

    - Build and evaluate multiple ML models

    - Select the best-performing model

    - Save it using joblib for later integration



# 📊🔧 Data Environment Setup 

In [None]:
# Import essential libraries for data manipulation and visualization
import pandas as pd                                         # for working with data in tabular form (DataFrames)
import numpy as np                                          # for numerical computations and array operations

import matplotlib.pyplot as plt                             # for creating plots and charts
import seaborn as sns                                       # for advanced visualizations with better styling
from colorama import Fore                                   # for printing colored text in the console (e.g., errors, alerts)

from sklearn.model_selection import train_test_split        # Importing the train_test_split function to split the dataset into training and testing sets

from sklearn.preprocessing import LabelEncoder              # Import class to convert categorical labels to numeric codes

from sklearn.model_selection import cross_val_score         # Import cross_val_score from sklearn.model_selection to perform cross-validation on the model


from sklearn.linear_model import LogisticRegression         # Logistic Regression: a simple and interpretable linear classification model
from sklearn.tree import DecisionTreeClassifier             # Decision Tree: a non-linear model that splits data into branches for classification
from sklearn.ensemble import RandomForestClassifier         # Random Forest: an ensemble of decision trees to reduce overfitting
from sklearn.ensemble import GradientBoostingClassifier     # Gradient Boosting: builds trees sequentially to correct errors from previous ones    
from sklearn.svm import SVC                                 # Support Vector Classifier: effective in high-dimensional spaces, aims to find the optimal decision boundary
from sklearn.neighbors import KNeighborsClassifier          # K-Nearest Neighbors: classifies based on the majority label of nearest data points
from xgboost import XGBClassifier                           # XGBoost: a powerful and efficient gradient boosting algorithm optimized for speed and performance

import time                                                 # Importing the time library to calc the time to train models
from sklearn.metrics import classification_report           # Importing the classification_report to generate a detailed performance report (precision, recall, and F1-score).
from sklearn.metrics import accuracy_score                  # Importing accuracy_score to calculate the accuracy of the classification model
from sklearn.metrics import confusion_matrix                # Used to compute the confusion matrix for actual vs predicted labels
from sklearn.metrics import ConfusionMatrixDisplay          # Used to visualize the confusion matrix as a plot

from sklearn.model_selection import GridSearchCV            # Tool for performing exhaustive search over specified hyperparameter values using cross-validation

from sklearn.pipeline import Pipeline                       # Used to create modeling pipelines
from sklearn.preprocessing import StandardScaler            # Used for feature scaling (standardization)

import joblib                                               # Import the joblib library to save and load the trained model efficiently

# Plot styling: use seaborn's "darkgrid" theme for every figure
sns.set_style("darkgrid")

# Pandas display: never truncate the list of columns or the content of a cell
for option_name in ("display.max_columns", "display.max_colwidth"):
    pd.set_option(option_name, None)

# Silence FutureWarning messages so that cell outputs stay readable
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Seed NumPy's global random generator so random results repeat between runs
np.random.seed(42)


# 📥🔍 Loading and Discovering the Training Dataset

- loading the data

In [None]:
# Read the passenger-satisfaction CSV file into a DataFrame
csv_path = 'Airline_Passenger_Satifaction_Data.csv'
df = pd.read_csv(csv_path)
print('Loading is Done')

- Showing some details about our data

In [None]:
# Display the DataFrame's dimensions as a (rows, columns) tuple
df.shape  

In [None]:
# Preview the first 5 rows to get a quick feel for the columns and their values
df.head()  

In [None]:
# Print a summary of the DataFrame: column names, non-null counts, and dtypes
df.info()  

- Exploring unknown values in data

In [None]:
# Count the missing (NaN) values in each column of the DataFrame
df.isna().sum()  

In [None]:
# Show the first 10 rows that contain at least one NaN value
df[df.isna().any(axis=1)].head(10)  

- Exploring duplicated rows in data

In [None]:
# Flag every row that is an exact copy of an earlier row
duplicate_mask = df.duplicated()

# Is there at least one duplicated row? (prints True or False)
print(duplicate_mask.any())

# How many duplicated rows are there?
print(duplicate_mask.sum())

- Exploring illogical values in data

In [None]:
# Collect the numeric columns that hold at least one negative value
numeric_part = df.select_dtypes(include=[np.number])
has_negative = (numeric_part < 0).any()
negative_columns = numeric_part.columns[has_negative]
print("Columns with negative values:", list(negative_columns))

- Display summary statistics for all object columns

In [None]:
# Summary statistics for the object-dtype (categorical) columns only
df.describe(include="O")

### ✅ After using these functions we noticed many things:

- Data loaded successfully.
- Number of Rows is: 103988.
- Number of columns is: 25.
- There are null values in 21 columns (359 NaN values in the data).
- There are 66 complete duplicates.
- There is a column with negative values [Age].
- Unnamed:0 and id columns wouldn't be useful for our project.
- There is a problem with the data type in the 'online boarding' column (it is supposed to be numeric).

# 📈🧹 Data preparation and cleaning

At this stage we will get rid of unimportant columns and process all problems of the data.

- Delete unimportant columns

In [None]:
# Remove the two columns judged not useful for this project
columns_to_remove = ['Unnamed: 0', 'id']
df = df.drop(columns=columns_to_remove)

# Confirm the removal by listing the remaining column names
print(df.columns)

- Remove duplicated rows

In [None]:
# Remove rows that are exact duplicates of an earlier row (the first occurrence is kept)
df = df.drop_duplicates()

# Display the (rows, columns) shape to verify that rows were removed
df.shape

- Process some unknown and invalid values (Imputation)

Replace unknown, negative, or very large values in the 'Age' column with the mean of the valid ages.

In [None]:
# Ages inside the closed interval [0, 100] are treated as plausible. The same
# mask is used both to compute the replacement value and to decide which rows
# are replaced, so the two rules can no longer disagree at exactly 0 or 100.
# NaN compares as False in `between`, so missing ages count as "not valid".
valid_age = df['Age'].between(0, 100)

# Mean of the plausible ages only, so that outliers do not distort the imputed value
valid_age_mean = df.loc[valid_age, 'Age'].mean()

# Overwrite negative, greater-than-100 and missing ages in a single step.
# Writing through `df.loc` updates the frame itself; calling
# `fillna(inplace=True)` on the `df['Age']` selection is chained assignment,
# which is deprecated and does not update `df` under pandas copy-on-write.
df.loc[~valid_age, 'Age'] = valid_age_mean


- Drop rows of unknown values

Dropping unknown values is a good option because their number is small compared to the total data.

In [None]:
# Discard every row that still has a missing value in any column
df = df.dropna(axis=0, how='any')

# Report the remaining (rows, columns) after the removal
print(df.shape)

- Ensuring that unknown values are deleted or processed

In [None]:
# Re-count the missing (NaN) values per column to confirm that none remain
df.isna().sum()  

> Great! Our Data without any NaN.

- discovering the type of each column

In [None]:
# Report every column as numeric, categorical, or mixed-type
for col in df.columns:
    # Python types of the non-missing values stored in this column
    observed_types = df[col].dropna().apply(type).unique()

    # Anything other than exactly one value type is reported as mixed
    if len(observed_types) != 1:
        print(f"⚠️ Mixed-type: {col} ({observed_types})")
        continue

    # A single value type: separate numeric dtypes from everything else
    if pd.api.types.is_numeric_dtype(df[col]):
        print(f"🔢 Numeric: {col}")
    else:
        print(f"🔤 Categorical: {col}")


> Oops! There is a problem with the values in the 'Online boarding' column: they are mixed-type!

In [None]:
# Column under inspection
online_boarding = df['Online boarding']

# Frequency of every distinct value (NaN would be listed as well)
print(online_boarding.value_counts(dropna=False))

# Frequency of every Python type found among the values; more than one type
# means the column is mixed (e.g., int and str), which can cause encoding issues
print(online_boarding.apply(type).value_counts())

> To solve this problem we need to cast all values of the column to an integer data type

In [None]:
# Step 1: parse every value as a number; values that cannot be parsed become
# NaN because of errors="coerce"
parsed_boarding = pd.to_numeric(df['Online boarding'], errors='coerce')

# Step 2: store the result with the nullable 'Int64' dtype, which can hold
# integers together with missing values
df['Online boarding'] = parsed_boarding.astype('Int64')

In [None]:
# True if the conversion left at least one missing value in 'Online boarding', otherwise False
df['Online boarding'].isna().any()

- check dtype for each column

In [None]:
# Print the dtype of every column; the summary also counts the columns per dtype,
# which shows how many object (categorical) columns remain
df.info()

### ✅ Now, we can say that our data is almost ready

- Unnecessary columns were removed.
- Duplicate rows were removed.
- Some unknown values were processed and some were deleted.
- The data type of the columns was checked and the mixed-dtype issue was solved.

# 📶🧐 Data Visualization and Exploration (to Gain Insights)

In [None]:
# Default font sizes for all plots, grouped by matplotlib rc group
default_font_sizes = {
    'font': {'size': 10},
    'axes': {'labelsize': 14, 'titlesize': 18},
    'legend': {'fontsize': 14},
    'xtick': {'labelsize': 12},
    'ytick': {'labelsize': 12},
}

# Apply each group of settings
for rc_group, rc_settings in default_font_sizes.items():
    plt.rc(rc_group, **rc_settings)

- Plot a Histogram for Each Column in the DataFrame

In [None]:
# One 22-bin histogram per numeric column, drawn on a single 20x18 figure
df.hist(
    figsize=(20, 18),
    bins=22,
    grid=True,
)

# Render the figure
plt.show()

- Age Distribution

In [None]:
# Age distribution: 30-bin histogram with a KDE curve on top
plt.figure(figsize=(10, 6))
age_axes = sns.histplot(df['Age'], kde=True, bins=30, color='blue')

# Title and axis labels
age_axes.set_title('Age Distribution of Passengers')
age_axes.set_xlabel('Age')
age_axes.set_ylabel('Frequency')

# Render the figure: it shows how the passenger ages are distributed
plt.show()

- Satisfaction Distribution

In [None]:
# Count how many passengers fall into each satisfaction category
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='satisfaction', palette="Set2")
plt.title('Satisfaction Distribution')
# The target is only label-encoded later in the notebook, so the x ticks show
# the raw category values here; the axis label therefore must not describe
# 0/1 codes that do not appear on the axis.
plt.xlabel('Satisfaction')
plt.ylabel('Count')
plt.show()
# This plot compares the number of passengers in each satisfaction category.

- Departure Delay Distribution

In [None]:
# Departure delay distribution: 30-bin histogram with a KDE curve on top
plt.figure(figsize=(12, 6))
delay_axes = sns.histplot(df['Departure Delay in Minutes'], kde=True, bins=30, color='green')

# Title and axis labels
delay_axes.set_title('Departure Delay Distribution')
delay_axes.set_xlabel('Departure Delay in Minutes')
delay_axes.set_ylabel('Frequency')

# Render the figure: it shows how the delays are distributed and how frequent they are
plt.show()

- Correlation Heatmap (For Numeric Columns Only)

In [None]:
# Keep the numeric columns only and compute their pairwise correlations
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

# Draw the correlations as an annotated heatmap (two decimals per cell)
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
)
plt.title('Correlation Heatmap of Numeric Features')
plt.show()
# Only correlations between numerical columns appear in this heatmap.

> The heatmap shows a strong correlation between 'Departure Delay in Minutes' and 'Arrival Delay in Minutes'. Let's dive into it to discover more about this relationship.

In [None]:
# Use smaller tick labels on both axes so that this plot is easier to read
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=10)

# Scatter plot of Departure Delay (y) against Arrival Delay (x)
sns.scatterplot(
    data=df,
    x='Arrival Delay in Minutes',
    y='Departure Delay in Minutes',
)

# Render the figure
plt.show()

> We can notice that Departure Delay is approximately equal to Arrival Delay. That tells us that the delay happens before the start of the flight (not during the flight), so a feature engineering step is required.

In [None]:
# Flight delay = arrival delay minus departure delay, row by row
arrival_delay = df['Arrival Delay in Minutes']
departure_delay = df['Departure Delay in Minutes']
df['Flight Delay'] = arrival_delay - departure_delay

# Print the average of the new column
print(df['Flight Delay'].mean())

> Since the difference between 'Departure Delay' and 'Arrival Delay' is minimal (~0.43 minutes on average), we can safely remove one of them to avoid redundancy. We'll keep 'Departure Delay' and drop 'Arrival Delay'.


In [None]:
# 'Arrival Delay in Minutes' and the engineered 'Flight Delay' are highly
# correlated with 'Departure Delay in Minutes'; keeping all three would cause
# multicollinearity in the model, so only the departure delay is kept.
redundant_delay_columns = ['Arrival Delay in Minutes', 'Flight Delay']
df.drop(columns=redundant_delay_columns, inplace=True)

- Missing Data Visualization

In [None]:
# Heatmap of the boolean "is missing" mask: any remaining NaN would stand out
missing_mask = df.isna()
plt.figure(figsize=(10, 6))
sns.heatmap(missing_mask, cbar=False, cmap='viridis')
plt.title('Missing Data Heatmap')
plt.show()
# The heatmap makes it easy to check whether any column still has missing (NaN) values.

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes) as it stands before splitting
df.info()  

# 🪓📊 Data splitting (into train set and test set)

In [None]:
# Target: the 'satisfaction' column is what we want to predict
y = df["satisfaction"]

# Features: every column except the target
X = df.drop(columns="satisfaction")

# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the split identical on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many samples ended up in the training set
print(f"The number of samples in the training set is: {len(X_train)}")

# Report how many samples ended up in the test set
print(f"The number of samples in the test set is: {len(X_test)}")


# 🔡🔢 Data encoding

- Encoding X (Features) with Mapping Print

In [None]:
# One fitted LabelEncoder per categorical feature, keyed by column name
feature_encoders = {}

# The categorical features are the object-dtype columns of the training set
categorical_columns = X_train.select_dtypes(include=['object']).columns

for column in categorical_columns:
    # Learn the label -> code mapping on the training data only, then apply
    # that same mapping to the test data
    encoder = LabelEncoder()
    X_train[column] = encoder.fit_transform(X_train[column])
    X_test[column] = encoder.transform(X_test[column])
    feature_encoders[column] = encoder

    # Show which integer code each original label received
    print(f"\nMapping for feature column: {column}")
    for code, label in enumerate(encoder.classes_):
        print(f"{label} --> {code}")

- Encoding y (Target) with Mapping Print

In [None]:
# Fit the target encoder on the training labels only
target_encoder = LabelEncoder()
target_encoder.fit(y_train)

# Apply the learned mapping to both the training and the test labels
y_train = target_encoder.transform(y_train)
y_test = target_encoder.transform(y_test)

# Show which integer code each target class received
print("\nMapping for target:")
for code, label in enumerate(target_encoder.classes_):
    print(f"{label} --> {code}")

- Casting All Numeric Columns to Integer

In [None]:
# Convert all values in the training and test sets (features and target) to integers
def cast_all_to_int(df):
    """Return a copy of ``df`` whose float and int columns are cast to ``int``.

    Columns of any other dtype are carried over unchanged. The cast drops the
    fractional part of float values (e.g. 1.7 becomes 1).

    The conversion is done on a copy, so the frame passed in is never
    modified; callers must use the returned frame.

    Args:
        df: DataFrame to convert.

    Returns:
        A new DataFrame with the same columns, where every column selected by
        ``select_dtypes(include=['float', 'int'])`` has been cast to ``int``.

    Raises:
        ValueError: raised by pandas when a selected column contains NaN or
            infinite values, which cannot be represented as integers.
    """
    # Copy first so that the column assignments below cannot touch the caller's data
    converted = df.copy()
    for column in converted.select_dtypes(include=['float', 'int']).columns:
        converted[column] = converted[column].astype(int)
    return converted

# Cast the encoded features and targets to plain integers. The encoded targets
# are NumPy arrays at this point, so each one is wrapped in a single-column
# DataFrame before being passed to cast_all_to_int.
X_train = cast_all_to_int(X_train)                  # Cast all feature columns in X_train to integers
y_train = cast_all_to_int(pd.DataFrame(y_train))    # Cast the target column in y_train to integers
X_test = cast_all_to_int(X_test)                    # Cast all feature columns in X_test to integers
y_test = cast_all_to_int(pd.DataFrame(y_test))      # Cast the target column in y_test to integers

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() adds a stray "None" line after every summary.
X_train.info()
y_train.info()
X_test.info()
y_test.info()

# 🛠️📈 ML Models

- Define the Models Dictionary

In [None]:
# A dictionary to store various machine learning models with their names as keys
# Logistic Regression and KNN are sensitive to feature scale, so each one is
# wrapped in a pipeline that standardizes the features before fitting
logistic_regression_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=10000)),   # high iteration cap so the solver can converge
])
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),                  # distance-based, hence the scaling step
])

# Candidate models keyed by display name; the tree-based models are used
# without a scaling step
models = {
    "Logistic Regression": logistic_regression_pipeline,
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "KNN": knn_pipeline,
}

- Train and evaluate each model

In [None]:
# Best-so-far trackers: the model object, its score, and its display name
best_model, best_score, best_model_name = None, 0, ""

In [None]:
# scikit-learn expects 1-D targets. `y_train` / `y_test` may be single-column
# DataFrames at this point, and a DataFrame has no `.ravel()` method, so the
# flattening goes through NumPy. It is done once, before the loop, because it
# does not depend on the model being evaluated.
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

# Cross-validate, fit, and evaluate every candidate model, remembering the one
# with the highest mean cross-validation score
for name, model in models.items():
    print(f"\n📌 {name}")

    # Start measuring time
    start_time = time.time()

    # Perform 5-fold Cross-Validation on training data
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    mean_cv = cv_scores.mean()
    print(f"✅ Cross-validation scores: {cv_scores}")
    print(f"✅ Mean CV score: {mean_cv:.4f}")

    # Fit the model on the full training set
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Stop the clock here so that the reported time covers cross-validation,
    # fitting and prediction only, not the rendering of the report and plot
    end_time = time.time()
    duration = end_time - start_time

    # Print Accuracy and Classification Report
    acc = accuracy_score(y_test, y_pred)
    print(f"🎯 Accuracy on Test Set: {acc:.4f}")
    print("📋 Classification Report:")
    print(classification_report(y_test, y_pred))

    # Display Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title(f'Confusion Matrix - {name}')
    plt.show()

    print(f"⏱️ Time taken: {duration:.2f} seconds")

    print("-" * 60)

    # Track best model based on highest mean CV score (the test-set accuracy
    # is reported above but deliberately not used for the selection)
    if mean_cv > best_score:
        best_score = mean_cv
        best_model = model
        best_model_name = name

# 📝 the Best Model Summary

In [None]:
# Summarize the model that achieved the highest mean cross-validation score
summary_lines = [
    "\n Best Model Selected:",
    f" Model: {best_model_name}",
    f" Best Mean CV Score: {best_score:.4f}",
]
for summary_line in summary_lines:
    print(summary_line)

# 🔍 Hyperparameter Tuning for Random Forest Model

In [None]:
# Base estimator whose hyperparameters are tuned
rf_base = RandomForestClassifier(random_state=42)

# Candidate values for each hyperparameter; every combination is evaluated
param_grid = {
    'n_estimators': [100, 200, 300],        # number of trees in the forest
    'max_depth': [None, 10, 20],            # maximum depth of each tree
    'min_samples_split': [2, 5],            # minimum samples needed to split a node
    'max_features': ['sqrt', 'log2']        # features considered at each split
}

# Exhaustive search scored by accuracy with 5-fold cross-validation,
# running on all available CPU cores and reporting progress
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validation accuracy
print("\n✅✅ Best Parameters Found:")
print(grid_search.best_params_)
best_cv_accuracy = grid_search.best_score_
print(f"📈 Best Cross-Validation Accuracy: {best_cv_accuracy:.4f}")

'''
✅✅ Best Parameters Found:
{'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}
📈 Best Cross-Validation Accuracy: 0.9619
'''

# 📝 Evaluate the Tuned Model on Test Set

In [None]:
# The estimator refit by the grid search with the best parameter combination
best_rf_model = grid_search.best_estimator_

# Predictions of the tuned model on the held-out test data
y_pred_tuned = best_rf_model.predict(X_test)

# Accuracy and per-class precision / recall / F1 on the test set
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
print(f"\n🎯 Accuracy on Test Set after Tuning: {tuned_accuracy:.4f}")
print("📋 Classification Report:")
tuned_report = classification_report(y_test, y_pred_tuned)
print(tuned_report)

# Confusion matrix of actual vs. predicted labels, shown as a plot
cm = confusion_matrix(y_test, y_pred_tuned)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title('Confusion Matrix - Tuned Random Forest')
plt.show()

# 💾✅ Saving the Trained Model with Joblib for Deployment

In [None]:
# Persist the tuned Random Forest (the grid search's best estimator), which is
# the model the file name refers to. `best_model` is the best *untuned*
# candidate from the model-comparison loop and is not necessarily a Random
# Forest, so saving it under this name would store the wrong model.
joblib.dump(best_rf_model, 'random_forest_model.pkl')