# Machine Learning Assignment

**Dataset**:       AIRLINE SATISFACTION

**Student ID**:    s5510805

**Student Name**:  Daniel Harris

## Imports

Add imports here as needed.

Remember to **re-run the cell when you add imports**, so it gets loaded into the virtual notebook environment!

In [175]:
# Data and Datasets
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Clustering
from sklearn.cluster import DBSCAN

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Imputing
from sklearn.impute import KNNImputer

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report


# Classifiers
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature selection & feature engineering
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.tree import export_graphviz

# Utils
import pprint
import numpy as np
from time import time
import openpyxl

## Loading the dataset

In [None]:
# Read the raw airline-satisfaction CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/airline-satisfaction.csv')
# Preview the first five rows (the default row count of head()).
df.head(n=5)

## Task 2.1 - ML Workflow to Critically Evaluate

In [177]:
# Baseline workflow: discard every row that has at least one missing value.
df = df.dropna()

# Integer-encode all non-numeric (object) columns:
# first cast them to pandas 'category' dtype, then swap each column for its category codes.
cat_columns = df.select_dtypes(['object']).columns
df[cat_columns] = df[cat_columns].astype('category')
df[cat_columns] = df[cat_columns].apply(lambda col: col.cat.codes)

# Everything except the last column is the feature matrix (X);
# the last column is the class/label vector (y).
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold-out validation: 80% train / 20% test, fixed seed, no stratification.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=None,
)

In [None]:
# Build a default-configured Logistic Regression model and train it
# on the hold-out training split (fit() returns the fitted estimator).
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

# Report the test accuracy as a percentage.
print("Accuracy: %s" % (accuracy_score(y_test, y_pred) * 100))

## Task 2.3 - Evaluation of Improved ML Workflow

Add code for running your **improved** machine learning experiments below.


# Data Processing

In [None]:
# Reload the raw CSV so the improved workflow starts from an unmodified frame.
df = pd.read_csv('data/airline-satisfaction.csv')

# Missing Values Handling

# Keep only rows with at least (number of columns - 4) non-null values,
# i.e. drop rows with more than 4 missing fields, then drop the 'id' column.
df = df.dropna(thresh=len(df.columns) - 4)
df = df.drop(columns=['id'])

# Impute missing values in 'type_of_travel' and 'travel_class' with 'Unassigned'.
# Assign the result back instead of calling fillna(inplace=True) on df[column]:
# the chained in-place form is deprecated in pandas 2.x and does not modify
# df when copy-on-write is enabled.
for column in ['type_of_travel', 'travel_class']:
    df[column] = df[column].fillna('Unassigned')

# K-NN Imputer for Rating Columns

# Columns identified as having rating data
rating_columns = ['inflight_wifi_service', 'online_boarding', 'inflight_entertainment']

# Apply KNN imputation only to the rating columns (5 nearest neighbours).
# NOTE(review): the imputer is fitted on the whole dataset before the later
# train/test split, so held-out rows influence the imputed values - consider
# fitting it on the training split only.
df[rating_columns] = KNNImputer(n_neighbors=5).fit_transform(df[rating_columns])

# Linear Regression Imputation

# Drop rows where both 'departure_delay_in_minutes' and
# 'arrival_delay_in_minutes' are missing (neither can be used to predict the other).
df = df.dropna(subset=['departure_delay_in_minutes', 'arrival_delay_in_minutes'], how='all')

def linear_regression_impute(df, target, predictor):
    """Predict missing ``target`` values from ``predictor`` with a linear regression.

    The model is trained on rows where both columns are present and applied to
    rows where ``target`` is missing but ``predictor`` is available. ``df`` is
    not modified; the caller writes the predictions back.

    Args:
        df: DataFrame containing both columns.
        target: Name of the column whose missing values are to be predicted.
        predictor: Name of the single column used as the regression input.

    Returns:
        A ``(predicted_values, index)`` tuple: the predicted values and the
        index labels of the rows they belong to. Both are empty when no row
        needs imputation.
    """
    # Rows to impute: target missing, predictor available.
    predict_df = df[df[target].isnull() & df[predictor].notnull()]

    # Nothing to impute: return early, because predicting on a zero-row
    # frame makes scikit-learn raise instead of returning an empty result.
    if predict_df.empty:
        return np.empty(0, dtype=float), predict_df.index

    # Training rows: both target and predictor are present.
    not_null_df = df.dropna(subset=[target, predictor])

    # Train a linear regression model (predictor -> target).
    model = LinearRegression()
    model.fit(not_null_df[[predictor]], not_null_df[target])

    # Predict the missing values.
    predicted_values = model.predict(predict_df[[predictor]])

    return predicted_values, predict_df.index

# Impute 'departure_delay_in_minutes' using 'arrival_delay_in_minutes'
departure_pred, departure_idx = linear_regression_impute(df, 'departure_delay_in_minutes', 'arrival_delay_in_minutes')
df.loc[departure_idx, 'departure_delay_in_minutes'] = departure_pred

# Dropping arrival_delay_in_minutes (it has served its purpose as the predictor)
df = df.drop(columns=['arrival_delay_in_minutes'])

# One-Hot Encoding of Categorical Variables

# Exclude 'class' column from one-hot encoding (it is label-encoded below)
cat_columns = df.select_dtypes(include=['object']).columns
cat_columns = cat_columns[cat_columns != 'class']

# Perform one-hot encoding, requesting a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and
# 1.4 removed the old name; fall back to `sparse` only on older versions.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(df[cat_columns])
encoded_feature_names = encoder.get_feature_names_out(cat_columns)

# Create a DataFrame from the one-hot encoded features, aligned to df's index
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names, index=df.index)

# Drop original categorical columns from the DataFrame
df = df.drop(columns=cat_columns)

# Concatenate the original DataFrame with the one-hot encoded DataFrame
df = pd.concat([df, encoded_df], axis=1)

# Convert float64 columns to int64 by rounding
# (this covers the imputed values and the one-hot indicator columns)
for column in df.columns:
    if df[column].dtype == 'float64':
        df[column] = df[column].round().astype('int64')

# Encode the target variable 'class' as integer labels
df['class'] = LabelEncoder().fit_transform(df['class'])

print(df.shape)


# Anomaly Handling

In [180]:
# Anomaly removal

# Rating columns in which a value of 0 is treated as an anomaly.
columns_to_check = ['on_board_service', 'leg_room_service', 'cleanliness', 'inflight_service', 'food_and_drink']

# Keep only rows where none of those columns equals 0.
df = df.loc[~df[columns_to_check].eq(0).any(axis=1)]

# Discard the anomalous values 44 (inflight_entertainment) and 999 (age).
df = df.loc[df['inflight_entertainment'].ne(44)]
df = df.loc[df['age'].ne(999)]

# Keep flights of at most 10000 distance units.
df = df.loc[df['flight_distance'].le(10000)]

# Keep departure delays of at most 600 minutes.
df = df.loc[df['departure_delay_in_minutes'].le(600)]


# Persist the cleaned, anomaly-free data to a new CSV file.
df.to_csv('data/cleaned_airline_satisfaction_no_anomalies.csv')

# Test Train split

In [182]:
# Separate the features and the target variable
X = df.drop('class', axis=1)
y = df['class']

# Split into training (80%) and testing (20%) sets, stratified on the label
# so both splits keep the class proportions. random_state is fixed so the
# split - and every metric reported from it - is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# SMOTE

In [183]:
# Apply SMOTE to the training set only, so the test set keeps the original
# class distribution. random_state is fixed so the synthetic samples are
# reproducible across runs.
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Standardization

In [184]:
# Fit the scaler on the SMOTE-resampled training features only, then apply
# the same fitted transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train_smote)
X_train_scaled = scaler.transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Write df to a new CSV file. Note that this is the cleaned DataFrame itself,
# not the scaled arrays produced above.
df.to_csv('data/airline_satisfaction_standardized.csv')

# Classifier Evaluation Utility Function

In [185]:
def evaluate_classifier(y_true, y_pred):
    """Print a short performance report for a set of predictions.

    Prints the confusion matrix, the accuracy, and the macro-averaged
    precision and recall of ``y_pred`` against ``y_true``. Nothing is
    returned.
    """
    # Confusion matrix of true labels against predicted labels
    cm = metrics.confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", cm)

    # Overall accuracy
    acc = metrics.accuracy_score(y_true, y_pred)
    print("Accuracy (Testing): %.2f" % acc)

    # Macro averaging: per-class scores averaged with equal weight per class
    macro_precision = precision_score(y_true, y_pred, average='macro')
    macro_recall = recall_score(y_true, y_pred, average='macro')
    print("Macro-Averaged Precision: %.2f" % macro_precision)
    print("Macro-Averaged Recall: %.2f" % macro_recall)

# Logistic Regression

In [None]:
# Build a default Logistic Regression model and train it on the
# resampled, standardized training data (fit() returns the fitted estimator).
model = LogisticRegression().fit(X_train_scaled, y_train_smote)

# Predict labels for the standardized test set.
y_pred_lr = model.predict(X_test_scaled)

# Print the confusion matrix, accuracy and macro precision/recall.
evaluate_classifier(y_test, y_pred_lr)

# Random Forest

In [None]:
# Create and train a Random Forest classifier on the resampled, standardized
# training data. random_state is fixed so the forest and the metrics
# reported from it are reproducible across runs.
rf_classifier = RandomForestClassifier(random_state=1)
rf_classifier.fit(X_train_scaled, y_train_smote)

# Predict on the test set
y_pred_rf = rf_classifier.predict(X_test_scaled)

# Print the confusion matrix, accuracy and macro precision/recall.
evaluate_classifier(y_test, y_pred_rf)

# K-NN

In [None]:
# Build a default K-Nearest-Neighbours classifier and train it on the
# resampled, standardized training data (fit() returns the fitted estimator).
knn_classifier = KNeighborsClassifier().fit(X_train_scaled, y_train_smote)

# Predict labels for the standardized test set.
y_pred_knn = knn_classifier.predict(X_test_scaled)

# Print the confusion matrix, accuracy and macro precision/recall.
evaluate_classifier(y_test, y_pred_knn)