In [1]:
# importing libraries  
import numpy as nm  
from matplotlib import pyplot as plt 
import pandas as pd 
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv('Flyzy Flight Cancellation - EDA.csv')

In [3]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Flight ID,Airline,Flight_Distance,Origin_Airport,Destination_Airport,Scheduled_Departure_Time,Day_of_Week,Month,Airplane_Type,Weather_Score,Previous_Flight_Delay_Minutes,Airline_Rating,Passenger_Load,Flight_Cancelled
0,0,0,7319483,Airline D,475,Airport 3,Airport 2,4,6,1,Type C,0.225122,5.0,2.151974,0.477202,0
1,1,1,4791965,Airline E,538,Airport 5,Airport 4,12,1,6,Type B,0.060346,68.0,1.600779,0.159718,1
2,2,2,2991718,Airline C,565,Airport 1,Airport 2,17,3,9,Type C,0.09392,18.0,4.406848,0.256803,0
3,3,3,4220106,Airline E,658,Airport 5,Airport 3,1,1,8,Type B,0.65675,13.0,0.998757,0.504077,1
4,4,4,2263008,Airline E,566,Airport 2,Airport 2,19,7,12,Type E,0.505211,4.0,3.806206,0.019638,0


In [4]:
del df['Unnamed: 0.1']

In [5]:
del df['Unnamed: 0']

In [6]:
df.head()

Unnamed: 0,Flight ID,Airline,Flight_Distance,Origin_Airport,Destination_Airport,Scheduled_Departure_Time,Day_of_Week,Month,Airplane_Type,Weather_Score,Previous_Flight_Delay_Minutes,Airline_Rating,Passenger_Load,Flight_Cancelled
0,7319483,Airline D,475,Airport 3,Airport 2,4,6,1,Type C,0.225122,5.0,2.151974,0.477202,0
1,4791965,Airline E,538,Airport 5,Airport 4,12,1,6,Type B,0.060346,68.0,1.600779,0.159718,1
2,2991718,Airline C,565,Airport 1,Airport 2,17,3,9,Type C,0.09392,18.0,4.406848,0.256803,0
3,4220106,Airline E,658,Airport 5,Airport 3,1,1,8,Type B,0.65675,13.0,0.998757,0.504077,1
4,2263008,Airline E,566,Airport 2,Airport 2,19,7,12,Type E,0.505211,4.0,3.806206,0.019638,0


In [7]:
df.shape

(3000, 14)

#### ● Encoding categorical variables: Some of the columns in the dataset are categorical. These need to be encoded into a format that can be understood by the model.

In [9]:
# Identifying categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

In [11]:
# Encoding categorical variables
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

#### Split the dataset into a training set and a test set.

In [12]:
X = df_encoded.drop('Flight_Cancelled', axis=1)
y = df_encoded['Flight_Cancelled']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### ● Feature Scaling: The ranges of the features in the dataset are quite different. Scaling the features to a similar range can help the model perform better.

In [14]:
# Initializing the scaler
scaler = StandardScaler()

In [15]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### ● Model Building: Build a Logistic Regression model using the training data.

In [16]:
# Initializing the model
model = LogisticRegression()

In [18]:
model.fit(X_train, y_train)

In [19]:
model.predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [20]:
model.score(X_test, y_test)

0.6883333333333334

In [21]:
model.predict_proba(X_test)

array([[0.39526729, 0.60473271],
       [0.39412465, 0.60587535],
       [0.40575401, 0.59424599],
       ...,
       [0.3031386 , 0.6968614 ],
       [0.39416347, 0.60583653],
       [0.39582917, 0.60417083]])

#### ● Model Evaluation: Evaluate the model using appropriate metrics and the test data.

In [22]:
# Making predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_pred)
print('Classification Report:\n', class_report)

Accuracy: 0.51
Confusion Matrix:
 [[ 90  97]
 [199 214]]
Classification Report:
               precision    recall  f1-score   support

           0       0.31      0.48      0.38       187
           1       0.69      0.52      0.59       413

    accuracy                           0.51       600
   macro avg       0.50      0.50      0.48       600
weighted avg       0.57      0.51      0.52       600





In [23]:
# saving the dataframe
df.to_csv('Flyzy Flight Cancellation - Preprocessed.csv')