In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the flight records used for modelling.
# NOTE(review): the file name suggests categorical columns were already
# dummy/one-hot encoded upstream -- confirm against the preprocessing notebook.
csv_path = 'Airlines_with_dummies.csv'
flight_data = pd.read_csv(csv_path)


In [28]:
# Separate the label from the predictors: 'Delay' is the column to predict,
# and every remaining column of flight_data is used as an input feature.
target_column = 'Delay'
y = flight_data[target_column]
X = flight_data.drop(columns=[target_column])

In [29]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the dimensions of each partition as a quick sanity check.
shape_report = [
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
]
for caption, part in shape_report:
    print(caption, part.shape)


Shape of X_train: (40000, 4612)
Shape of X_test: (10000, 4612)
Shape of y_train: (40000,)
Shape of y_test: (10000,)


In [32]:
# Random forest configuration:
#   - class_weight='balanced' re-weights classes inversely to their frequency,
#   - max_depth=10 caps the depth of every tree,
#   - random_state=42 keeps the fitted forest reproducible.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)

# fit() returns the fitted estimator itself, so construction and training chain.
rf_classifier = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Evaluate on the held-out test set with per-class precision / recall / F1.
y_pred = rf_classifier.predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)


              precision    recall  f1-score   support

           0       0.87      0.77      0.82      6391
           1       0.66      0.80      0.73      3609

    accuracy                           0.78     10000
   macro avg       0.77      0.79      0.77     10000
weighted avg       0.80      0.78      0.78     10000



In [33]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training data only. cross_val_score works on
# clones of the estimator, so the already-fitted rf_classifier is left untouched.
# The printed "+/-" value is two standard deviations of the per-fold scores.
n_folds = 5
scores = cross_val_score(estimator=rf_classifier, X=X_train, y=y_train, cv=n_folds)
print(f"Cross-validated Accuracy: {scores.mean():.2f} (+/- {scores.std() * 2:.2f})")


Cross-validated Accuracy: 0.78 (+/- 0.02)


Metrics Analysis

Precision and Recall:
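Precision: For class 0 (non-delayed flights), precision is 87%: of all flights the model predicted as non-delayed, 87% were actually not delayed. For class 1 (delayed flights), precision is lower at 66%, so roughly one in three predicted delays is a false alarm.
Recall: For class 1 (delayed flights), recall is 80%: the model correctly identified 80% of all flights that were actually delayed. For class 0, recall is 77%.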


F1-score:
The F1-score (the harmonic mean of precision and recall) is 0.82 for class 0 and 0.73 for class 1.

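Accuracy (overall fraction of correct predictions):
The model has an overall accuracy of 78% on the test set, meaning that 78% of all predictions made (for both delayed and non-delayed flights) were correct. Note that accuracy is a different metric from precision, which is reported per class above.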

Cross-validated Accuracy:
The 5-fold cross-validated accuracy on the training set is 0.78, with a spread of +/- 0.02 (two standard deviations of the per-fold scores). The small variation between folds, together with the matching 78% accuracy on the held-out test set, suggests that the model's performance is stable and that it generalizes well to new data.

In [40]:
# Export model
# Persist the fitted classifier to disk so it can be reloaded without retraining.
# joblib files are pickle-based: only load them from trusted sources.

import joblib

model_path = 'rf_model.joblib'
joblib.dump(rf_classifier, filename=model_path)


['rf_model.joblib']