In [None]:
# Cell 1 - Import required libraries
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [None]:
# Cell 2 - Load the data
# Read the processed dataset from a path relative to the notebook's
# working directory.
df = pd.read_csv('../Data/Processed_data15.csv')

# Preview the first rows, then show the column/dtype/non-null summary.
print("First few rows of the dataset:")
display(df.head())
print("\nDataset information:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display(): that would render a stray "None" under the report.
df.info()

In [None]:
# Cell 3 - Prepare the data
# Swap each categorical column for integer codes. Every column gets its own
# fitted encoder, kept in a dedicated variable (le_carrier, le_dest, le_origin).
# NOTE: this overwrites the columns of `df` in place, so re-running this cell
# re-fits the encoders on the already-encoded values.
le_carrier, le_dest, le_origin = LabelEncoder(), LabelEncoder(), LabelEncoder()
for column_name, encoder in (
    ('carrier', le_carrier),
    ('dest', le_dest),
    ('origin', le_origin),
):
    df[column_name] = encoder.fit_transform(df[column_name])

# Features are the first six columns, taken as a NumPy array; the target is
# the 'delayed' column.
# NOTE(review): the slice was originally described as running from the year
# column to the distance column - confirm the CSV's column order so the six
# selected columns are the intended ones.
X = df.iloc[:, :6].values
y = df['delayed']

# Hold out 25% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=61,
)

In [None]:
# Cell 4 - Create and train Decision Tree
# Build a tree capped at depth 3 with a fixed seed and fit it on the training
# split in one expression (fit() hands back the fitted estimator).
dt_classifier = DecisionTreeClassifier(
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the held-out rows, used by the evaluation cell.
y_pred = dt_classifier.predict(X_test)

In [None]:
# Cell 5 - Model Evaluation
# Overall accuracy on the held-out split.
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Text summary of the per-class metrics.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap of integer counts.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Cell 6 - Visualize Decision Tree
# Display labels for the tree diagram (feature_names is reused by the
# feature-importance cell).
# NOTE(review): both lists are hard-coded. Confirm that feature_names follows
# the order of the six columns selected for X and that class_names follows the
# sorted order of the values in the 'delayed' column.
feature_names = ['Year', 'Month', 'Day', 'Carrier', 'Origin', 'Destination']
class_names = ['On Time', 'Delayed']

# Draw the fitted tree on a wide canvas so the node text stays readable.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    dt_classifier,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    fontsize=10,
    ax=ax,
)
ax.set_title("Decision Tree Visualization")
plt.show()

In [None]:
# Cell 7 - Feature Importance
# Pair each display label (feature_names comes from the visualization cell)
# with the fitted tree's importance score, highest score first.
importances = dt_classifier.feature_importances_
feature_importance = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the ranked importances.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance, ax=ax)
ax.set_title('Feature Importance in Decision Tree Model')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

In [None]:
# Cell 8 - Save the model (optional)
# `pickle` and `Path` are imported in the import cell at the top of the notebook.
model_path = Path('../models/decision_tree_model.pkl')

# Create the target directory first: opening the file for writing raises
# FileNotFoundError when '../models' does not exist yet (e.g. a fresh checkout).
model_path.parent.mkdir(parents=True, exist_ok=True)

# Serialize the fitted classifier.
# NOTE(review): only the tree is saved; the fitted label encoders (le_carrier,
# le_dest, le_origin) are not, so a consumer of this file cannot encode raw
# carrier/dest/origin values the same way - confirm whether they should be
# persisted as well.
with model_path.open('wb') as f:
    pickle.dump(dt_classifier, f)

print("Model saved successfully!")