In [None]:
# for data handling
import pandas as pd
import numpy as np

# train-test split
from sklearn.model_selection import train_test_split

# loss functions for today
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# dummy models for comparison
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier

# regression models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# for removing outliers
from scipy import stats

# for identifying best parameters
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Read the two raw tables from the working directory.
# Both airport-code columns of the flights file are forced to `str` while parsing.
flight_df = pd.read_csv(
    'flights.csv',
    dtype=dict(ORIGIN_AIRPORT=str, DESTINATION_AIRPORT=str),
)
airports_df = pd.read_csv('airports.csv')

In [None]:
# Take working copies so the frames loaded from disk are not modified by the
# preprocessing that follows.
f_df, a_df = (frame.copy() for frame in (flight_df, airports_df))

In [None]:
# Attach the origin airport's state: left-join the airport table on the origin
# code, keep the joined STATE column under an origin-specific name, and discard
# the join key coming from the airport table.
f_df = (
    f_df.merge(a_df, how='left', left_on='ORIGIN_AIRPORT', right_on='IATA_CODE')
        .rename(columns={'STATE': 'ORIGIN_STATE'})
        .drop(columns=['IATA_CODE'])
)

# Same join for the destination airport. Because the airport table's remaining
# columns are already present from the first join, this second join yields
# `_x` / `_y` suffixed duplicates of them.
f_df = (
    f_df.merge(a_df, how='left', left_on='DESTINATION_AIRPORT', right_on='IATA_CODE')
        .rename(columns={'STATE': 'DESTINATION_STATE'})
        .drop(columns=['IATA_CODE'])
)

In [None]:
# Remove the suffixed airport metadata columns left over from the two airport
# joins; only the state columns are kept from the airport table.
f_df = f_df.drop(columns=[
    'COUNTRY_x', 'LATITUDE_x', 'LONGITUDE_x', 'AIRPORT_x', 'CITY_x',
    'COUNTRY_y', 'LATITUDE_y', 'LONGITUDE_y', 'AIRPORT_y', 'CITY_y',
])

# Exclude every row whose MONTH equals 10.
# NOTE(review): presumably October is excluded because its airport codes do not
# match the airport lookup table -- confirm against the data.
f_df = f_df.loc[f_df['MONTH'].ne(10)]

In [None]:
def _split_hhmm(clock_values):
    """Split HHMM clock readings into separate hour and minute Series.

    The values are interpreted numerically (hour = value // 100,
    minute = value % 100) instead of being parsed as text with '%H%M'.
    Text parsing is ambiguous once leading zeros are gone: the number 130
    reads as 13:00 rather than 01:30, and 5 does not parse at all.

    Parameters
    ----------
    clock_values : pandas.Series
        Clock readings such as 5, 130, 2359 (numbers or numeric strings).

    Returns
    -------
    (pandas.Series, pandas.Series)
        Hours and minutes aligned with the input index. Entries that are not
        a whole-number time between 00:00 and 23:59 become NaN in both.
    """
    hhmm = pd.to_numeric(clock_values, errors='coerce')
    hours = hhmm // 100
    minutes = hhmm % 100
    # NaN compares False in `between`, so unparseable input is flagged invalid too.
    is_valid = hours.between(0, 23) & minutes.between(0, 59) & (hhmm % 1 == 0)
    return hours.where(is_valid), minutes.where(is_valid)


# NOTE(review): the flights file is read without a dtype for these columns, so
# zero-padded values such as 0005 are presumably loaded as the integer 5 --
# hence the numeric split above rather than string parsing.
f_df['DEPARTURE_HOUR'], f_df['DEPARTURE_MINUTE'] = _split_hhmm(f_df['SCHEDULED_DEPARTURE'])
f_df['ARRIVAL_HOUR'], f_df['ARRIVAL_MINUTE'] = _split_hhmm(f_df['SCHEDULED_ARRIVAL'])

# The raw HHMM columns are fully represented by the hour/minute features now.
f_df = f_df.drop(['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL'], axis=1)



In [None]:
# Discard YEAR along with the columns describing the actual departure, taxiing,
# flight time, arrival, diversion/cancellation and the per-cause delay breakdown.
# NOTE(review): presumably removed because they would not be known when the
# prediction is made -- confirm.
f_df = f_df.drop(columns=[
    'YEAR',
    'DEPARTURE_DELAY', 'DEPARTURE_TIME', 'TAXI_OUT', 'WHEELS_OFF',
    'ELAPSED_TIME', 'AIR_TIME',
    'WHEELS_ON', 'TAXI_IN', 'ARRIVAL_TIME',
    'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
    'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY',
])

# Drop TAIL_NUMBER for now due to value error
f_df = f_df.drop(columns=['TAIL_NUMBER', 'FLIGHT_NUMBER', 'DISTANCE'])



In [None]:
# Categorical columns that are expanded into indicator columns.
columns_to_encode = ['AIRLINE', 'ORIGIN_STATE', 'DESTINATION_STATE']

# The airport codes themselves are removed; the state columns stay and are
# one-hot encoded together with the airline. `drop_first=True` omits the first
# level of each encoded column.
f_df = pd.get_dummies(
    f_df.drop(columns=['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']),
    columns=columns_to_encode,
    drop_first=True,
)


In [None]:
# Keep only complete rows: any row with at least one missing value is removed.
f_df = f_df.dropna(axis=0, how='any')

In [None]:

# Rows whose ARRIVAL_DELAY lies more than `threshold` standard deviations from
# the mean are treated as outliers.
threshold = 3

z_scores = stats.zscore(f_df['ARRIVAL_DELAY'])
abs_z_scores = np.abs(z_scores)
outliers = abs_z_scores > threshold

# Keep every row that is not flagged as an outlier.
f_df = f_df.loc[~outliers]

In [None]:
# Features are every column except the target; the target is ARRIVAL_DELAY.
X = f_df.drop(columns=['ARRIVAL_DELAY'])
y = f_df['ARRIVAL_DELAY']

# 70 % of the rows go to training, the rest to testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [None]:
# Search space sampled by the randomized hyper-parameter search.
param_distributions = {
    # number of trees: 100, 200, ..., 1000
    'n_estimators': np.arange(100, 1001, 100),
    # unlimited depth, or 10, 20, ..., 100
    'max_depth': [None] + list(np.arange(10, 101, 10)),
    'min_samples_split': np.arange(2, 11, 1),
    'min_samples_leaf': np.arange(1, 5, 1),
    'bootstrap': [True, False],
    # 'auto' is deliberately not offered: current scikit-learn releases reject
    # it for random forests, and for regressors it meant "use all features",
    # which the fraction 1.0 below already covers.
    'max_features': ['sqrt', 'log2'] + list(np.linspace(0.1, 1.0, 10)),
}

#print(param_distributions)

In [None]:
# Base model to tune. It is seeded so that repeated runs of the search build
# the same forests and therefore report the same best parameters.
rf = RandomForestRegressor(random_state=42)

# Random search over 100 sampled parameter combinations, each evaluated with
# 3-fold cross validation, using all available cores.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model
random_search.fit(X_train, y_train)

# Report the best parameter combination and its mean cross-validated score
# (no `scoring` is given, so the regressor's own score, R-squared, is used).
print("Best Parameters: ", random_search.best_params_)
print("Best R-squared Score: {:.4f}".format(random_search.best_score_))



Switch out the parameters below with the 'best' ones given by the random search and see what score it produces.

In [None]:
# Fit a forest with hand-picked parameters and evaluate it on the held-out set.
# The estimator is seeded so the reported numbers are reproducible.
rf = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=8,
    max_depth=30,
    random_state=42,
)
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)

# `score` on a regressor is the coefficient of determination (R^2), not an
# accuracy, so the output is labelled accordingly.
print("R^2 on training set: {:.3f}".format(rf.score(X_train, y_train)))
print("R^2 on test set: {:.3f}".format(rf.score(X_test, y_test)))

mse = mean_squared_error(y_test, rf_y_pred)

print(f'Mean Squared Error: {mse}')