# In [None]:
# Configuration
'''Project globals'''

# Directory layout, all relative to the working directory
WORKING_DIRECTORY = '..'
DATA_DIRECTORY = WORKING_DIRECTORY + '/data'
RAW_DATA_DIRECTORY = DATA_DIRECTORY + '/raw'
INTERIM_DATA_DIRECTORY = DATA_DIRECTORY + '/interim'
PROCESSED_DATA_DIRECTORY = DATA_DIRECTORY + '/processed'
MODEL_DIRECTORY = WORKING_DIRECTORY + '/models'

# Raw inputs
RAW_INCIDENTS_MDB_FILE = RAW_DATA_DIRECTORY + '/avall.mdb'
RAW_INCIDENTS_CSV_FILE = RAW_DATA_DIRECTORY + '/incidents.csv'
RAW_ONTIME_CSV_FILE = RAW_DATA_DIRECTORY + '/ontime.csv'

# Interim extracts
EXTRACTED_INCIDENTS_FILE = INTERIM_DATA_DIRECTORY + '/incidents.csv'
EXTRACTED_ONTIME_FILE = INTERIM_DATA_DIRECTORY + '/ontime.csv'

# Processed datasets
COMBINED_DATAFILE = PROCESSED_DATA_DIRECTORY + '/combined_data.csv'
ENCODED_DATAFILE = PROCESSED_DATA_DIRECTORY + '/all_encoded.csv'
TRAINING_DATAFILE = PROCESSED_DATA_DIRECTORY + '/train_encoded.csv'
TESTING_DATAFILE = PROCESSED_DATA_DIRECTORY + '/test_encoded.csv'

# Serialized model
MODEL = MODEL_DIRECTORY + '/model.pkl'

# Resource URLs
INCIDENT_DATA_URL = 'https://data.ntsb.gov/avdata/FileDirectory/DownloadFile?fileID=C%3A%5Cavdata%5Cavall.zip'
ONTIME_DATA_URL = 'https://www.bts.gov/browse-statistical-products-and-data/bts-publications/airline-service-quality-performance-234-time'
ONTIME_DATA_LINK_PREFIX = 'https://www.bts.dot.gov'

# Number of on-time performance files to download and parse
ONTIME_FILES = 3

'''Functions to download, extract, and parse data.'''

import io
import glob
import zipfile
from pathlib import Path

import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
from access_parser import AccessParser


def download_data(url:str, raw_data_directory:str, raw_incidents_mdb_file:str) -> None:
    '''Downloads the zip archive at url and extracts it into the raw data
    directory. Does nothing when raw_incidents_mdb_file already exists.

    Args:
        url: location of the zip archive to download.
        raw_data_directory: directory the archive is extracted into.
        raw_incidents_mdb_file: path whose presence means the data has
            already been downloaded.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        zipfile.BadZipFile: if the response body is not a zip archive.
    '''

    # Only download the file if we don't already have it
    if Path(raw_incidents_mdb_file).is_file():
        return

    # Get the archive from URL. Fail on an HTTP error here rather than
    # handing an error page to the zip reader.
    response=requests.get(url, timeout=10)
    response.raise_for_status()

    # Extract to disk, closing the archive when done
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(raw_data_directory)


def parse_mdb(raw_incidents_mdb_file:str, raw_incidents_csv_file:str) -> None:
    '''Exports the 'aircraft' table of an MDB file to csv. Skipped when the
    csv file is already on disk.'''

    # Nothing to do if a previous run already produced the csv
    if Path(raw_incidents_csv_file).is_file():
        return

    # Read the aircraft table out of the database
    aircraft_table = AccessParser(raw_incidents_mdb_file).parse_table('aircraft')

    # Write it out as csv, without the dataframe index
    pd.DataFrame.from_dict(aircraft_table).to_csv(raw_incidents_csv_file, index=False)


def get_ontime_links(url:str) -> list:
    '''Uses requests and beautifulsoup to collect the links that point to
    zip files from the page at url.

    Args:
        url: page listing the on-time performance downloads.

    Returns:
        The href values, in page order, of every <a> tag whose href ends
        in '.zip'.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    '''

    # Get download links page as HTML string. The timeout matches the other
    # requests calls in this module so a stalled server cannot hang the run.
    response=requests.get(url, timeout=10)
    response.raise_for_status()

    # Convert to BeautifulSoup object, parsing only the <a> tags
    soup=BeautifulSoup(response.text, 'html.parser', parse_only=SoupStrainer('a'))

    # find_all returns only <a> tags that carry an href, so nothing here
    # depends on what else iterating the soup itself would yield
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].endswith('.zip')
    ]


def download_ontime_data(links:list, ontime_data_link_prefix:str, raw_data_directory:str) -> None:
    '''Takes list of data download urls, downloads and extracts each zip
    archive into the raw data directory.

    A copy of each downloaded archive is kept in raw_data_directory under
    the link's file name. That copy is what the "already have it" check
    looks for, so a link is only downloaded once.

    Args:
        links: hrefs of the zip files; absolute urls are used as-is, any
            other value is appended to ontime_data_link_prefix.
        ontime_data_link_prefix: scheme and host to prepend to relative links.
        raw_data_directory: directory archives are saved and extracted into.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        zipfile.BadZipFile: if a response body is not a zip archive.
    '''

    target_directory=Path(raw_data_directory)

    # Loop on list of links
    for link in links:

        # Only download if we don't already have the file. The path is built
        # with a real separator between directory and file name.
        archive_path=target_directory / link.split('/')[-1]

        if archive_path.is_file():
            continue

        # Build the download url without doubling the slash at the join
        if link.startswith(('http://', 'https://')):
            complete_link=link
        else:
            complete_link=f"{ontime_data_link_prefix.rstrip('/')}/{link.lstrip('/')}"

        # Download the zip file
        response=requests.get(complete_link, timeout=10)
        response.raise_for_status()

        # Extract the zip file to the raw data directory
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(raw_data_directory)

        # Keep the archive only after a successful extraction, so a corrupt
        # download is retried on the next run instead of being skipped
        target_directory.mkdir(parents=True, exist_ok=True)
        archive_path.write_bytes(response.content)


def parse_asc_datafiles(n_files:int, raw_data_directory:str, raw_ontime_csv_file:str) -> pd.DataFrame:
    '''Reads .asc files from raw data directory, combines them into one
    pandas dataframe and saves it as csv.

    Args:
        n_files: maximum number of .asc files to read.
        raw_data_directory: directory searched for '*.asc' files.
        raw_ontime_csv_file: path the combined data is written to.

    Returns:
        The combined dataframe with a fresh 0..n-1 index.

    Raises:
        ValueError: if no .asc file is selected (none found, or n_files is 0).
    '''

    # Get list of ASCII files from raw data directory. glob returns them in
    # no particular order, so sort to make the first n_files reproducible.
    asc_files=sorted(glob.glob(f'{raw_data_directory}/*.asc'))[:n_files]

    if not asc_files:
        raise ValueError(f'No .asc files to parse in {raw_data_directory}')

    # Loop on the ASCII data files
    data_dfs=[]

    for asc_file in asc_files:
        print(asc_file)

        # Read the file into a Pandas dataframe and collect in list
        # NOTE(review): the first line of each file is read as a header row;
        # confirm the .asc files really have one, otherwise header=None is needed.
        data_dfs.append(pd.read_table(asc_file, sep='|', low_memory=False))

    # Combine the list of Pandas dataframes with a clean index
    data_df=pd.concat(data_dfs, axis=0, ignore_index=True)

    # Save to CSV
    data_df.to_csv(raw_ontime_csv_file, index=False)

    return data_df

# Data Acquisition 
from pathlib import Path

import pandas as pd
import functions.data_acquisition as data_funcs
import configuration as config

# Create the raw, interim and processed data directories if they are missing
for data_directory in (
    config.RAW_DATA_DIRECTORY,
    config.INTERIM_DATA_DIRECTORY,
    config.PROCESSED_DATA_DIRECTORY,
):
    Path(data_directory).mkdir(parents=True, exist_ok=True)

# Incident data: download the archive, then export the MDB to csv
data_funcs.download_data(
    config.INCIDENT_DATA_URL,
    config.RAW_DATA_DIRECTORY,
    config.RAW_INCIDENTS_MDB_FILE,
)
table = data_funcs.parse_mdb(
    config.RAW_INCIDENTS_MDB_FILE,
    config.RAW_INCIDENTS_CSV_FILE,
)

# On-time data: find the download links, fetch the first few, parse them
links = data_funcs.get_ontime_links(config.ONTIME_DATA_URL)
data_funcs.download_ontime_data(
    links[:config.ONTIME_FILES],
    config.ONTIME_DATA_LINK_PREFIX,
    config.RAW_DATA_DIRECTORY,
)

ontime_df = data_funcs.parse_asc_datafiles(
    config.ONTIME_FILES,
    config.RAW_DATA_DIRECTORY,
    config.RAW_ONTIME_CSV_FILE,
)

# Load the exported incidents and take a first look
incidents_df = pd.read_csv(config.RAW_INCIDENTS_CSV_FILE, low_memory=False)
incidents_df.info()

pd.set_option('display.max_rows', 200)
incidents_df.head().transpose()

# Incident columns to keep, mapped to the shared feature names
incident_features = {
    'dprt_time': 'departure_time',
    'dprt_apt_id': 'origin',
    'dest_apt_id': 'destination',
    'regis_no': 'tail_number',
}

# Select and rename the incident features, label every row as an incident,
# drop incomplete rows and save
extracted_incident_df = (
    incidents_df[incident_features.keys()]
    .copy()
    .rename(columns=incident_features)
)
extracted_incident_df['incident'] = [1] * len(extracted_incident_df)
extracted_incident_df = extracted_incident_df.dropna()
extracted_incident_df.to_csv(config.EXTRACTED_INCIDENTS_FILE, index=False)
extracted_incident_df.head()

extracted_incident_df.info()
ontime_df.info()

# On-time features are picked by column position. Positions not currently
# used: carrier 0, flight_number 1, date 8.
ontime_features = {
    'origin': 6,
    'destination': 7,
    'departure_time': 12,
    'tail_number': 25,
}

# Select the on-time features by position, name them, label every row as a
# non-incident, drop incomplete rows and save
ontime_positions = list(ontime_features.values())
extracted_ontime_df = ontime_df.iloc[:, ontime_positions].copy()
extracted_ontime_df.columns = list(ontime_features)
extracted_ontime_df['incident'] = [0] * len(extracted_ontime_df)
extracted_ontime_df = extracted_ontime_df.dropna()
extracted_ontime_df.to_csv(config.EXTRACTED_ONTIME_FILE, index=False)
extracted_ontime_df.head()
extracted_ontime_df.info()

# Stack on-time flights and incidents, fix the column types and save
data_df = pd.concat(
    [extracted_ontime_df, extracted_incident_df],
    axis=0,
    ignore_index=True,
)
data_df = data_df.astype({
    'origin': str,
    'destination': str,
    'departure_time': float,
    'tail_number': str,
    'incident': int,
})
data_df.info()
data_df.to_csv(config.COMBINED_DATAFILE, index=False)

# Data Preparation
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import configuration as config 

# Load the combined dataset and take a first look
data_df = pd.read_csv(config.COMBINED_DATAFILE)
data_df.info()
print(data_df.head())
airport_features = ['origin', 'destination']

# Cardinality and most common levels of the airport features
print('Unique levels by feature')
data_df[airport_features].nunique()
for airport_feature in airport_features:
    print(data_df[airport_feature].value_counts().head(15))

# Plot level counts for the 15 busiest origin and destination airports
for airport_feature, axis_name in (('origin', 'Origin'), ('destination', 'Destination')):
    plt.figure(figsize=(15, 5))
    level_counts = data_df[airport_feature].value_counts().head(15).index
    sns.countplot(data=data_df, y=airport_feature, order=level_counts)

    plt.title(f'Top 15 {airport_feature} airports')
    plt.xlabel('No. of flights')
    plt.ylabel(f'{axis_name} airport')
    plt.show()

# Class balance: number of safe flights versus incidents
data_df['incident'].value_counts()
level_counts = data_df['incident'].value_counts()
new_labels = ['Safe', 'Incident']
bar_positions = range(len(level_counts))
plt.bar(
    list(bar_positions),
    level_counts,
    tick_label=level_counts.index,
    color=('green', 'red'),
)
plt.title('Safe Vs Incident Flights')
plt.xlabel('All Flights')
plt.xticks(bar_positions, new_labels)
plt.ylabel('Flights')
plt.show()
print(data_df.head())

# Combine origin and destination into a single route feature
data_df['route'] = data_df['origin'] + '_' + data_df['destination']

print(data_df.head())

# Work on a copy for the encoding steps
cyclical_encoded_data_df = data_df.copy()

cyclical_encoded_data_df.head().T
# Function to convert HHMM format to minutes since midnight
def hhmm_to_minutes(hhmm):
    '''Converts a clock time in HHMM format to minutes since midnight.

    Args:
        hhmm: the time as a number or numeric string, e.g. 1230, 1230.0
            or '0945'. It is converted with int() before use.

    Returns:
        Minutes since midnight as an int, e.g. 750 for 1230.

    Raises:
        ValueError: if hhmm cannot be converted to an integer (e.g. NaN).
    '''
    # Split the integer value into its HH and MM parts in one step
    hours, minutes = divmod(int(hhmm), 100)
    return hours * 60 + minutes

# Add minutes since midnight column
# Minutes since midnight and a readable HH:MM label for each departure
departure_times = cyclical_encoded_data_df['departure_time']
cyclical_encoded_data_df['Time'] = departure_times.apply(hhmm_to_minutes)
cyclical_encoded_data_df['time_label'] = departure_times.apply(
    lambda x: f"{int(x)//100:02d}:{int(x)%100:02d}"
)

# Cyclical encoding: map the time of day onto a circle (1440 minutes in a day)
time_angle = 2 * np.pi * cyclical_encoded_data_df['Time'] / 1440
cyclical_encoded_data_df['time_sin'] = np.sin(time_angle)
cyclical_encoded_data_df['time_cos'] = np.cos(time_angle)

# Display the DataFrame with the new cyclical encoding columns
print("DataFrame with cyclical time encoding:")
print(cyclical_encoded_data_df.head(3).round(4).T)

# Sine and cosine components against time, first 10K flights only
filtered_data = cyclical_encoded_data_df.head(10000)

plt.figure(figsize=(10, 6))
for component, legend_label in (
    ('time_sin', 'Sine Encoding'),
    ('time_cos', 'Cosine Encoding'),
):
    plt.scatter(filtered_data['Time'], filtered_data[component], label=legend_label, marker='o')

plt.title("Visualization of Cyclical Encoding for Departure Times (First 10K Rows)")
plt.xlabel("Minutes Since Midnight")
plt.ylabel("Encoded Values")
plt.legend()
plt.grid()
plt.show()

# The same encoding drawn on the unit circle, first 1000 flights only
filtered_data = cyclical_encoded_data_df.head(1000)

plt.figure(figsize=(8, 8))
plt.scatter(
    filtered_data['time_cos'],
    filtered_data['time_sin'],
    c=filtered_data['Time'],
    cmap='viridis',
    s=50,
)

# Dashed unit circle for reference
circle = plt.Circle((0, 0), 1, color='black', fill=False, linestyle='--', linewidth=0.8)
current_axes = plt.gca()
current_axes.add_artist(circle)

# Equal aspect ratio so the circle is not distorted
current_axes.set_aspect('equal', adjustable='datalim')
plt.title("Circular Visualization of Cyclical Encoded Time")
plt.xlabel("Cosine Component")
plt.ylabel("Sine Component")
plt.grid(alpha=0.3)
plt.colorbar(label="Minutes Since Midnight")
plt.show()
cyclical_encoded_data_df.head(5)

# Drop the raw columns that the encoded features replace
replaced_columns = ['origin', 'destination', 'departure_time', 'tail_number', 'Time', 'time_label']
cyclical_encoded_data_df = cyclical_encoded_data_df.drop(columns=replaced_columns)

cyclical_encoded_data_df.head(5)

# Frequency encoding for 'route': replace each route by its row count
route_frequency = cyclical_encoded_data_df['route'].value_counts()
cyclical_encoded_data_df['route_encoded'] = cyclical_encoded_data_df['route'].map(route_frequency)
cyclical_encoded_data_df = cyclical_encoded_data_df.drop(columns=['route'])

# Display the updated dataset
print("DataFrame with frequency-encoded features:")
print(cyclical_encoded_data_df.head(5).T)

data_df = cyclical_encoded_data_df.copy()
data_df.head(5).T

# Hold out 25% of the rows for testing; the seed keeps the split reproducible
train_df, test_df=train_test_split(
    data_df,
    test_size=0.25, 
    random_state=315
)

train_df.head(5).T

# Give both splits a clean 0..n-1 index
train_df=train_df.reset_index(drop=True)
train_df.head(5).T
test_df=test_df.reset_index(drop=True)
test_df.head(5).T

Path(config.PROCESSED_DATA_DIRECTORY).mkdir(parents=True, exist_ok=True)

# Write without the index: otherwise it is read back by pd.read_csv as an
# 'Unnamed: 0' column and ends up among the model's features
data_df.to_csv(config.ENCODED_DATAFILE, index=False)
train_df.to_csv(config.TRAINING_DATAFILE, index=False)
test_df.to_csv(config.TESTING_DATAFILE, index=False)

# Model Training
import pickle
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import uniform, randint
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV, HalvingGridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns

import configuration as config

# Load the encoded training and testing splits
train_df=pd.read_csv(config.TRAINING_DATAFILE)
test_df=pd.read_csv(config.TESTING_DATAFILE)

# A csv saved with its dataframe index comes back with an 'Unnamed: 0' column
# holding row numbers. Drop any such column so it is not used as a feature.
train_df=train_df.loc[:, ~train_df.columns.str.startswith('Unnamed:')]
test_df=test_df.loc[:, ~test_df.columns.str.startswith('Unnamed:')]

def cross_val_boosting_model(training_data: pd.DataFrame, testing_data: pd.DataFrame, target_variable: str, model):
    '''Fits model on the training data, predicts the testing data and prints
    the 5-fold cross-validation scores obtained on the training data.

    Returns the training features, training labels, testing labels and the
    predictions for the testing data, in that order.'''
    # Separate labels from features for both splits
    train_labels = training_data[target_variable]
    train_features = training_data.drop(columns=target_variable)
    test_labels = testing_data[target_variable]
    test_features = testing_data.drop(columns=target_variable)

    # Fit on the full training split and predict the held-out split
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)

    # Cross-validate on the training split only
    print(cross_val_score(model, train_features, train_labels, cv=5))

    return train_features, train_labels, test_labels, predictions

# Baseline model with default hyperparameters
hist_boost_model = HistGradientBoostingClassifier(random_state=42)
X_train, y_train, y_test, y_pred = cross_val_boosting_model(
    training_data=train_df,
    testing_data=test_df,
    target_variable='incident',
    model=hist_boost_model,
)

# Plot a confusion matrix to evaluate the model's performance on unseen data
def confusion_plot(y_test, y_pred, model):
    '''Plots the row-normalized confusion matrix of the predictions, with the
    overall accuracy in the title.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
        model: fitted classifier; its classes_ attribute labels the axes.
    '''
    acc = accuracy_score(y_test, y_pred)*100

    # normalize='true' turns each row into fractions of the true class
    conf_matrix = confusion_matrix(y_test, y_pred, normalize='true')
    fig, ax = plt.subplots(figsize=(8,6), dpi=100)
    matrix_display = ConfusionMatrixDisplay(conf_matrix, display_labels=model.classes_)

    # The title names the flight incident model this project trains
    ax.set(title=f'Confusion Matrix for the Flight Incident Model with {acc:.2f}% overall accuracy')
    matrix_display.plot(ax=ax, values_format='.2%')

# Confusion matrix of the baseline model on the testing data
confusion_plot(y_test=y_test, y_pred=y_pred, model=hist_boost_model)
# Use one of the sklearn hyperparameter optimization classes to optimize the model
def get_best_params(model, optimizer):
    '''Builds a hyperparameter search for model with the given optimizer
    class and returns it. The search is not fitted here.

    optimizer is called as optimizer(model, search_space, scoring=..., cv=...,
    n_jobs=...), with 5-fold cross-validation scored by balanced accuracy.'''
    search_space = dict(
        learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
        max_iter=[100, 200, 500, 1000],
        max_leaf_nodes=[15, 31, 63, 127],
        l2_regularization=[0.0, 0.1, 1.0, 10.0],
    )

    return optimizer(
        model,
        search_space,
        scoring='balanced_accuracy',
        cv=5,
        n_jobs=-1,
    )

# Build the successive-halving grid search around the baseline model
grid = get_best_params(hist_boost_model, HalvingGridSearchCV)

# Set to False to skip the search and go straight to the model below
run_grid = True

if run_grid:
    import warnings

    # Silence warnings only while the search runs. The previous filters are
    # restored afterwards, and warnings.warn itself is left untouched.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        grid.fit(X_train, y_train)

    print(f"Best hyperparameters: {grid.best_params_}")

# Model with fixed, tuned hyperparameters (not read from grid.best_params_)
optimized_hist_boost_model = HistGradientBoostingClassifier(l2_regularization= 0.1, learning_rate= 0.01, max_iter= 1000, max_leaf_nodes= 15, random_state=42)

X_train, y_train, y_test, y_pred = cross_val_boosting_model(train_df, test_df, 'incident', optimized_hist_boost_model) 

def get_predictions(training_data: pd.DataFrame, testing_data: pd.DataFrame, target_variable: str, model):
    '''Fits model on the training data, then predicts and prints the labels
    of the testing data.

    Returns the predictions and the true testing labels, in that order.'''
    # Train on every column except the target
    model.fit(
        training_data.drop(columns=target_variable),
        training_data[target_variable],
    )

    # Predict the held-out rows
    predictions = model.predict(testing_data.drop(columns=target_variable))
    print(predictions)

    return predictions, testing_data[target_variable]

# Predictions and confusion matrix for the optimized model
y_pred, y_test = get_predictions(
    training_data=train_df,
    testing_data=test_df,
    target_variable='incident',
    model=optimized_hist_boost_model,
)

confusion_plot(y_test=y_test, y_pred=y_pred, model=optimized_hist_boost_model)

def probabilities_plot(model, testing_data):
    '''Plots the distribution of the model's predicted incident probability
    on the testing data, split by the actual outcome.'''
    outcome_names = {0: 'Non-Incident', 1: 'Incident'}
    test_features = testing_data.drop(columns=['incident'])

    # Column 1 of predict_proba is taken as the incident probability
    plot_df = pd.DataFrame({
        'probability': model.predict_proba(test_features)[:, 1],
        'actual': testing_data['incident'].map(outcome_names),
    })

    # One density histogram per outcome, each normalized on its own
    plt.figure(figsize=(10, 6))
    sns.histplot(
        data=plot_df,
        x='probability',
        hue='actual',
        bins=25,
        kde=True,
        stat='density',
        common_norm=False,
    )
    plt.title('Predicted Probabilities by Actual Outcome')
    plt.xlabel('Predicted Probability of Incident')
    plt.ylabel('Density')
    plt.show()

# Plot predicted incident probabilities of the optimized model on the testing data
probabilities_plot(optimized_hist_boost_model, test_df)

# Make sure the model directory exists before writing into it
Path(config.MODEL_DIRECTORY).mkdir(parents=True, exist_ok=True)

# Both models are pickled one after the other into the same file: the baseline
# first, the optimized model second.
# NOTE(review): a single pickle.load() on this file returns only the baseline
# model; a second load is needed to get the optimized one. Confirm the consumer
# reads both.
with open(config.MODEL, 'wb') as output_file:
    pickle.dump(hist_boost_model, output_file)
    pickle.dump(optimized_hist_boost_model, output_file)






