In [1]:
#importing our dependencies

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf
from pathlib import Path

from sklearn.linear_model import LinearRegression
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from sqlalchemy import inspect
import sqlite3

## Filtering and Processing Data

In [3]:
# SQLAlchemy engine for the SQLite file at the relative path Resources/flights.db.
engine = create_engine("sqlite:///Resources/flights.db")

# List the table names in the database (shown as the cell output).
inspector = inspect(engine)
inspector.get_table_names()

['flights']

In [4]:
# Legacy CSV load, kept for reference only; the data is read from the SQLite engine instead.
#flights_df = pd.read_csv('Resources/flights_update.csv')
# Open a connection on the SQLite engine; a later cell passes `conn` to pd.read_sql.
# NOTE(review): none of the visible cells closes this connection - confirm whether a conn.close() is wanted.
conn = engine.connect()

In [None]:
# Show the dtype of every column.
# NOTE(review): no earlier cell assigns `flights_df` (the CSV load above is commented out),
# so on a fresh kernel this cell raises NameError - confirm the intended cell order.
flights_df.dtypes

In [5]:
# Query all records in the database
flights_data_df = pd.read_sql("SELECT * FROM flights", conn)

# Every later cell operates on `flights_df`, but the CSV load that used to define it
# is commented out. Bind a working copy here so the notebook runs from a fresh kernel
# and the raw query result stays untouched.
flights_df = flights_data_df.copy()

flights_data_df.head()

Unnamed: 0,MONTH,DATE,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DISTANCE_miles,MINUTES_DELAY
0,1,1,4,AS,98,N407AS,ANC,SEA,1448,-22
1,1,1,4,AA,2336,N3KUAA,LAX,PBI,2330,-9
2,1,1,4,US,840,N171US,SFO,CLT,2296,5
3,1,1,4,AA,258,N3HYAA,LAX,MIA,2342,-9
4,1,1,4,AS,135,N527AS,SEA,ANC,1448,-21


In [None]:
# MINUTES_DELAY is the basis for the target label built later, so force it to int64.
flights_df['MINUTES_DELAY'] = flights_df['MINUTES_DELAY'].values.astype(np.int64)

In [None]:
#checking dtypes again
flights_df.dtypes

In [None]:
# Drop TAIL_NUMBER. Per the original author's note this was done because of local
# resource limits. TODO: add the column back once that is no longer a constraint.
# errors='ignore' keeps the cell safe to re-run after the column is already gone
# (a plain drop raises KeyError on the second run).
flights_df = flights_df.drop(columns=['TAIL_NUMBER'], errors='ignore')
flights_df

In [None]:
# Binary target column: '0' where MINUTES_DELAY <= 0, '1' otherwise.
# The labels are stored as strings, not integers.
flights_df['IS_Delayed'] = np.where(flights_df['MINUTES_DELAY']<= 0, '0', '1')
flights_df.head(20)

In [None]:
# Drop MINUTES_DELAY now that IS_Delayed has been derived from it. Every remaining
# column becomes a model feature, so leaving it in would hand the model the answer.
# (Unlike TAIL_NUMBER, this column should NOT be added back.)
flights_df = flights_df.drop([ 'MINUTES_DELAY', ], axis=1)
flights_df

In [None]:
#new_flights_df datatypes to make sure the minutes_delayed is int
#new_flights_df.dtypes

In [None]:
#unique values
#new_flights_df.nunique()

## Get Dummies

In [None]:
# Preview of one-hot encoding the AIRLINE column only.
# NOTE(review): `new_flights_encoded` is not referenced by any later visible cell;
# the modelling path encodes through onehot_encode() instead - confirm this cell is still needed.
new_flights_encoded = pd.get_dummies(flights_df, columns=["AIRLINE"])
new_flights_encoded.head()

In [None]:
#OA_counts = flights_df.ORIGIN_AIRPORT.value_counts()
#OA_counts.head(20)

In [None]:
# Determine which values to replace if counts are less than ...?
#replace_OA = list(OA_counts[OA_counts <16186].index)
# Replace in dataframe
#for app in replace_OA:
#    flights_df.ORIGIN_AIRPORT = flights_df.ORIGIN_AIRPORT
# Check to make sure binning was successful
#flights_df.ORIGIN_AIRPORT.value_counts()

In [None]:
#DA_counts = flights_df.DESTINATION_AIRPORT.value_counts()
#DA_counts.head(20)

In [None]:
# Determine which values to replace if counts are less than ...?
#replace_DA = list(DA_counts[DA_counts <16184].index)

# Replace in dataframe
#for app in replace_DA:
#    flights_df.DESTINATION_AIRPORT = flights_df.DESTINATION_AIRPORT.replace(app,"Other")
    
# Check to make sure binning was successful
#flights_df.DESTINATION_AIRPORT.value_counts().drop("Other")


In [None]:
# Generate our categorical variable lists
#flight_cat = flights_df.dtypes[flights_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
#flights_df[flight_cat].nunique()

In [None]:
# Sample at most 200,000 rows for machine learning.
# - random_state makes the sample (and every downstream score) reproducible.
# - min(...) avoids the ValueError that sample() raises when n exceeds the row count.
flights_df = flights_df.sample(n=min(200000, len(flights_df)), random_state=123)


## Functions to make data ready for machine learning

In [None]:
# one Hot Encoder
def onehot_encode(flights_df, column_dict):
    """Replace categorical columns with one-hot indicator columns.

    Parameters
    ----------
    flights_df : pandas.DataFrame
        Input frame; it is copied, never modified.
    column_dict : dict
        Maps each column name to encode onto the prefix used for its
        indicator columns (named ``<prefix>_<value>``).

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns in their original order, followed by the
        indicator columns of each encoded column in ``column_dict`` order.
    """
    encoded = flights_df.copy()
    for source_col, dummy_prefix in column_dict.items():
        indicator_cols = pd.get_dummies(encoded[source_col], prefix=dummy_prefix)
        # Remove the raw categorical column and append its indicators at the end.
        encoded = pd.concat([encoded.drop(source_col, axis=1), indicator_cols], axis=1)

    return encoded
        
  

In [None]:
flights_df

In [None]:
#some functions to help 
#first were gonna make a copy of the flights_df

def preprocessing_inputs(flights_df):
    """Turn the flights frame into scaled train/test feature matrices and labels.

    Steps: one-hot encode AIRLINE / ORIGIN_AIRPORT / DESTINATION_AIRPORT,
    mean-impute any column that still has missing values, split off the
    'IS_Delayed' target, make a 60/40 train/test split (random_state=123),
    and standardize the features with a scaler fitted on the training rows only.

    Parameters
    ----------
    flights_df : pandas.DataFrame
        Must contain 'IS_Delayed' and the three categorical columns above.
        The input is copied, not modified.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features. They keep the row index of the split so they
        stay aligned with the label Series.
    y_train, y_test : pandas.Series
        'IS_Delayed' labels for the corresponding rows.
    """
    flights_df = flights_df.copy()

    # One-hot encode the nominal feature columns.
    flights_df = onehot_encode(
        flights_df,
        column_dict={
            'AIRLINE': 'AL',
            'ORIGIN_AIRPORT': 'OA',
            'DESTINATION_AIRPORT': 'DA'
        })

    # Fill remaining missing values with the column mean.
    # NOTE(review): the means are computed before the split, so test rows
    # contribute to the imputed values - confirm this is acceptable.
    remaining_na_columns = flights_df.columns[flights_df.isna().any()]
    for column in remaining_na_columns:
        flights_df[column] = flights_df[column].fillna(flights_df[column].mean())

    # Separate the target from the features.
    y = flights_df['IS_Delayed'].copy()
    X = flights_df.drop('IS_Delayed', axis=1).copy()

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6, random_state=123)

    # Standardize with statistics learned from the training rows only.
    scaler = StandardScaler()
    scaler.fit(X_train)

    # scaler.transform returns a bare ndarray; pass the split's index back in so
    # the feature frames stay aligned with y_train / y_test instead of silently
    # getting a fresh 0..n-1 index.
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X.columns)

    return X_train, X_test, y_train, y_test


In [None]:
def evaluate_model(model, X_test, y_test):
    """Print test accuracy and a classification report, then plot the confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``score`` and ``predict``
    X_test : feature matrix of the held-out rows
    y_test : true labels of the held-out rows

    Returns
    -------
    None. Output is printed and drawn with matplotlib.
    """
    model_acc = model.score(X_test, y_test)
    print("Test Accuracy: {:.2f}%".format(model_acc * 100))

    # Confusion matrix and per-class metrics
    y_true = np.array(y_test)
    y_pred = model.predict(X_test)

    cm = confusion_matrix(y_true, y_pred)
    clr = classification_report(y_true, y_pred)

    plt.figure(figsize=(8,8))
    sns.heatmap(cm, annot=True, vmin=0, fmt='g', cmap='Blues', cbar=False)
    # Tick labels assume exactly two classes, ordered on-time then delayed.
    plt.xticks(np.arange(2) + 0.5, ["ON TIME", "DELAYED"])
    plt.yticks(np.arange(2) + 0.5, ["ON TIME", "DELAYED"])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    # The report was previously computed and then discarded; show it.
    print("Classification Report:\n", clr)

In [None]:
#checking dtypes again
flights_df.dtypes

In [None]:
# preprocessing_inputs returns (X_train, X_test, y_train, y_test): the fourth value
# is the true held-out labels, not predictions, so bind it to `y_test`.
X_train, X_test, y_train, y_test = preprocessing_inputs(flights_df)

# Later cells still pass the held-out labels around under the name `y_pred`;
# keep that name as an alias so they keep working.
y_pred = y_test

In [None]:
X_train

In [None]:
y_train.value_counts()

## Logistic Regression and other Machine Learning

In [None]:
# Baseline logistic regression with scikit-learn's default settings.
# NOTE(review): the original author reported this fit struggling with the data
# volume on their machine; the cause is not established here.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# `y_pred` here is the held-out label vector returned by preprocessing_inputs,
# not model predictions; evaluate_model computes its own predictions internally.
evaluate_model(model, X_test, y_pred)

In [None]:
# Second logistic regression, this time with an explicit solver, a higher
# iteration cap and a fixed seed.
# LogisticRegression is already imported in the first cell; precision_score is
# not used in the visible cells.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
classifier = LogisticRegression(solver='lbfgs', max_iter=700, random_state=1242)
classifier

In [None]:
# Reference configuration only: this estimator is built and displayed but never
# assigned or fitted (the model trained below is `classifier`).
# Fixed relative to the pasted repr: penalty is 'l2' (letter L), not '12', and the
# stale multi_class='warn' placeholder is dropped because it is not a valid option.
LogisticRegression(C=1.0, class_weight="balanced", dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=500, penalty='l2',
   random_state=1234, solver='lbfgs', tol=0.0001, warm_start=False)

In [None]:
classifier.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training rows and on the held-out rows.
# `y_pred` holds the true held-out labels at this point, not predictions.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_pred)}")


In [None]:
# NOTE(review): three of these imports repeat the first cell, and RandomForestClassifier
# is not used in the visible cells.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# X_train / X_test were already standardized inside preprocessing_inputs, so this
# second scaler is fitted on already-scaled data.
# NOTE(review): confirm the extra pass is intended.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Train the EasyEnsembleClassifier (from imbalanced-learn) with 100 estimators
# and a fixed seed on the re-scaled training features.
from imblearn.ensemble import EasyEnsembleClassifier 
eec = EasyEnsembleClassifier(random_state=1,n_estimators=100).fit(X_train_scaled, y_train)


In [None]:

# Balanced accuracy of the EasyEnsemble model on the held-out rows.
# This cell rebinds `y_pred` to the model's predictions.
# NOTE(review): it also relies on `y_test` already existing in the kernel -
# confirm the split cell binds the held-out labels under that name.
from sklearn.metrics import balanced_accuracy_score
y_pred = eec.predict(X_test_scaled)
EE_Boost_bas = balanced_accuracy_score(y_test, y_pred)
print(EE_Boost_bas)