In [3]:
# Constants: directory holding the flight CSV files and the glob pattern
# that matches every CSV file inside it.
BASIC_PATH = '../Data/'
ALL_FILES = BASIC_PATH + '*.csv'

from math import radians, sin, cos, acos


# Two-letter postal abbreviations of the 50 US states (no "DC", no territories).
# deleteWrongStates keeps only records whose departure and arrival state
# appear in this list.
usaStates = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
]

In [4]:
# custom helper functions

import pandas as pd
import numpy as np
import glob as glob

def readAllFiles():
    """Load every CSV file matching ``ALL_FILES`` into one DataFrame.

    Each file is read with its first column as the index. The files are
    concatenated in sorted path order, because ``glob`` lists matches in
    arbitrary order and the row order of the result would otherwise differ
    between runs or machines.

    Returns:
        pandas.DataFrame: the rows of all matching files stacked vertically.
            The index labels of the individual files are kept, so labels may
            repeat across files.

    Raises:
        ValueError: if no file matches ``ALL_FILES``.
    """
    paths = sorted(glob.glob(ALL_FILES))
    if not paths:
        # pd.concat([]) would also raise ValueError, but without naming the pattern.
        raise ValueError("No CSV files found matching " + ALL_FILES)

    return pd.concat([pd.read_csv(path, index_col=0) for path in paths])

def readOneFile(url):
    """Read a single CSV file into a DataFrame.

    Args:
        url: path or URL of the CSV file; handed unchanged to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: the file contents, with the first column as the index.
    """
    frame = pd.read_csv(url, index_col=0)
    return frame

def exportDfToCsvFiles(df):
    """Write ``df`` to numbered CSV files of at most one million rows each.

    The files are named ``flights_2010_<n>.csv`` (n = 0, 1, 2, ...) and are
    created inside ``BASIC_PATH``. An empty frame produces no files.

    NOTE(review): the output lands in ``BASIC_PATH``, the same directory whose
    ``*.csv`` files ``readAllFiles`` loads -- confirm that picking these files
    up again on the next run is intended.

    Args:
        df: DataFrame to export; rows are written in their current order.
    """
    chunk_size = 1000000
    total_rows = len(df)

    for part, first_row in enumerate(range(0, total_rows, chunk_size)):
        # The last chunk may be shorter: never slice past the end of the frame.
        last_row = min(first_row + chunk_size, total_rows)
        target = BASIC_PATH +  'flights_2010_' + str(part) + '.csv'

        df.iloc[first_row:last_row, 0:].to_csv(target, sep=',')
        
def deleteWrongStates(df, valid_states=None):
    """Keep only the records whose departure and arrival state are valid.

    Rows are selected with a boolean mask, i.e. by position. Removing rows by
    index label instead would also remove valid rows that happen to share a
    label with an invalid one (possible whenever the index is not unique).
    Rows with a missing state are removed as well, since a missing value is
    not a member of ``valid_states``.

    Progress counts are printed (in Dutch, as in the rest of the notebook).

    Args:
        df: DataFrame with ``departure_state`` and ``arrival_state`` columns.
            It is not modified.
        valid_states: iterable of accepted state codes. Defaults to the
            module-level ``usaStates`` list.

    Returns:
        pandas.DataFrame: a new frame containing only the accepted rows.
    """
    if valid_states is None:
        valid_states = usaStates

    print("Aantal records:", len(df))

    departure_ok = df['departure_state'].isin(valid_states)
    print("Aantal records na verwijderen foute vertrek staat:", int(departure_ok.sum()))

    both_ok = departure_ok & df['arrival_state'].isin(valid_states)
    print("Aantal records na verwijderen foute aankomst staat:", int(both_ok.sum()))

    # Explicit copy so callers can assign columns on the result without
    # pandas' SettingWithCopyWarning.
    return df[both_ok].copy()

def convertColumnTypes(df):
    """Cast the time columns to int and the delay columns to float.

    The conversion is applied to ``df`` itself (the columns are reassigned on
    the frame that was passed in), and that same frame is returned.

    Args:
        df: DataFrame holding the scheduled/actual departure and arrival
            columns and the two delay columns.

    Returns:
        pandas.DataFrame: ``df``, with the six columns converted.
    """
    target_types = (
        ('departure_schedule', int),
        ('departure_delay', float),
        ('arrival_schedule', int),
        ('arrival_delay', float),
        ('arrival_actual', int),
        ('departure_actual', int),
    )

    for column, kind in target_types:
        # Attribute-style column access (df.<column>), spelled with
        # getattr/setattr so it can be driven by the table above.
        setattr(df, column, getattr(df, column).astype(kind))

    return df

def dropMoreAdvancedDuplicates(df):
    """Collapse records that describe the same scheduled flight into one row.

    Rows that agree on date, airline, both airports (including state and
    coordinates) and both scheduled times are grouped together; every other
    column is replaced by its mean over the group. The column types are then
    restored with ``convertColumnTypes``.

    Args:
        df: DataFrame with the grouping columns listed below plus an
            ``index`` column (``cleanData`` creates it with ``reset_index``).

    Returns:
        pandas.DataFrame: one row per distinct scheduled flight, without the
            ``index`` column.
    """
    flight_key = [
        'date', 'airline', 'airline_code',
        'departure_airport', 'departure_state', 'departure_lat', 'departure_lon', 'departure_schedule',
        'arrival_airport', 'arrival_state', 'arrival_lat', 'arrival_lon', 'arrival_schedule',
    ]

    merged = df.groupby(flight_key).mean().reset_index()
    merged = convertColumnTypes(merged)

    # The old row index was averaged like any other non-key column and no
    # longer means anything, so drop it.
    return merged.drop(['index'], axis=1)

def calcDistance(df):
    """Compute the great-circle distance in kilometres for every row of ``df``.

    Applies the spherical law of cosines with a radius of 6371.01 (mean Earth
    radius in km) -- the same formula as ``calcTheDistance`` -- but evaluates
    it on whole columns with NumPy instead of building one pandas Series per
    row with ``iterrows``.

    Args:
        df: DataFrame with ``arrival_lat``, ``arrival_lon``,
            ``departure_lat`` and ``departure_lon`` columns in degrees
            (numeric, or anything ``astype(float)`` accepts).

    Returns:
        list of float: one distance per row, in row order. A row with a
            missing coordinate yields NaN.

    Raises:
        KeyError: if one of the four coordinate columns is missing.
    """
    arr_lat = np.radians(df["arrival_lat"].astype(float).values)
    arr_lon = np.radians(df["arrival_lon"].astype(float).values)
    dep_lat = np.radians(df["departure_lat"].astype(float).values)
    dep_lon = np.radians(df["departure_lon"].astype(float).values)

    cos_angle = (np.sin(arr_lat) * np.sin(dep_lat)
                 + np.cos(arr_lat) * np.cos(dep_lat) * np.cos(arr_lon - dep_lon))

    # Floating-point rounding can leave the cosine marginally outside [-1, 1]
    # for (nearly) identical or antipodal points; arccos is undefined there.
    cos_angle = np.clip(cos_angle, -1.0, 1.0)

    return (6371.01 * np.arccos(cos_angle)).tolist()

def calcTheDistance(el):
    """Great-circle distance in kilometres between a record's two airports.

    Uses the spherical law of cosines with a radius of 6371.01 (mean Earth
    radius in km).

    Args:
        el: mapping-like record (dict, pandas Series, ...) providing
            ``arrival_lat``, ``arrival_lon``, ``departure_lat`` and
            ``departure_lon`` in degrees, as anything ``float()`` accepts.

    Returns:
        float: the distance in kilometres; NaN if a coordinate is NaN.
    """
    slat = radians(float(el["arrival_lat"]))
    slon = radians(float(el["arrival_lon"]))
    elat = radians(float(el["departure_lat"]))
    elon = radians(float(el["departure_lon"]))

    cos_angle = sin(slat)*sin(elat) + cos(slat)*cos(elat)*cos(slon - elon)

    # Rounding can push the value marginally outside [-1, 1] for (nearly)
    # identical or antipodal points, and acos() raises "math domain error"
    # there. Plain comparisons are used so that NaN passes through unchanged.
    if cos_angle > 1.0:
        cos_angle = 1.0
    elif cos_angle < -1.0:
        cos_angle = -1.0

    return 6371.01 * acos(cos_angle)


def cleanData(df):
    """Run the full cleaning pipeline on the raw flight data.

    Steps, in order: drop exact duplicate rows, move the old index into an
    ``index`` column, drop rows with missing values, drop rows with a non-US
    departure or arrival state, fix the column types, merge records that
    describe the same scheduled flight, and add a ``distance`` column.
    Record counts are printed after each step (in Dutch).

    Args:
        df: raw DataFrame as returned by ``readAllFiles``.

    Returns:
        pandas.DataFrame: the cleaned frame with an extra ``distance`` column
            (great-circle distance between the two airports, from
            ``calcDistance``).
    """
    print("Aantal records om te beginnen:", len(df))

    cleaned = df.drop_duplicates()
    print("Aantal records na verwijderen van dubbels:", len(cleaned))

    # reset_index() keeps the old index as a column; dropMoreAdvancedDuplicates
    # removes that column again later on.
    cleaned = cleaned.reset_index()
    print("Aantal records na nieuwe index:", len(cleaned))

    cleaned = cleaned.dropna()
    print("Aantal records na verwijderen lege waarden:", len(cleaned))

    cleaned = deleteWrongStates(cleaned)
    cleaned = convertColumnTypes(cleaned)

    cleaned = dropMoreAdvancedDuplicates(cleaned)
    print("Aantal records na het verwijderen van de verborgen dubbels:", len(cleaned))

    # Distances must come from the frame being built here, not from a
    # notebook-level variable: that global does not exist on a fresh kernel
    # and may hold stale rows otherwise.
    cleaned['distance'] = calcDistance(cleaned)

    return cleaned

In [5]:
# Load every raw CSV file into one frame (df), then run the cleaning
# pipeline on it; df2 is the cleaned result used by the export cells below.
df = readAllFiles()
df2 = cleanData(df)

Aantal records om te beginnen: 10642028
Aantal records na verwijderen van dubbels: 10642028
Aantal records na nieuwe index: 10642028
Aantal records na verwijderen lege waarden: 10642028
Aantal records: 10642028
Aantal records na verwijderen foute vertrek staat: 10642028
Aantal records na verwijderen foute aankomst staat: 10642028
Aantal records na het verwijderen van de verborgen dubbels: 10642028


In [31]:
# ML model 1: linear regression of arrival delay on date, airline and airports
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
enc = LabelEncoder()

# NOTE(review): the features come from the raw frame `df`, not from the
# cleaned `df2` -- confirm that this is intended.
X = df.loc[:, ['date', 'airline', 'departure_airport', 'arrival_airport']]

# Replace every categorical feature by integer codes. The single encoder is
# refitted for each column, so afterwards it only remembers the last one.
for feature in ['date', 'airline', 'departure_airport', 'arrival_airport']:
    X[feature] = enc.fit_transform(X[feature])

y = df.loc[:, ['arrival_delay']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = LinearRegression()
model.fit(X_train, y_train)

y_predict = model.predict(X_test)

# Fitted coefficients, one per feature
print('Coefficients: \n', model.coef_)

# Mean squared error on the held-out set
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_predict))

# R^2 (coefficient of determination) on the held-out set: 1 is a perfect prediction
print('Variance score: %.5f' % r2_score(y_test, y_predict))


Coefficients: 
 [[-0.00441251 -0.01723296 -0.00230018 -0.00156591]]
Mean squared error: 1329.10
Variance score: 0.00061
0.000613246714151


In [None]:
# ML model 2: random forest regression of arrival delay on the same features
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
enc = LabelEncoder()

# NOTE(review): the features come from the raw frame `df`, not from the
# cleaned `df2` -- confirm that this is intended.
X = df.loc[:, ['date', 'airline', 'departure_airport', 'arrival_airport']]

# Replace every categorical feature by integer codes. The single encoder is
# refitted for each column, so afterwards it only remembers the last one.
for feature in ['date', 'airline', 'departure_airport', 'arrival_airport']:
    X[feature] = enc.fit_transform(X[feature])

# Single target as a 1-D Series: passing an (n, 1) DataFrame makes
# RandomForestRegressor.fit emit a column-vector DataConversionWarning.
y = df['arrival_delay']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

y_predict = model.predict(X_test)

# Mean squared error on the held-out set
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_predict))

# R^2 (coefficient of determination) on the held-out set: 1 is a perfect prediction
print('Variance score: %.5f' % r2_score(y_test, y_predict))




In [28]:
# Write the cleaned frame to disk as numbered CSV files (see exportDfToCsvFiles).
exportDfToCsvFiles(df2)

In [7]:
# Export the whole cleaned frame to one JSON file, as a list of row records.
df2.to_json('flights_2010_file_time.json', orient='records')

In [4]:
# Check for records whose scheduled departure time is later than the scheduled arrival time.
# Because the arrival date is not stored, we cannot tell whether such a flight arrives the next day or the record is wrong.
# Therefore we additionally require that the flight made up more than 80 minutes (arrival_delay - departure_delay < -80).
# No extreme values were found, so no records were removed.
df2[(df2.departure_schedule > df2.arrival_schedule) & ((df2.arrival_delay - df2.departure_delay) < -80)]

Unnamed: 0,date,airline,airline_code,departure_airport,departure_state,departure_lat,departure_lon,departure_schedule,arrival_airport,arrival_state,arrival_lat,arrival_lon,arrival_schedule,departure_actual,departure_delay,arrival_actual,arrival_delay


In [25]:
# Select the rows of the cleaned frame per airline. Each mask is built from
# df2 itself: a mask taken from the raw frame `df` carries a different index
# and would pick the wrong rows or fail to align.

# selecting SouthWest Airlines
df_southWest = df2.loc[df2['airline'] == "WN"]
# selecting Alaska Airlines
df_alaska = df2.loc[df2['airline'] == "AS"]
# selecting Hawaiian Airlines
df_hawaiian = df2.loc[df2['airline'] == "HA"]

In [28]:
# Reduce each airline frame to the date plus the departure and arrival
# location columns; the same column list is used for all three airlines.
route_columns = [
    'date',
    'departure_airport', 'departure_state', 'departure_lat', 'departure_lon',
    'arrival_airport', 'arrival_state', 'arrival_lat', 'arrival_lon',
]

df_southWest = df_southWest[route_columns]
df_alaska = df_alaska[route_columns]
df_hawaiian = df_hawaiian[route_columns]

In [29]:
# Export the SouthWest Airlines selection to JSON, as a list of row records.
df_southWest.to_json('SouthWest_Airlines.json', orient='records')

In [30]:
# Export the Alaska Airlines selection to JSON, as a list of row records.
# The frame written must be df_alaska, so the file matches its name.
df_alaska.to_json('Alaska_Airlines.json', orient='records')

In [31]:
# Export the Hawaiian Airlines selection to JSON, as a list of row records.
# The frame written must be df_hawaiian, so the file matches its name.
df_hawaiian.to_json('Hawaiian_Airlines.json', orient='records')