In [1]:
import pandas as pd
import sklearn as sk
import numpy as np
import datetime
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('finalfinalCleaning.gzip',compression='gzip',index_col=0)

  mask |= (ar1 == a)


In [3]:
def parseAsTime(row):
    row.departure_schedule = datetime.datetime.strptime(row.departure_schedule,"%H:%M:%S").time()
    row.departure_actual = datetime.datetime.strptime(row.departure_actual,"%H:%M:%S").time()
    row.arrival_schedule = datetime.datetime.strptime(row.arrival_schedule,"%H:%M:%S").time()
    row.arrival_actual = datetime.datetime.strptime(row.arrival_actual,"%H:%M:%S").time()
    return row

In [4]:
df_ml = df.loc[0:9999].apply(parseAsTime,axis=1)
#df_ml = df.apply(parseAsTime,axis=1)

In [5]:
df_ml["flight_duration"] = np.nan
def addFlightDuration(row):
    now = datetime.datetime.now()
    row.flight_duration = datetime.datetime.combine(now, row.arrival_schedule) - datetime.datetime.combine(datetime.date.min, row.departure_schedule)
    row.flight_duration = (datetime.datetime.min + row.flight_duration).time()
    return row

In [6]:
df_ml = df_ml.apply(addFlightDuration, axis=1)

In [7]:
def parseAsInt(row):
    row.departure_schedule = row.departure_schedule.minute + (row.departure_schedule.hour*60)
    row.arrival_schedule = row.arrival_schedule.minute + (row.arrival_schedule.hour*60)
    row.flight_duration = row.flight_duration.minute + (row.flight_duration.hour*60)
    return row

In [8]:
df_ml = df_ml.apply(parseAsInt, axis=1)

In [9]:
if(type(df_ml.date[0]) == str):
    df_ml.date = pd.to_datetime(df_ml.date)

In [10]:
df_ml["month"] = np.nan
def addMonth(row):
    row.month = row.date.month
    return row

In [11]:
df_ml = df_ml.apply(addMonth, axis=1)

In [12]:
df_ml["day"] = np.nan
def addDay(row):
    row.day = row.date.day
    return row

In [13]:
df_ml = df_ml.apply(addDay, axis=1)

In [14]:
df_ml["year"] = np.nan
def addYear(row):
    row.year = row.date.year
    return row

In [15]:
df_ml = df_ml.apply(addYear, axis=1)

In [16]:
df_ml.date = df_ml.date.map(datetime.datetime.toordinal)

In [17]:
def meanPerAirport(group):
    return {'mean': round(group.mean(),2)}

airportMean = df.departure_delay.groupby(df_ml.departure_airport).apply(meanPerAirport).unstack()

In [18]:
df_ml["airport_mean"] = np.nan
def addAveragePerAirport(row):
    row["airport_mean"] = airportMean.loc[row.departure_airport].loc["mean"]
    return row

In [19]:
df_ml = df_ml.apply(addAveragePerAirport,axis=1)

In [20]:
def meanPerAirline(group):
    return {'mean': round(group.mean(),2)}

airlineMean = df.departure_delay.groupby(df_ml.airline).apply(meanPerAirline).unstack()

In [21]:
df_ml["airline_mean"] = np.nan
def addAveragePerAirline(row):
    row["airline_mean"] = airlineMean.loc[row.airline].loc["mean"]
    return row

In [22]:
df_ml = df_ml.apply(addAveragePerAirline,axis=1)

In [23]:
def doRidge(alphaParam, errorMargin):
    x = df_ml_input.drop("departure_delay",axis=1).values
    y = df_ml_input.departure_delay.values
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3, random_state = 1)
    ridge = Ridge(alpha = alphaParam, normalize=True)
    ridge.fit(x_train, y_train)
    pred = ridge.predict(x_test)
    correct = 0
    incorrect = 0
    maximum = len(y_test) - 1
    
    for i in range(0,maximum):
        if(abs(pred[i]-y_test[i])<errorMargin):
            correct += 1
        else:
            incorrect += 1
    
    return correct/maximum
 

In [56]:
#df_ml_input = df_ml.loc[:,["airline_mean","airport_mean","date","year","month","day","departure_schedule","arrival_schedule","departure_delay"]]
df_ml_input = df_ml.drop(["arrival_delay", "arrival_lon", "arrival_lat", "departure_lon", "departure_lat", "arrival_actual","departure_actual"],axis=1)

df_ml_input = pd.get_dummies(df_ml_input, drop_first=True)

df_ml_input.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 95 columns):
date                     10000 non-null int64
airline_code             10000 non-null int64
departure_schedule       10000 non-null int64
departure_delay          10000 non-null int64
arrival_schedule         10000 non-null int64
flight_duration          10000 non-null int64
month                    10000 non-null int64
day                      10000 non-null int64
year                     10000 non-null int64
airport_mean             10000 non-null float64
airline_mean             10000 non-null float64
airline_AS               10000 non-null uint8
airline_B6               10000 non-null uint8
airline_CO               10000 non-null uint8
airline_DL               10000 non-null uint8
airline_EV               10000 non-null uint8
airline_F9               10000 non-null uint8
airline_FL               10000 non-null uint8
airline_HA               10000 non-null uint8
airline_MQ    

In [57]:
#df_ml_input = df_ml.drop(["arrival_delay", "arrival_lon", "arrival_lat", "departure_lon", "departure_lat", "arrival_state", "departure_state","departure_airport", "airline","airline_code", "arrival_airport", "arrival_actual","departure_actual","flight_duration"],axis=1)
#df_ml_input = pd.get_dummies(df_ml_input, drop_first=True)

#df_ml_input.info()

In [58]:
#0.5065021673891297 at 10000 rows
#0.8558093015502584 at 20000 rows
doRidge(0,10)

0.5065021673891297

In [61]:
for error in range(1,21):
    scoreArr = []
    for a in (0,.1,.2,.3,.4,.5):
        scoreArr.append(doRidge(a,error))
    print(round(max(scoreArr)*100,2),"%", "is predicted correctly with an error margin of", error, "minutes")
    #print("ridge \n",max(scoreArr),"\n", "alpha:", scoreArr.index(max(scoreArr)),"\n", "errorMargin:", error, "\n", round(max(scoreArr)*100,2),"%")

4.97 % is predicted correctly with an error margin of 1 minutes
9.3 % is predicted correctly with an error margin of 2 minutes
13.9 % is predicted correctly with an error margin of 3 minutes
18.74 % is predicted correctly with an error margin of 4 minutes
23.74 % is predicted correctly with an error margin of 5 minutes
29.21 % is predicted correctly with an error margin of 6 minutes
35.01 % is predicted correctly with an error margin of 7 minutes
40.35 % is predicted correctly with an error margin of 8 minutes
45.82 % is predicted correctly with an error margin of 9 minutes
50.65 % is predicted correctly with an error margin of 10 minutes
55.52 % is predicted correctly with an error margin of 11 minutes
60.15 % is predicted correctly with an error margin of 12 minutes
65.19 % is predicted correctly with an error margin of 13 minutes
69.66 % is predicted correctly with an error margin of 14 minutes
73.79 % is predicted correctly with an error margin of 15 minutes
77.29 % is predicted co