In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error

In [2]:
df_SD = pd.read_csv('SecurityDelay.csv')
df_CD = pd.read_csv('CarrierDelay_Proba.csv')
df_NASD = pd.read_csv('NASDelay_Proba.csv')

In [3]:
df_SD['SecurityDelayMean'].median()

2.7804391045174324

In [4]:
'''
Cette fonction prédit le temps de retard total en minutes et prend en paramètres:
- compagnie: le code de la compagnie aérienne uilisée pour le voyage
- mois : le mois pendant lequel le voyage a lieu. Ici le mois est utilisé par soucis simplicité de programmation sinon
         on devrait plutôt renseigner la date(jour, mois, année)
- temperature : la valeur de la temperature (en Fahrenheit)
- relativeHumidity : la proportion d'humidité (en Pourcentage)
- windSpeed: la vitesse du vent (en miles par heure)
- heure : l'heure de départ (en heure)
- aeroport: le code de l'aeroport de départ

Cette fonction fait une requête dans les differents fichiers csv pour recupérer les valeurs de SecurityDelay, 
CarrierDelay,NASDelay
'''


def prediction(compagnie,mois,temperature,relativeHumidity,windSpeed,visibility,heure,aeroport):
    WeatherDelay=0.2362*temperature+0.2887*relativeHumidity+0.3618*windSpeed+0.6123*visibility
    SecurityDelay = df_SD[(df_SD['Month'] == mois) & (df_SD['Origin'] == aeroport)]['SecurityDelayMean'].values[0]
    CarrierDelay = df_CD[(df_CD['Month'] == mois) & (df_CD['UniqueCarrier'] == compagnie)]['proba_CarrierDelay'].values[0]
    NASDelay = df_NASD[(df_NASD['Hour'] == heure)]['proba_NASDelay'].values[0]
    tempsDeRetardTotal=WeatherDelay+SecurityDelay+CarrierDelay+NASDelay
    
    return tempsDeRetardTotal,SecurityDelay,CarrierDelay

# Exemples de prédicion

In [5]:
prediction('AA','April',71,49.66,18.9,10,19,'IND')

(47.9509180600622, 3.171007927519818, 0.03963149564970343)

In [6]:
prediction('WN','January',16.6,70.99,8.1,10,17,'IND')

(48.85260531899966, 14.421997755331088, 0.03430967222771876)

# Comparaison temps de retard total réél et temps de retard total prédit

In [7]:
# Convertion des numeros des mois en chaine de caractères
def numToMonth(x):
    if x==1:
        return "January"
    if x==2:
        return "February"
    if x==3:
        return "March"
    if x==4:
        return "April"
    if x==5:
        return "May"
    if x==6:
        return "June"
    if x==7:
        return "July"
    if x==8:
        return "August"
    if x==9:
        return "September"
    if x==10:
        return "October"
    if x==11:
        return "November"
    if x==12:
        return "December"

In [8]:
df=pd.read_csv('Jointed_ficher_v1.csv')

In [9]:
df['Month']= df['Month'].apply(lambda x : numToMonth(x) )

In [10]:
df_RP=df.loc[:,['Start_Date','Month','Hour','UniqueCarrier','Origin','Temperature','Relative Humidity','Wind Speed','Visibility','ArrDelay']]
df_RP.head()

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0
1,2008-01-03 17:00:00,January,17,EV,IND,16.6,70.99,8.1,10.0,49.0
2,2008-01-03 17:00:00,January,17,9E,IND,16.6,70.99,8.1,10.0,201.0
3,2008-01-03 18:00:00,January,18,WN,IND,19.1,67.4,7.7,10.0,57.0
4,2008-01-03 18:00:00,January,18,NW,IND,19.1,67.4,7.7,10.0,24.0


In [11]:
df_RP['PredictWeatherDelay']=0.2362*df_RP['Temperature']+0.2887*df_RP['Relative Humidity']+0.3618*df_RP['Wind Speed']+0.6123*df_RP['Visibility']  

In [12]:
df_RP=pd.merge(df_RP, df_SD,on=['Month','Origin'])

In [13]:
df_RP=pd.merge(df_RP, df_CD,on=['Month','UniqueCarrier'])

In [14]:
df_RP=pd.merge(df_RP, df_NASD,on=['Hour'])

In [15]:
df_RP['PredictDelay']=df_RP['PredictWeatherDelay']+df_RP['SecurityDelayMean']+df_RP['proba_CarrierDelay']+df_RP['proba_NASDelay']
df_RP.head()

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay,...,Unnamed: 0_x,CarrierDelayMean,proba_company_carrier,proba_mois_carrier,proba_CarrierDelay,Unnamed: 0_y,NASDelayMean,proba_hour_Nas,proba_NASDelay,PredictDelay
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0,...,93,10.618113,0.07206,0.044841,0.03431,16,18.891582,0.049069,0.926985,48.852605
1,2008-01-05 17:00:00,January,17,WN,IND,41.1,80.29,10.2,10.0,51.0,...,93,10.618113,0.07206,0.044841,0.03431,16,18.891582,0.049069,0.926985,58.084195
2,2008-01-07 17:00:00,January,17,WN,IND,65.0,71.63,16.1,10.0,382.0,...,93,10.618113,0.07206,0.044841,0.03431,16,18.891582,0.049069,0.926985,63.363853
3,2008-01-13 17:00:00,January,17,WN,IND,32.1,90.95,13.9,0.7,23.0,...,93,10.618113,0.07206,0.044841,0.03431,16,18.891582,0.049069,0.926985,54.680207
4,2008-01-23 17:00:00,January,17,WN,IND,23.1,56.99,8.4,10.0,37.0,...,93,10.618113,0.07206,0.044841,0.03431,16,18.891582,0.049069,0.926985,46.454645


In [16]:
def reduceTime(x):
    if x<60:
        return x-15
    if (x>=60 and x<180):
        return x-30
    if x>=180:
        return x-60

In [17]:
def addTime(x):
    if x<60:
        return x+15
    if (x>=60 and x<180):
        return x+30
    if x>=180:
        return x+60

In [18]:
df_RP['PredictDelayLess']=df_RP['PredictDelay'].apply(lambda x : reduceTime(x))
df_RP['PredictDelayMore']=df_RP['PredictDelay'].apply(lambda x : addTime(x))

In [19]:
df_RP

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay,...,proba_company_carrier,proba_mois_carrier,proba_CarrierDelay,Unnamed: 0_y,NASDelayMean,proba_hour_Nas,proba_NASDelay,PredictDelay,PredictDelayLess,PredictDelayMore
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0,...,0.072060,0.044841,0.034310,16,18.891582,0.049069,0.926985,48.852605,33.852605,63.852605
1,2008-01-05 17:00:00,January,17,WN,IND,41.1,80.29,10.2,10.0,51.0,...,0.072060,0.044841,0.034310,16,18.891582,0.049069,0.926985,58.084195,43.084195,73.084195
2,2008-01-07 17:00:00,January,17,WN,IND,65.0,71.63,16.1,10.0,382.0,...,0.072060,0.044841,0.034310,16,18.891582,0.049069,0.926985,63.363853,33.363853,93.363853
3,2008-01-13 17:00:00,January,17,WN,IND,32.1,90.95,13.9,0.7,23.0,...,0.072060,0.044841,0.034310,16,18.891582,0.049069,0.926985,54.680207,39.680207,69.680207
4,2008-01-23 17:00:00,January,17,WN,IND,23.1,56.99,8.4,10.0,37.0,...,0.072060,0.044841,0.034310,16,18.891582,0.049069,0.926985,46.454645,31.454645,61.454645
5,2008-01-27 17:00:00,January,17,WN,IND,35.1,62.77,10.2,9.6,35.0,...,0.072060,0.044841,0.034310,16,18.891582,0.049069,0.926985,51.364051,36.364051,66.364051
6,2008-01-29 17:00:00,January,17,WN,IND,47.0,90.80,12.0,2.2,34.0,...,0.072060,0.044841,0.034310,16,18.891582,0.049069,0.926985,58.387312,43.387312,73.387312
7,2008-01-31 17:00:00,January,17,WN,IND,24.2,53.23,12.3,10.0,21.0,...,0.072060,0.044841,0.034310,16,18.891582,0.049069,0.926985,47.039973,32.039973,62.039973
8,2008-01-31 17:00:00,January,17,WN,IND,24.2,53.23,12.3,10.0,110.0,...,0.072060,0.044841,0.034310,16,18.891582,0.049069,0.926985,47.039973,32.039973,62.039973
9,2008-01-01 17:00:00,January,17,WN,IND,23.1,69.75,18.6,2.8,135.0,...,0.072060,0.044841,0.034310,16,18.891582,0.049069,0.926985,49.420257,34.420257,64.420257


# Calcul du score R2 (qualité de prédiction d'une regression linéaire)

In [20]:
moyenne=df_RP['ArrDelay'].mean()
moyenne

55.71428236849833

In [21]:

df_RP['SSEi']=(df_RP['PredictDelay']-moyenne)**2

df_RP['SSELessi']=(df_RP['PredictDelayLess']-moyenne)**2

df_RP['SSEMorei']=(df_RP['PredictDelayMore']-moyenne)**2

df_RP['SSTi']=(df_RP['ArrDelay']-moyenne)**2

df_RP.head()

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay,...,NASDelayMean,proba_hour_Nas,proba_NASDelay,PredictDelay,PredictDelayLess,PredictDelayMore,SSEi,SSELessi,SSEMorei,SSTi
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0,...,18.891582,0.049069,0.926985,48.852605,33.852605,63.852605,47.082612,477.932923,66.2323,471.510059
1,2008-01-05 17:00:00,January,17,WN,IND,41.1,80.29,10.2,10.0,51.0,...,18.891582,0.049069,0.926985,58.084195,43.084195,73.084195,5.616487,159.519099,301.713876,22.224458
2,2008-01-07 17:00:00,January,17,WN,IND,65.0,71.63,16.1,10.0,382.0,...,18.891582,0.049069,0.926985,63.363853,33.363853,93.363853,58.515936,499.541679,1417.490193,106462.36953
3,2008-01-13 17:00:00,January,17,WN,IND,32.1,90.95,13.9,0.7,23.0,...,18.891582,0.049069,0.926985,54.680207,39.680207,69.680207,1.069311,257.091563,195.04706,1070.224271
4,2008-01-23 17:00:00,January,17,WN,IND,23.1,56.99,8.4,10.0,37.0,...,18.891582,0.049069,0.926985,46.454645,31.454645,61.454645,85.740878,588.52999,32.951767,350.224365


In [22]:
r2_scoreMe=df_RP['SSEi'].sum()/df_RP['SSTi'].sum()
r2_scoreMe

0.0524313652541214

In [23]:
r2_scoreLess=df_RP['SSELessi'].sum()/df_RP['SSTi'].sum()
r2_scoreLess

0.23319178668144225

In [24]:
r2_scoreMore=df_RP['SSEMorei'].sum()/df_RP['SSTi'].sum()
r2_scoreMore

0.038124250077882

In [25]:
df_RP['ArrDelay']=df_RP['ArrDelay'].astype(float)

In [26]:
#df_RP['ArrDelay']

In [27]:
#type(list(df_RP['ArrDelay'].values))

In [28]:
r2_score(list(df_RP['ArrDelay'].values),list(df_RP['PredictDelay'].values))

-0.03680210767341996

In [29]:
r2_score(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values)

-0.21834517449996826

In [30]:
r2_score(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values)

-0.02171234709795189

In [31]:
print('Mean Absolute Error:', mean_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values))
print('Mean Squared Error:', mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values)))

Mean Absolute Error: 33.25529783464585
Mean Squared Error: 3587.9415927915124
Root Mean Squared Error: 59.899428985521325


In [32]:
median_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values)

22.269129679124674

In [33]:
print('Mean Absolute Error:', mean_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values))
print('Mean Squared Error:', mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values)))

Mean Absolute Error: 33.29600769886835
Mean Squared Error: 3053.309918629685
Root Mean Squared Error: 55.25676355551133


In [34]:
print('Mean Absolute Error:', mean_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values))
print('Mean Squared Error:', mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values)))

Mean Absolute Error: 38.171021156747706
Mean Squared Error: 3008.871625831252
Root Mean Squared Error: 54.8531824585525
