In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error

In [2]:
df_SD = pd.read_csv('SecurityDelay.csv')
df_CD = pd.read_csv('CarrierDelay.csv')
df_NASD = pd.read_csv('NASDelay.csv')

In [3]:
df_SD['SecurityDelayMean'].median()

0.035495783455006716

In [4]:
'''
Cette fonction prédit le temps de retard total en minutes et prend en paramètres:
- compagnie: le code de la compagnie aérienne uilisée pour le voyage
- mois : le mois pendant lequel le voyage a lieu. Ici le mois est utilisé par soucis simplicité de programmation sinon
         on devrait plutôt renseigner la date(jour, mois, année)
- temperature : la valeur de la temperature (en Fahrenheit)
- relativeHumidity : la proportion d'humidité (en Pourcentage)
- windSpeed: la vitesse du vent (en miles par heure)
- heure : l'heure de départ (en heure)
- aeroport: le code de l'aeroport de départ

Cette fonction fait une requête dans les differents fichiers csv pour recupérer les valeurs de SecurityDelay, 
CarrierDelay,NASDelay
'''


def prediction(compagnie,mois,temperature,relativeHumidity,windSpeed,visibility,heure,aeroport):
    WeatherDelay=0.2362*temperature+0.2887*relativeHumidity+0.3618*windSpeed+0.6123*visibility
    SecurityDelay = df_SD[(df_SD['Month'] == mois) & (df_SD['Origin'] == aeroport)]['SecurityDelayMean'].values[0]
    CarrierDelay = df_CD[(df_CD['Month'] == mois) & (df_CD['UniqueCarrier'] == compagnie)]['CarrierDelayMean'].values[0]
    NASDelay = df_NASD[(df_NASD['Hour'] == heure)]['NASDelayMean'].values[0]
    tempsDeRetardTotal=WeatherDelay+SecurityDelay+CarrierDelay+NASDelay
    
    return tempsDeRetardTotal,SecurityDelay,CarrierDelay

# Exemples de prédicion

In [5]:
prediction('AA','April',71,49.66,18.9,10,19,'IND')

(76.28489116190781, 0.04087591240875912, 17.61548277328796)

In [6]:
prediction('WN','January',16.6,70.99,8.1,10,17,'IND')

(63.303503076289644, 0.3244949494949495, 10.618112995934391)

# Comparaison temps de retard total réél et temps de retard total prédit

In [7]:
# Convertion des numeros des mois en chaine de caractères
def numToMonth(x):
    if x==1:
        return "January"
    if x==2:
        return "February"
    if x==3:
        return "March"
    if x==4:
        return "April"
    if x==5:
        return "May"
    if x==6:
        return "June"
    if x==7:
        return "July"
    if x==8:
        return "August"
    if x==9:
        return "September"
    if x==10:
        return "October"
    if x==11:
        return "November"
    if x==12:
        return "December"

In [8]:
df=pd.read_csv('Jointed_ficher_v1.csv')

In [9]:
df['Month']= df['Month'].apply(lambda x : numToMonth(x) )

In [10]:
df_RP=df.loc[:,['Start_Date','Month','Hour','UniqueCarrier','Origin','Temperature','Relative Humidity','Wind Speed','Visibility','ArrDelay']]
df_RP.head()

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0
1,2008-01-03 17:00:00,January,17,EV,IND,16.6,70.99,8.1,10.0,49.0
2,2008-01-03 17:00:00,January,17,9E,IND,16.6,70.99,8.1,10.0,201.0
3,2008-01-03 18:00:00,January,18,WN,IND,19.1,67.4,7.7,10.0,57.0
4,2008-01-03 18:00:00,January,18,NW,IND,19.1,67.4,7.7,10.0,24.0


In [11]:
df_RP['PredictWeatherDelay']=0.2362*df_RP['Temperature']+0.2887*df_RP['Relative Humidity']+0.3618*df_RP['Wind Speed']+0.6123*df_RP['Visibility']  

In [12]:
df_RP=pd.merge(df_RP, df_SD,on=['Month','Origin'])

In [13]:
df_RP=pd.merge(df_RP, df_CD,on=['Month','UniqueCarrier'])

In [14]:
df_RP=pd.merge(df_RP, df_NASD,on=['Hour'])

In [15]:
df_RP['PredictDelay']=0.0037*df_RP['PredictWeatherDelay']+0.0002*df_RP['SecurityDelayMean']+0.0391*df_RP['CarrierDelayMean']+0.0481*df_RP['NASDelayMean']
df_RP.head()

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay,PredictWeatherDelay,SecurityDelayMean,CarrierDelayMean,NASDelayMean,PredictDelay
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0,33.469313,0.324495,10.618113,18.891582,1.447755
1,2008-01-05 17:00:00,January,17,WN,IND,41.1,80.29,10.2,10.0,51.0,42.700903,0.324495,10.618113,18.891582,1.481912
2,2008-01-07 17:00:00,January,17,WN,IND,65.0,71.63,16.1,10.0,382.0,47.980561,0.324495,10.618113,18.891582,1.501446
3,2008-01-13 17:00:00,January,17,WN,IND,32.1,90.95,13.9,0.7,23.0,39.296915,0.324495,10.618113,18.891582,1.469317
4,2008-01-23 17:00:00,January,17,WN,IND,23.1,56.99,8.4,10.0,37.0,31.071353,0.324495,10.618113,18.891582,1.438882


In [16]:
def reduceTime(x):
    if x<60:
        return x-15
    if (x>=60 and x<180):
        return x-30
    if x>=180:
        return x-60

In [17]:
def addTime(x):
    if x<60:
        return x+15
    if (x>=60 and x<180):
        return x+30
    if x>=180:
        return x+60

In [18]:
df_RP['PredictDelayLess']=df_RP['PredictDelay'].apply(lambda x : reduceTime(x))
df_RP['PredictDelayMore']=df_RP['PredictDelay'].apply(lambda x : addTime(x))

In [19]:
df_RP

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay,PredictWeatherDelay,SecurityDelayMean,CarrierDelayMean,NASDelayMean,PredictDelay,PredictDelayLess,PredictDelayMore
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0,33.469313,0.324495,10.618113,18.891582,1.447755,-13.552245,16.447755
1,2008-01-05 17:00:00,January,17,WN,IND,41.1,80.29,10.2,10.0,51.0,42.700903,0.324495,10.618113,18.891582,1.481912,-13.518088,16.481912
2,2008-01-07 17:00:00,January,17,WN,IND,65.0,71.63,16.1,10.0,382.0,47.980561,0.324495,10.618113,18.891582,1.501446,-13.498554,16.501446
3,2008-01-13 17:00:00,January,17,WN,IND,32.1,90.95,13.9,0.7,23.0,39.296915,0.324495,10.618113,18.891582,1.469317,-13.530683,16.469317
4,2008-01-23 17:00:00,January,17,WN,IND,23.1,56.99,8.4,10.0,37.0,31.071353,0.324495,10.618113,18.891582,1.438882,-13.561118,16.438882
5,2008-01-27 17:00:00,January,17,WN,IND,35.1,62.77,10.2,9.6,35.0,35.980759,0.324495,10.618113,18.891582,1.457047,-13.542953,16.457047
6,2008-01-29 17:00:00,January,17,WN,IND,47.0,90.80,12.0,2.2,34.0,43.004020,0.324495,10.618113,18.891582,1.483033,-13.516967,16.483033
7,2008-01-31 17:00:00,January,17,WN,IND,24.2,53.23,12.3,10.0,21.0,31.656681,0.324495,10.618113,18.891582,1.441048,-13.558952,16.441048
8,2008-01-31 17:00:00,January,17,WN,IND,24.2,53.23,12.3,10.0,110.0,31.656681,0.324495,10.618113,18.891582,1.441048,-13.558952,16.441048
9,2008-01-01 17:00:00,January,17,WN,IND,23.1,69.75,18.6,2.8,135.0,34.036965,0.324495,10.618113,18.891582,1.449855,-13.550145,16.449855


# Calcul du score R2 (qualité de prédiction d'une regression linéaire)

In [20]:
moyenne=df_RP['ArrDelay'].mean()
moyenne

55.71428236849833

In [21]:

df_RP['SSEi']=(df_RP['PredictDelay']-moyenne)**2

df_RP['SSELessi']=(df_RP['PredictDelayLess']-moyenne)**2

df_RP['SSEMorei']=(df_RP['PredictDelayMore']-moyenne)**2

df_RP['SSTi']=(df_RP['ArrDelay']-moyenne)**2

df_RP.head()

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay,...,SecurityDelayMean,CarrierDelayMean,NASDelayMean,PredictDelay,PredictDelayLess,PredictDelayMore,SSEi,SSELessi,SSEMorei,SSTi
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0,...,0.324495,10.618113,18.891582,1.447755,-13.552245,16.447755,2944.856028,4797.851859,1541.860197,471.510059
1,2008-01-05 17:00:00,January,17,WN,IND,41.1,80.29,10.2,10.0,51.0,...,0.324495,10.618113,18.891582,1.481912,-13.518088,16.481912,2941.150044,4793.121168,1539.178919,22.224458
2,2008-01-07 17:00:00,January,17,WN,IND,65.0,71.63,16.1,10.0,382.0,...,0.324495,10.618113,18.891582,1.501446,-13.498554,16.501446,2939.031595,4790.416678,1537.646513,106462.36953
3,2008-01-13 17:00:00,January,17,WN,IND,32.1,90.95,13.9,0.7,23.0,...,0.324495,10.618113,18.891582,1.469317,-13.530683,16.469317,2942.516289,4794.865256,1540.167322,1070.224271
4,2008-01-23 17:00:00,January,17,WN,IND,23.1,56.99,8.4,10.0,37.0,...,0.324495,10.618113,18.891582,1.438882,-13.561118,16.438882,2945.819061,4799.081065,1542.557057,350.224365


In [22]:
r2_scoreMe=df_RP['SSEi'].sum()/df_RP['SSTi'].sum()
r2_scoreMe

0.9951457353347839

In [23]:
r2_scoreLess=df_RP['SSELessi'].sum()/df_RP['SSTi'].sum()
r2_scoreLess

1.6230185372273658

In [24]:
r2_scoreMore=df_RP['SSEMorei'].sum()/df_RP['SSTi'].sum()
r2_scoreMore

0.520077909316526

In [25]:
df_RP['ArrDelay']=df_RP['ArrDelay'].astype(float)

In [26]:
#df_RP['ArrDelay']

In [27]:
#type(list(df_RP['ArrDelay'].values))

In [28]:
r2_score(list(df_RP['ArrDelay'].values),list(df_RP['PredictDelay'].values))

-0.9947753676569624

In [29]:
r2_score(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values)

-1.6226481695495432

In [30]:
r2_score(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values)

-0.5197075416387034

In [31]:
print('Mean Absolute Error:', mean_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values))
print('Mean Squared Error:', mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values)))

Mean Absolute Error: 69.13472082306275
Mean Squared Error: 7723.51600165145
Root Mean Squared Error: 87.88353657910821


In [32]:
median_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values)

35.252700410203786

In [33]:
print('Mean Absolute Error:', mean_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values))
print('Mean Squared Error:', mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values)))

Mean Absolute Error: 54.134720823062764
Mean Squared Error: 5874.474376959567
Root Mean Squared Error: 76.64511972043339


In [34]:
print('Mean Absolute Error:', mean_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values))
print('Mean Squared Error:', mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values)))

Mean Absolute Error: 39.2955574298468
Mean Squared Error: 4475.432752267682
Root Mean Squared Error: 66.89867526541674
