In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error

In [2]:
df_SD = pd.read_csv('SecurityDelay.csv')
df_CD = pd.read_csv('CarrierDelay.csv')
df_NASD = pd.read_csv('NASDelay.csv')

In [3]:
'''
Cette fonction prédit le temps de retard total en minutes et prend en paramètres:
- compagnie: le code de la compagnie aérienne uilisée pour le voyage
- mois : le mois pendant lequel le voyage a lieu. Ici le mois est utilisé par soucis simplicité de programmation sinon
         on devrait plutôt renseigner la date(jour, mois, année)
- temperature : la valeur de la temperature (en Fahrenheit)
- relativeHumidity : la proportion d'humidité (en Pourcentage)
- windSpeed: la vitesse du vent (en miles par heure)
- heure : l'heure de départ (en heure)
- aeroport: le code de l'aeroport de départ

Cette fonction fait une requête dans les differents fichiers csv pour recupérer les valeurs de SecurityDelay, 
CarrierDelay,NASDelay
'''


def prediction(compagnie,mois,temperature,relativeHumidity,windSpeed,visibility,heure,aeroport):
    WeatherDelay=0.0263*temperature+ 0.2482*relativeHumidity+1.2027*windSpeed-0.4090*visibility
    SecurityDelay = df_SD[(df_SD['Month'] == mois) & (df_SD['Origin'] == aeroport)]['SecurityDelayMean'].values[0]
    CarrierDelay = df_CD[(df_CD['Month'] == mois) & (df_CD['UniqueCarrier'] == compagnie)]['CarrierDelayMean'].values[0]
    NASDelay = df_NASD[(df_NASD['Hour'] == heure)]['NASDelayMean'].values[0]
    tempsDeRetardTotal=WeatherDelay+SecurityDelay+CarrierDelay+NASDelay
    
    return tempsDeRetardTotal,SecurityDelay,CarrierDelay

# Exemples de prédicion

In [4]:
prediction('AA','April',71,49.66,18.9,10,19,'IND')

(65.05077116190779, 0.04087591240875912, 17.61548277328796)

In [5]:
prediction('WN','January',16.6,70.99,8.1,10,17,'IND')

(53.542358076289645, 0.3244949494949495, 10.618112995934391)

# Comparaison temps de retard total réél et temps de retard total prédit

In [6]:
# Convertion des numeros des mois en chaine de caractères
def numToMonth(x):
    if x==1:
        return "January"
    if x==2:
        return "February"
    if x==3:
        return "March"
    if x==4:
        return "April"
    if x==5:
        return "May"
    if x==6:
        return "June"
    if x==7:
        return "July"
    if x==8:
        return "August"
    if x==9:
        return "September"
    if x==10:
        return "October"
    if x==11:
        return "November"
    if x==12:
        return "December"

In [7]:
df=pd.read_csv('Jointed_ficher_v1.csv')

In [8]:
df['Month']= df['Month'].apply(lambda x : numToMonth(x) )

In [9]:
df_RP=df.loc[:,['Start_Date','Month','Hour','UniqueCarrier','Origin','Temperature','Relative Humidity','Wind Speed','Visibility','ArrDelay']]
df_RP.head()

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0
1,2008-01-03 17:00:00,January,17,EV,IND,16.6,70.99,8.1,10.0,49.0
2,2008-01-03 17:00:00,January,17,9E,IND,16.6,70.99,8.1,10.0,201.0
3,2008-01-03 18:00:00,January,18,WN,IND,19.1,67.4,7.7,10.0,57.0
4,2008-01-03 18:00:00,January,18,NW,IND,19.1,67.4,7.7,10.0,24.0


In [10]:
df_RP['PredictWeatherDelay']=0.0214*df_RP['Temperature']+ 0.2499*df_RP['Relative Humidity']+1.1926*df_RP['Wind Speed']-0.3787*df_RP['Visibility']  

In [11]:
df_RP=pd.merge(df_RP, df_SD,on=['Month','Origin'])

In [12]:
df_RP=pd.merge(df_RP, df_CD,on=['Month','UniqueCarrier'])

In [13]:
df_RP=pd.merge(df_RP, df_NASD,on=['Hour'])

In [14]:
df_RP['PredictDelay']=df_RP['PredictWeatherDelay']+df_RP['SecurityDelayMean']+df_RP['CarrierDelayMean']+df_RP['NASDelayMean']
df_RP.head()

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay,PredictWeatherDelay,SecurityDelayMean,CarrierDelayMean,NASDelayMean,PredictDelay
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0,23.968701,0.324495,10.618113,18.891582,53.802891
1,2008-01-05 17:00:00,January,17,WN,IND,41.1,80.29,10.2,10.0,51.0,29.321531,0.324495,10.618113,18.891582,59.155721
2,2008-01-07 17:00:00,January,17,WN,IND,65.0,71.63,16.1,10.0,382.0,34.705197,0.324495,10.618113,18.891582,64.539387
3,2008-01-13 17:00:00,January,17,WN,IND,32.1,90.95,13.9,0.7,23.0,39.727395,0.324495,10.618113,18.891582,69.561585
4,2008-01-23 17:00:00,January,17,WN,IND,23.1,56.99,8.4,10.0,37.0,20.966981,0.324495,10.618113,18.891582,50.801171


In [15]:
def reduceTime(x):
    if x<60:
        return x-15
    if (x>=60 and x<180):
        return x-30
    if x>=180:
        return x-60

In [16]:
def addTime(x):
    if x<60:
        return x+15
    if (x>=60 and x<180):
        return x+30
    if x>=180:
        return x+60

In [17]:
df_RP['PredictDelayLess']=df_RP['PredictDelay'].apply(lambda x : reduceTime(x))
df_RP['PredictDelayMore']=df_RP['PredictDelay'].apply(lambda x : addTime(x))

In [18]:
df_RP

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay,PredictWeatherDelay,SecurityDelayMean,CarrierDelayMean,NASDelayMean,PredictDelay,PredictDelayLess,PredictDelayMore
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0,23.968701,0.324495,10.618113,18.891582,53.802891,38.802891,68.802891
1,2008-01-05 17:00:00,January,17,WN,IND,41.1,80.29,10.2,10.0,51.0,29.321531,0.324495,10.618113,18.891582,59.155721,44.155721,74.155721
2,2008-01-07 17:00:00,January,17,WN,IND,65.0,71.63,16.1,10.0,382.0,34.705197,0.324495,10.618113,18.891582,64.539387,34.539387,94.539387
3,2008-01-13 17:00:00,January,17,WN,IND,32.1,90.95,13.9,0.7,23.0,39.727395,0.324495,10.618113,18.891582,69.561585,39.561585,99.561585
4,2008-01-23 17:00:00,January,17,WN,IND,23.1,56.99,8.4,10.0,37.0,20.966981,0.324495,10.618113,18.891582,50.801171,35.801171,65.801171
5,2008-01-27 17:00:00,January,17,WN,IND,35.1,62.77,10.2,9.6,35.0,24.966363,0.324495,10.618113,18.891582,54.800553,39.800553,69.800553
6,2008-01-29 17:00:00,January,17,WN,IND,47.0,90.80,12.0,2.2,34.0,37.174780,0.324495,10.618113,18.891582,67.008970,37.008970,97.008970
7,2008-01-31 17:00:00,January,17,WN,IND,24.2,53.23,12.3,10.0,21.0,24.702037,0.324495,10.618113,18.891582,54.536227,39.536227,69.536227
8,2008-01-31 17:00:00,January,17,WN,IND,24.2,53.23,12.3,10.0,110.0,24.702037,0.324495,10.618113,18.891582,54.536227,39.536227,69.536227
9,2008-01-01 17:00:00,January,17,WN,IND,23.1,69.75,18.6,2.8,135.0,39.046865,0.324495,10.618113,18.891582,68.881055,38.881055,98.881055


# Calcul du score R2 (qualité de prédiction d'une regression linéaire)

In [19]:
moyenne=df_RP['ArrDelay'].mean()
df_RP['SSEi']=(df_RP['PredictDelay']-moyenne)**2

df_RP['SSELessi']=(df_RP['PredictDelayLess']-moyenne)**2

df_RP['SSEMorei']=(df_RP['PredictDelayMore']-moyenne)**2

df_RP['SSTi']=(df_RP['ArrDelay']-moyenne)**2

df_RP.head()

Unnamed: 0,Start_Date,Month,Hour,UniqueCarrier,Origin,Temperature,Relative Humidity,Wind Speed,Visibility,ArrDelay,...,SecurityDelayMean,CarrierDelayMean,NASDelayMean,PredictDelay,PredictDelayLess,PredictDelayMore,SSEi,SSELessi,SSEMorei,SSTi
0,2008-01-03 17:00:00,January,17,WN,IND,16.6,70.99,8.1,10.0,34.0,...,0.324495,10.618113,18.891582,53.802891,38.802891,68.802891,3.653417,285.995155,171.311678,471.510059
1,2008-01-05 17:00:00,January,17,WN,IND,41.1,80.29,10.2,10.0,51.0,...,0.324495,10.618113,18.891582,59.155721,44.155721,74.155721,11.8435,133.600339,340.086662,22.224458
2,2008-01-07 17:00:00,January,17,WN,IND,65.0,71.63,16.1,10.0,382.0,...,0.324495,10.618113,18.891582,64.539387,34.539387,94.539387,77.882473,448.376191,1507.388756,106462.36953
3,2008-01-13 17:00:00,January,17,WN,IND,32.1,90.95,13.9,0.7,23.0,...,0.324495,10.618113,18.891582,69.561585,39.561585,99.561585,191.747792,260.90963,1922.585955,1070.224271
4,2008-01-23 17:00:00,January,17,WN,IND,23.1,56.99,8.4,10.0,37.0,...,0.324495,10.618113,18.891582,50.801171,35.801171,65.801171,24.138663,396.532001,101.745324,350.224365


In [20]:
r2_scoreMe=df_RP['SSEi'].sum()/df_RP['SSTi'].sum()
r2_scoreMe

0.041028962588435886

In [21]:
r2_scoreLess=df_RP['SSELessi'].sum()/df_RP['SSTi'].sum()
r2_scoreLess

0.14410400930778225

In [22]:
r2_scoreMore=df_RP['SSEMorei'].sum()/df_RP['SSTi'].sum()
r2_scoreMore

0.2608245541041357

In [23]:
type(df_RP['PredictDelay'].values)

numpy.ndarray

In [24]:
from sklearn.metrics import mean_squared_error, r2_score
r2_score(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values)

-0.00016745852685851048

In [25]:
r2_score(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values)

-0.19816671521536056

In [26]:
r2_score(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values)

-0.12503884007340238

In [27]:
print('Mean Absolute Error:', mean_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values))
print('Mean Squared Error:', mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayLess'].values)))

Mean Absolute Error: 32.47673314330539
Mean Squared Error: 3313.1609434592
Root Mean Squared Error: 57.560063789568545


In [28]:
print('Mean Absolute Error:', mean_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values))
print('Mean Squared Error:', mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelay'].values)))

Mean Absolute Error: 36.25687969113768
Mean Squared Error: 2945.4234311535733
Root Mean Squared Error: 54.271755371957276


In [29]:
print('Mean Absolute Error:', mean_absolute_error(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values))
print('Mean Squared Error:', mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(df_RP['ArrDelay'].values,df_RP['PredictDelayMore'].values)))

Mean Absolute Error: 46.33020152443314
Mean Squared Error: 3528.517437091624
Root Mean Squared Error: 59.401325213261224
