# Tasca: Aprenentatge Supervisat - Regressions

In [1]:
# Càrrega de llibreries
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

import scipy.stats as stats
import statsmodels.api as sm
    
pd.set_option('display.max_columns', None)

In [2]:
# Lectura de dades
dfl = pd.read_csv('DelayedFlights.csv')
dfl[:3]

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,N712SW,128.0,150.0,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,N,0,,,,,
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,N772SW,128.0,145.0,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,N,0,,,,,
2,2,2008,1,3,4,628.0,620,804.0,750,WN,448,N428WN,96.0,90.0,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,N,0,,,,,


In [3]:
# Es crea una copia del dataset
df = dfl.copy()

In [4]:
# S'elimina la primera columna
df = df.drop(df.columns[0],axis=1)
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,N712SW,128.0,150.0,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,N,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,N772SW,128.0,145.0,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,N,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,N428WN,96.0,90.0,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,N,0,,,,,
3,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,N464WN,90.0,90.0,77.0,34.0,34.0,IND,BWI,515,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0
4,2008,1,3,4,1940.0,1915,2121.0,2110,WN,378,N726SW,101.0,115.0,87.0,11.0,25.0,IND,JAX,688,4.0,10.0,0,N,0,,,,,


In [5]:
# Dimensió del dataset
df.shape

(1936758, 29)

<span style='color:blue;font-size:20px'> <b> Depuració de dades </b> </span>

In [6]:
# Quantitat de null de cada variable
df.isna().sum()

Year                      0
Month                     0
DayofMonth                0
DayOfWeek                 0
DepTime                   0
CRSDepTime                0
ArrTime                7110
CRSArrTime                0
UniqueCarrier             0
FlightNum                 0
TailNum                   5
ActualElapsedTime      8387
CRSElapsedTime          198
AirTime                8387
ArrDelay               8387
DepDelay                  0
Origin                    0
Dest                      0
Distance                  0
TaxiIn                 7110
TaxiOut                 455
Cancelled                 0
CancellationCode          0
Diverted                  0
CarrierDelay         689270
WeatherDelay         689270
NASDelay             689270
SecurityDelay        689270
LateAircraftDelay    689270
dtype: int64

In [7]:
# S'eliminen les columnes amb més de 500000 valors nuls
df_2 = df.drop(['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'], axis=1)

In [8]:
# S'eliminen totes les observacions amb algun registre nul
df_2 = df_2.dropna(axis=0, how='any')
df_2.isna().sum()

Year                 0
Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
CRSDepTime           0
ArrTime              0
CRSArrTime           0
UniqueCarrier        0
FlightNum            0
TailNum              0
ActualElapsedTime    0
CRSElapsedTime       0
AirTime              0
ArrDelay             0
DepDelay             0
Origin               0
Dest                 0
Distance             0
TaxiIn               0
TaxiOut              0
Cancelled            0
CancellationCode     0
Diverted             0
dtype: int64

In [9]:
df_2.shape

(1928368, 24)

<span style='color:blue;font-size:20px'> <b> Eliminació de variables </b> </span>

In [10]:
# Descriptiu de les variables numèriques
df_2.describe().round(3)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
count,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0
mean,2008.0,6.108,15.752,3.985,1518.648,1467.717,1610.242,1634.196,2184.292,133.306,134.198,108.277,42.2,43.092,764.949,6.811,18.217,0.0,0.0
std,0.0,3.481,8.777,1.996,450.436,424.728,548.001,464.629,1944.448,72.06,71.233,68.643,56.784,53.266,573.886,5.268,14.308,0.0,0.0
min,2008.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,14.0,-21.0,0.0,-109.0,6.0,11.0,0.0,0.0,0.0,0.0
25%,2008.0,3.0,8.0,2.0,1203.0,1135.0,1316.0,1325.0,611.0,80.0,82.0,58.0,9.0,12.0,338.0,4.0,10.0,0.0,0.0
50%,2008.0,6.0,16.0,4.0,1545.0,1510.0,1715.0,1705.0,1543.0,116.0,116.0,90.0,24.0,24.0,606.0,6.0,14.0,0.0,0.0
75%,2008.0,9.0,23.0,6.0,1900.0,1815.0,2030.0,2014.0,3423.0,165.0,165.0,137.0,56.0,53.0,997.0,8.0,21.0,0.0,0.0
max,2008.0,12.0,31.0,7.0,2400.0,2359.0,2400.0,2359.0,9741.0,1114.0,660.0,1091.0,2461.0,2467.0,4962.0,240.0,422.0,0.0,0.0


In [11]:
# Descriptiu de les variables categòriques
df_2.describe(include='object')

Unnamed: 0,UniqueCarrier,TailNum,Origin,Dest,CancellationCode
count,1928368,1928368,1928368,1928368,1928368
unique,20,5360,303,302,1
top,WN,N325SW,ATL,ORD,N
freq,376201,961,131213,108265,1928368


In [12]:
# La variable 'FlightNum' es tracta d'una variable categòrica encara que s'hagi importat com a numèrica.
# Es calcula el nombre de categories de la variable
len(df_2['FlightNum'].unique())

7498

In [13]:
# S'eliminen les variables categòriques DayofMonth, TailNum, Origin, Dest i FlightNum per tenir moltes categories.
df_2 = df_2.drop(['TailNum', 'Origin', 'Dest', 'DayofMonth','FlightNum'], axis=1)

In [14]:
# S'eliminen les variables Cancelled, Diverted, CancellationCode i Year porquè només tenen un únic valor.
df_2 = df_2.drop(['Year', 'Cancelled', 'Diverted', 'CancellationCode'], axis=1)

In [15]:
# S'eliminen les variables DepTime, CRSDepTime, ArrTime i CRSArrTime perquè indiquen l'hora d'un esdeveniment.
df_2 = df_2.drop(['DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime'], axis=1)

In [16]:
# S'eliminen les variables CRSElapsedTime i DayOfWeek perquè es considera que no serveixen per predir la variable objectiu 'ArrDelay'.
df_2 = df_2.drop(['DayOfWeek', 'CRSElapsedTime'], axis=1)

In [17]:
# Correlació lineal entre ArrDelay i ActualElapsedTime
df_2['ArrDelay'].corr(df_2['ActualElapsedTime'])

0.06813024884504432

In [18]:
# S'elimina la variable ActualElapsedTime perquè no està correlacionada linealment amb la variable objectiu ArrDelay
df_2 = df_2.drop(['ActualElapsedTime'], axis=1)

In [19]:
# Es crea una copia
df_3 = df_2.copy()

In [20]:
# Dataset fins el moment
df_3.head()

Unnamed: 0,Month,UniqueCarrier,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut
0,1,WN,116.0,-14.0,8.0,810,4.0,8.0
1,1,WN,113.0,2.0,19.0,810,5.0,10.0
2,1,WN,76.0,14.0,8.0,515,3.0,17.0
3,1,WN,77.0,34.0,34.0,515,3.0,10.0
4,1,WN,87.0,11.0,25.0,688,4.0,10.0


<span style='color:blue;font-size:20px'> <b> Selecció de la mostra </b> </span>

In [21]:
df_3.shape

(1928368, 8)

<span style='color:blue;font-size:15px'> Com que la base de dades és massa gran, es decideix obtenir una mostra de 10000 observacions. El métode de mostreig escollit és l'aleatori simple, encara que caldria estudiar si alguna de les variables restants serviria per a estratificar la mostra. </span>

In [22]:
# Mostra aleatoria simple de 10000 observacions
k = 10000
dfs = df_3.sample(k,random_state=1234)

<span style='color:blue;font-size:20px'> <b> Creació de noves variables </b> </span>

In [23]:
# Variable Velocitat Mitja, que és la divisió entre la distancia (en milles) i el temps (en hores).
dfs['VelMitja'] = round(dfs['Distance']/(dfs['AirTime']/60),2)
dfs.head()

Unnamed: 0,Month,UniqueCarrier,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,VelMitja
276255,2,US,98.0,21.0,31.0,728,7.0,12.0,445.71
1253643,7,CO,191.0,0.0,6.0,1417,8.0,24.0,445.13
607829,4,XE,30.0,72.0,47.0,143,4.0,65.0,286.0
1059724,6,AS,33.0,45.0,34.0,95,3.0,17.0,172.73
407269,3,WN,72.0,18.0,21.0,397,3.0,12.0,330.83


In [24]:
# S'eliminen les variables AirTime i Distance.
dfs = dfs.drop(['AirTime','Distance'],axis=1)

In [25]:
# Es crea la variable Trimestre, que dividieix els mesos en quartre parts.
dfs['Trimestre'] = np.where((dfs['Month'] == 1) | (dfs['Month'] == 2) | (dfs['Month'] == 3), 'Trim_1', 
                             np.where((dfs['Month'] == 4) | (dfs['Month'] == 5) | (dfs['Month'] == 6), 'Trim_2', 
                                      np.where((dfs['Month'] == 7) | (dfs['Month'] == 8) | (dfs['Month'] == 9), 'Trim_3', 'Trim_4')))
dfs[['Month','Trimestre']]

Unnamed: 0,Month,Trimestre
276255,2,Trim_1
1253643,7,Trim_3
607829,4,Trim_2
1059724,6,Trim_2
407269,3,Trim_1
...,...,...
1136053,7,Trim_3
1389963,8,Trim_3
1668722,11,Trim_4
1726767,11,Trim_4


In [26]:
# S'elimina la variable Month
dfs = dfs.drop(['Month'],axis=1)

In [27]:
# Es creen les variables dummies per a Trimestre i UniqueCarrier
dummies = pd.get_dummies(dfs[['Trimestre','UniqueCarrier']], prefix=[None, 'UC'])
dummies

Unnamed: 0,Trim_1,Trim_2,Trim_3,Trim_4,UC_9E,UC_AA,UC_AQ,UC_AS,UC_B6,UC_CO,UC_DL,UC_EV,UC_F9,UC_FL,UC_HA,UC_MQ,UC_NW,UC_OH,UC_OO,UC_UA,UC_US,UC_WN,UC_XE,UC_YV
276255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1253643,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
607829,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1059724,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
407269,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136053,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1389963,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1668722,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1726767,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
# Dataset final
df_final = dfs.join(dummies)
df_final = df_final.drop(['UniqueCarrier','Trimestre'],axis=1)
df_final.head()

Unnamed: 0,ArrDelay,DepDelay,TaxiIn,TaxiOut,VelMitja,Trim_1,Trim_2,Trim_3,Trim_4,UC_9E,UC_AA,UC_AQ,UC_AS,UC_B6,UC_CO,UC_DL,UC_EV,UC_F9,UC_FL,UC_HA,UC_MQ,UC_NW,UC_OH,UC_OO,UC_UA,UC_US,UC_WN,UC_XE,UC_YV
276255,21.0,31.0,7.0,12.0,445.71,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1253643,0.0,6.0,8.0,24.0,445.13,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
607829,72.0,47.0,4.0,65.0,286.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1059724,45.0,34.0,3.0,17.0,172.73,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
407269,18.0,21.0,3.0,12.0,330.83,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


<span style='color:blue;font-size:20px'> <b> Descriptiu del dataset final </b> </span>

In [29]:
# Dimensió del dataset final
df_final.shape

(10000, 29)

In [30]:
# Descriptiu de les dades
df_final.describe().round()

Unnamed: 0,ArrDelay,DepDelay,TaxiIn,TaxiOut,VelMitja,Trim_1,Trim_2,Trim_3,Trim_4,UC_9E,UC_AA,UC_AQ,UC_AS,UC_B6,UC_CO,UC_DL,UC_EV,UC_F9,UC_FL,UC_HA,UC_MQ,UC_NW,UC_OH,UC_OO,UC_UA,UC_US,UC_WN,UC_XE,UC_YV
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,43.0,44.0,7.0,18.0,396.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,58.0,54.0,6.0,15.0,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,-51.0,6.0,0.0,1.0,85.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.0,12.0,4.0,10.0,349.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,24.0,6.0,14.0,404.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,56.0,54.0,8.0,21.0,448.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1129.0,1090.0,240.0,269.0,1296.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


<span style='color:blue;font-size:20px'> <b> Definició de la variable objectiu i dels predictors </b> </span>

In [31]:
# Variable objectiu
y = df_final['ArrDelay']
# Predictors
X = df_final.drop(['ArrDelay'], axis = 1)
# LLista amb el nom dels predictors
X_list = list(X.columns)

<span style='color:blue;font-size:20px'> <b> Mostra d'entrenament i mostra de prova </b> </span>

In [32]:
# train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.20, random_state = 1234)
print('Training Features Shape:', train_x.shape)
print('Training Labels Shape:', train_y.shape)
print('Testing Features Shape:', test_x.shape)
print('Testing Labels Shape:', test_y.shape)

Training Features Shape: (8000, 28)
Training Labels Shape: (8000,)
Testing Features Shape: (2000, 28)
Testing Labels Shape: (2000,)


## Exercici 1
__Crea almenys tres models de regressió diferents per intentar predir el millor possible l’endarreriment dels vols (ArrDelay) de DelayedFlights.csv.__

<span style='color:blue;font-size:18px'> <b> Model 1: Stepwise Regression </b> </span>

In [33]:
# Es crea una copia de les dades d'entrenament dels predictors
x_train = train_x.copy()

In [34]:
# Funció que aplica la regressió por Mínims Quadrats Ordinaris d'un dataset de predictors y la variable objectiu
def model_est(tx,ty,const=False):
    if const:
        tx = sm.add_constant(tx)
    results = sm.OLS(ty, tx).fit()
    return results

In [35]:
# S'aplica la regressió amb totes les variables
results = model_est(x_train,train_y,const=True)
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               ArrDelay   R-squared:                       0.965
Model:                            OLS   Adj. R-squared:                  0.965
Method:                 Least Squares   F-statistic:                     8480.
Date:                Wed, 19 May 2021   Prob (F-statistic):               0.00
Time:                        17:15:13   Log-Likelihood:                -30084.
No. Observations:                8000   AIC:                         6.022e+04
Df Residuals:                    7973   BIC:                         6.041e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.8363      0.600      8.060      0.0

<span style='color:blue'> Mitjançant el criteri alpha = 0.05, es rebutja totes aquelles variables que tinguin un p-valor de la t de Student superior a 0.05. </span>

<span style='color:blue'> Es comença eliminant la variable amb el p-valor més alt, que és 'UC_OO' </span>

In [36]:
# S'elimina la variable amb p-valor més gran i es torna a aplicar el model
x_train = x_train.drop(['UC_OO'],axis=1)
results = model_est(x_train,train_y,const=True)
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               ArrDelay   R-squared:                       0.965
Model:                            OLS   Adj. R-squared:                  0.965
Method:                 Least Squares   F-statistic:                     8480.
Date:                Wed, 19 May 2021   Prob (F-statistic):               0.00
Time:                        17:15:17   Log-Likelihood:                -30084.
No. Observations:                8000   AIC:                         6.022e+04
Df Residuals:                    7973   BIC:                         6.041e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.3492      0.603      7.216      0.0

<span style='color:blue'> Els següents passos consisteixen en anar eliminant la resta de variables amb p-valors superiors a 0.05 cada cop que es descarta una variable del model. La següent variable que s'elimina és UC_NW. </span>

In [None]:
'''
x_train = x_train.drop(['UC_NW'],axis=1)
results = model_est(x_train,train_y,const=True)
print(results.summary())

x_train = x_train.drop(['Trim_3'],axis=1)
results = model_est(x_train,train_y,const=True)
print(results.summary())

x_train = x_train.drop(['Trim_4'],axis=1)
results = model_est(x_train,train_y,const=True)
print(results.summary())

x_train = x_train.drop(['UC_US'],axis=1)
results = model_est(x_train,train_y,const=True)
print(results.summary())

x_train = x_train.drop(['UC_MQ'],axis=1)
results = model_est(x_train,train_y,const=True)
print(results.summary())

x_train = x_train.drop(['UC_B6'],axis=1)
results = model_est(x_train,train_y,const=True)
print(results.summary())

x_train = x_train.drop(['UC_EV'],axis=1)
results = model_est(x_train,train_y,const=True)
print(results.summary())
'''

<span style='color:blue'> Per a simplificar la tasca, es procedeix al resultat final del model un cop eliminades totes les variables mitjançant el métode backward-looking </span>

In [37]:
# Model final 
x_train = x_train.drop(['UC_NW','Trim_3','Trim_4','UC_US','UC_MQ','UC_B6','UC_EV'],axis=1)
model_1 = model_est(x_train,train_y,const=True)
print(model_1.summary())

                            OLS Regression Results                            
Dep. Variable:               ArrDelay   R-squared:                       0.965
Model:                            OLS   Adj. R-squared:                  0.965
Method:                 Least Squares   F-statistic:                 1.102e+04
Date:                Wed, 19 May 2021   Prob (F-statistic):               0.00
Time:                        17:15:22   Log-Likelihood:                -30089.
No. Observations:                8000   AIC:                         6.022e+04
Df Residuals:                    7979   BIC:                         6.037e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.8771      0.705      5.500      0.0

<span style='color:blue;font-size:18px'> <b> Model 2: Ridge Regression </b> </span>

In [38]:
# Es crea una copa del dataset d'entrenament dels predictors
x_train_2 = train_x.copy()

In [39]:
# Es genera el model d'entrenament mitjançant la Ridge Regression (S'utilitzen totes les variables)
model_2 = RidgeCV()
model_2 = model_2.fit(x_train_2, train_y)

In [40]:
# Intercept del model
print(model_2.intercept_)

6.0888485458055115


In [41]:
# Coeficients del model
print(model_2.coef_)

[ 0.99548104  0.89019316  0.87808833 -0.07135815  0.1521441   1.29231297
 -0.78916727 -0.6552898  -5.21241594  2.04510711 13.32264446  3.49962674
 -2.54704101 -5.60350408 -3.95787325 -2.45980873  3.960371    2.34600399
  7.26188128 -2.09069913 -1.19747629 -5.72325794 -0.66897386  1.22001688
 -1.84263272  4.00867776 -3.00094792 -3.35969835]


<span style='color:blue;font-size:18px'> <b> Model 3: LASSO  </b> </span>

In [42]:
# Es crea una copa del dataset d'entrenament dels predictors
x_train_3 = train_x.copy()

In [43]:
# Aquest métode treballa amb poques variables. S'eliminen les variables dummy
x_train_3 = x_train_3.drop(x_train_3.columns[4:],axis=1)
x_train_3.head()

Unnamed: 0,DepDelay,TaxiIn,TaxiOut,VelMitja
131399,30.0,9.0,18.0,519.48
1221172,130.0,13.0,11.0,431.83
650342,18.0,8.0,9.0,390.0
447821,150.0,4.0,26.0,461.79
125634,13.0,6.0,21.0,318.26


In [44]:
# Es genera el model d'entrenament mitjançant LASSO (S'utilitzen només les variables numèriques)
model_3 = LassoCV()
model_3 = model_3.fit(x_train_3, train_y)

In [45]:
# Intercept del model
print(model_3.intercept_)

6.2970439327316825


In [46]:
# Coeficients del model
print(model_3.coef_)

[ 0.99281541  0.73385591  0.81574998 -0.06666479]


## Exercici 2
__Compara’ls en base al MSE i al R2__

<span style='color:blue;font-size:18px'> <b> Comparació dels MSE (Mitjana del quadrat dels errors) </b> </span>

In [47]:
# Model 1: Stepwise Regression
model_1.mse_resid

108.52755141644293

In [48]:
# Model 2: Ridge Regression
pred_2 = model_2.predict(x_train_2)
MSE_2 = mean_squared_error(train_y, pred_2)
MSE_2

108.10423845222758

In [49]:
# Model 3: LASSO
pred_3 = model_3.predict(x_train_3)
MSE_3 = mean_squared_error(train_y, pred_3)
MSE_3

118.55330687042348

<span style='color:blue;font-size:15px'> El model amb menys MSE és el model 2 Ridge Regression. </span>

<span style='color:blue;font-size:18px'> <b> Comparació dels R2 (Coeficient de determinació) </b> </span>

In [50]:
# Model 1: Stepwise Regression
R2_1 = model_1.rsquared
print('R2:', round(R2_1,4))
R2_1_Adj = 1-(1-R2_1)*(8000-1)/(8000-20-1)
print('R2 Ajustat:', round(R2_1_Adj,4))

R2: 0.9651
R2 Ajustat: 0.965


In [51]:
# Model 2: Ridge Regression
R2_2 = model_2.score(x_train_2, train_y)
print('R2:', round(R2_2,4))
R2_2_Adj = 1-(1-R2_2)*(8000-1)/(8000-26-1)
print('R2 Ajustat:', round(R2_2_Adj,4))

R2: 0.9651
R2 Ajustat: 0.965


In [52]:
# Model 3: LASSO
R2_3 = model_3.score(x_train_3, train_y)
print('R2:',round(R2_3,4))
R2_3_Adj = 1-(1-R2_3)*(8000-1)/(8000-4-1)
print('R2 Ajustat:', round(R2_3_Adj,4))

R2: 0.9617
R2 Ajustat: 0.9617


<span style='color:blue;font-size:15px'> Els models Stepwise i Ridge tenen els R2 ajustats més alts. </span>

## Exercici 3
__Entrena’ls utilitzant els diferents paràmetres que admeten.__

## Exercici 4
__Compara el seu rendiment utilitzant l’aproximació traint/test o utilitzant totes les dades (validació interna)__

<span style='color:blue;font-size:18px'> <b> Rendiment Stepwise Regression </b> </span>

In [53]:
# Dataset de prova dels predictors
x_test_SR = test_x.copy()
x_test_SR = x_test_SR.drop(['UC_OO','UC_NW','Trim_3','Trim_4','UC_US','UC_MQ','UC_B6','UC_EV'],axis=1)

In [54]:
# Prediccions
x_test_SR = sm.add_constant(x_test_SR)
pred_SR = model_1.predict(x_test_SR)

In [55]:
# Error Absolut Mig
errors_SR = abs(pred_SR - test_y)
print('Error Absolut Mig:', round(np.mean(errors_SR), 2), 'degrees.')

Error Absolut Mig: 7.34 degrees.


In [56]:
# Error Percentual Mig Arctangent
errors_p_SR = abs((pred_SR - test_y)/test_y)
print('Error Percentual Mig Arctangent:', round(np.mean(np.arctan(errors_p_SR)),4))

Error Percentual Mig Arctangent: 0.377


In [57]:
# Mean Squared Error
MSE_SR = mean_squared_error(test_y, pred_SR)
MSE_SR

101.05914184799374

In [58]:
# R2
residus = pred_SR - test_y
R2_SR = 1-residus.var()/test_y.var()
print('R2_SR:', round(R2_SR,4))
R2_SR_Adj = 1-(1-R2_SR)*(2000-1)/(2000-20-1)
print('R2_SR Ajustat:', round(R2_SR_Adj,4))

R2_SR: 0.9766
R2_SR Ajustat: 0.9764


<span style='color:blue;font-size:18px'> <b> Rendiment Ridge Regression </b> </span>

In [59]:
# Dataset de prova dels predictors
x_test_RR = test_x.copy()

In [60]:
# Prediccions
pred_RR = model_2.predict(x_test_RR)

In [61]:
# Error Absolut Mig
errors_2 = abs(pred_RR - test_y)
print('Error Absolut Mig:', round(np.mean(errors_2), 2), 'degrees.')

Error Absolut Mig: 7.34 degrees.


In [62]:
# Error Percentual Mig Arctangent
errors_p_RR = abs((pred_RR - test_y)/test_y)
print('Error Percentual Mig Arctangent:', round(np.mean(np.arctan(errors_p_RR)),4))

Error Percentual Mig Arctangent: 0.3764


In [63]:
# Mean Squared Error
MSE_RR = mean_squared_error(test_y, pred_RR)
MSE_RR

101.0898708173139

In [64]:
# R2
R2_RR = model_2.score(x_test_RR, test_y)
print('R2_RR:', round(R2_RR,4))
R2_RR_Adj = 1-(1-R2_RR)*(2000-1)/(2000-20-1)
print('R2_RR Ajustat:', round(R2_RR_Adj,4))

R2_RR: 0.9766
R2_RR Ajustat: 0.9763


<span style='color:blue;font-size:18px'> <b> Rendiment LASSO </b> </span>

In [65]:
# Dataset de prova dels predictors
x_test_L = test_x.copy()
x_test_L = x_test_L.drop(x_test_L.columns[4:],axis=1)

In [66]:
# Prediccions
pred_L = model_3.predict(x_test_L)

In [67]:
# Error Absolut Mig
errors_L = abs(pred_L - test_y)
print('Error Absolut Mig:', round(np.mean(errors_L), 2), 'degrees.')

Error Absolut Mig: 7.71 degrees.


In [68]:
# Error Percentual Mig Arctangent
errors_p_L = abs((pred_L - test_y)/test_y)
print('Error Percentual Mig Arctangent:', round(np.mean(np.arctan(errors_p_L)),4))

Error Percentual Mig Arctangent: 0.3885


In [69]:
# Mean Squared Error
MSE_L = mean_squared_error(test_y, pred_L)
MSE_L

110.043138567327

In [70]:
# R2
R2_L = model_3.score(x_test_L, test_y)
print('R2_L:', round(R2_L,4))
R2_L_Adj = 1-(1-R2_L)*(2000-1)/(2000-4-1)
print('R2_L Ajustat:', round(R2_L_Adj,4))

R2_L: 0.9745
R2_L Ajustat: 0.9745


<span style='color:blue;font-size:15px'> 
    <p> Els models Stepwise i Ridge tenen l'error absolut mig més baix, que és de 7,34. </p> 
    <p> El model Ridge té l'error percentual mig arctanget més baix, que és de 0,3764. </p>
    <p> El model Stepwise té el MSE més baix, que és de 101,0591. </p>
    <p> Els model Stepwise té el R2 ajustat més alt, que és de 0,9764. </p>
</span>

## Exercici 5
__Realitza algun procés d’enginyeria de variables per millorar-ne la predicció.__

In [71]:
x_train_norm = x_train.copy()

In [72]:
# Función que calcula el mínimo y el máximo de las columnas de un dataset, y luego las normaliza (valores entre 0 y 1)
def norm_dataset(dataset):
    for i in dataset.columns:
        if dataset[i].dtypes != 'O':
            value_min = min(dataset[i])
            value_max = max(dataset[i])
            dataset[i] = (dataset[i] - value_min) / (value_max - value_min)
            df = dataset
    return df

In [73]:
X_norm_SR = norm_dataset(x_train_norm)

In [74]:
model_1_norm = model_est(X_norm_SR,train_y,const=True)
print(model_1_norm.summary())

                            OLS Regression Results                            
Dep. Variable:               ArrDelay   R-squared:                       0.965
Model:                            OLS   Adj. R-squared:                  0.965
Method:                 Least Squares   F-statistic:                 1.102e+04
Date:                Wed, 19 May 2021   Prob (F-statistic):               0.00
Time:                        17:16:23   Log-Likelihood:                -30089.
No. Observations:                8000   AIC:                         6.022e+04
Df Residuals:                    7979   BIC:                         6.037e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.5879      0.584      7.859      0.0

In [75]:
x_train_est = x_train.copy()

In [76]:
# Función que calcula la media y desviación tipo de cada columna, y luego calcula sus valores estandarizados
def est_dataset(dataset):
    for i in dataset.columns:
        if dataset[i].dtypes != 'O':
            media = dataset[i].mean()
            desv = dataset[i].std()
            dataset[i] = (dataset[i] - media) / desv
    return dataset

In [77]:
X_est_SR = est_dataset(x_train)

In [78]:
model_1_est = model_est(X_est_SR,train_y,const=True)
print(model_1_est.summary())

                            OLS Regression Results                            
Dep. Variable:               ArrDelay   R-squared:                       0.965
Model:                            OLS   Adj. R-squared:                  0.965
Method:                 Least Squares   F-statistic:                 1.102e+04
Date:                Wed, 19 May 2021   Prob (F-statistic):               0.00
Time:                        17:16:29   Log-Likelihood:                -30089.
No. Observations:                8000   AIC:                         6.022e+04
Df Residuals:                    7979   BIC:                         6.037e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         43.0134      0.116    369.299      0.0

## Exercici 6
__No utilitzis la variable DepDelay a l’hora de fer prediccions.__

<span style='color:blue;font-size:18px'> <b> Model 1: Stepwise Regression </b> </span>

In [79]:
# Es crea una copia de les dades d'entrenament dels predictors
x_train_DD = train_x.copy()
x_train_DD = x_train_DD.drop('DepDelay',axis=1)
x_train_DD.head()

Unnamed: 0,TaxiIn,TaxiOut,VelMitja,Trim_1,Trim_2,Trim_3,Trim_4,UC_9E,UC_AA,UC_AQ,UC_AS,UC_B6,UC_CO,UC_DL,UC_EV,UC_F9,UC_FL,UC_HA,UC_MQ,UC_NW,UC_OH,UC_OO,UC_UA,UC_US,UC_WN,UC_XE,UC_YV
131399,9.0,18.0,519.48,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1221172,13.0,11.0,431.83,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
650342,8.0,9.0,390.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
447821,4.0,26.0,461.79,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
125634,6.0,21.0,318.26,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [80]:
# S'aplica la regressió amb totes les variables
results = model_est(x_train_DD,train_y,const=True)
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               ArrDelay   R-squared:                       0.126
Model:                            OLS   Adj. R-squared:                  0.124
Method:                 Least Squares   F-statistic:                     46.10
Date:                Wed, 19 May 2021   Prob (F-statistic):          1.53e-211
Time:                        17:16:36   Log-Likelihood:                -42965.
No. Observations:                8000   AIC:                         8.598e+04
Df Residuals:                    7974   BIC:                         8.616e+04
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         38.5848      2.977     12.960      0.0

In [81]:
# S'elimina totes les variables amb p-valor major a 0.05
x_train_DD = x_train_DD.drop(['UC_WN','UC_AQ','UC_9E','UC_OH','UC_MQ','UC_F9','UC_CO','UC_HA','UC_DL','UC_US','UC_NW'],axis=1)
model_1_DD = model_est(x_train_DD,train_y,const=True)
print(model_1_DD.summary())

                            OLS Regression Results                            
Dep. Variable:               ArrDelay   R-squared:                       0.125
Model:                            OLS   Adj. R-squared:                  0.124
Method:                 Least Squares   F-statistic:                     76.32
Date:                Wed, 19 May 2021   Prob (F-statistic):          9.21e-219
Time:                        17:16:39   Log-Likelihood:                -42969.
No. Observations:                8000   AIC:                         8.597e+04
Df Residuals:                    7984   BIC:                         8.608e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         38.4742      2.746     14.011      0.0

<span style='color:blue;font-size:18px'> <b> Model 2: Ridge Regression </b> </span>

In [82]:
# Es crea una copia de les dades d'entrenament dels predictors
x_train_DD = train_x.copy()
x_train_DD = x_train_DD.drop('DepDelay',axis=1)

In [83]:
# Es genera el model d'entrenament mitjançant la Ridge Regression (S'utilitzen totes les variables)
model_2_DD = RidgeCV()
model_2_DD = model_2_DD.fit(x_train_DD, train_y)

In [84]:
# Intercept del model
print(model_2_DD.intercept_)

50.51138508538594


In [85]:
# Intercept del model
print(model_2_DD.coef_)

[  1.46097516   1.01460951  -0.09231934   0.33970016   0.5397014
  -0.57056383  -0.30883773  -2.64229463   4.61331195  -0.83402252
   7.53691391   5.91705334  -5.59594985  -6.21942306   5.48842825
  -5.86142755   3.5286457  -10.61888974  -3.44647726  -8.0298589
  -0.88957265   3.22600774  10.82261109  -6.36910814  -2.19681469
   2.35096246   9.21990456]


<span style='color:blue;font-size:18px'> <b> Model 3: LASSO </b> </span>

In [86]:
# Es crea una copia de les dades d'entrenament dels predictors
x_train_DD = train_x.copy()
x_train_DD = x_train_DD.iloc[:,1:4]
x_train_DD.head()

Unnamed: 0,TaxiIn,TaxiOut,VelMitja
131399,9.0,18.0,519.48
1221172,13.0,11.0,431.83
650342,8.0,9.0,390.0
447821,4.0,26.0,461.79
125634,6.0,21.0,318.26


In [87]:
# Es genera el model d'entrenament mitjançant la LASSO (només variables numèriques)
model_3_DD = LassoCV()
model_3_DD = model_3_DD.fit(x_train_DD, train_y)

In [88]:
# Intercept del model
print(model_3_DD.intercept_)

51.426856239837896


In [89]:
# Intercept del model
print(model_3_DD.coef_)

[ 1.38746596  1.01098511 -0.09186316]
