# Tasca: Aprenentatge Supervisat - Classificació

## Càrrega de llibreries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn.neural_network import MLPClassifier

import xgboost as xgb

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_columns', None)

## Lectura de la base de dades

In [2]:
# Lectura de dades
dfl = pd.read_csv('DelayedFlights.csv')
dfl[:3]

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,N712SW,128.0,150.0,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,N,0,,,,,
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,N772SW,128.0,145.0,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,N,0,,,,,
2,2,2008,1,3,4,628.0,620,804.0,750,WN,448,N428WN,96.0,90.0,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,N,0,,,,,


## Tractament de la base de dades

In [3]:
# Es crea una copia del dataset
df = dfl.copy()

In [4]:
# S'elimina la primera columna
df = df.drop(df.columns[0],axis=1)
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,N712SW,128.0,150.0,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,N,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,N772SW,128.0,145.0,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,N,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,N428WN,96.0,90.0,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,N,0,,,,,
3,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,N464WN,90.0,90.0,77.0,34.0,34.0,IND,BWI,515,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0
4,2008,1,3,4,1940.0,1915,2121.0,2110,WN,378,N726SW,101.0,115.0,87.0,11.0,25.0,IND,JAX,688,4.0,10.0,0,N,0,,,,,


In [5]:
# Dimensió del dataset
df.shape

(1936758, 29)

In [6]:
# Quantitat de null de cada variable
df.isna().sum()

Year                      0
Month                     0
DayofMonth                0
DayOfWeek                 0
DepTime                   0
CRSDepTime                0
ArrTime                7110
CRSArrTime                0
UniqueCarrier             0
FlightNum                 0
TailNum                   5
ActualElapsedTime      8387
CRSElapsedTime          198
AirTime                8387
ArrDelay               8387
DepDelay                  0
Origin                    0
Dest                      0
Distance                  0
TaxiIn                 7110
TaxiOut                 455
Cancelled                 0
CancellationCode          0
Diverted                  0
CarrierDelay         689270
WeatherDelay         689270
NASDelay             689270
SecurityDelay        689270
LateAircraftDelay    689270
dtype: int64

In [7]:
# S'eliminen les columnes amb més de 500000 valors nuls
df_2 = df.drop(['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'], axis=1)

In [8]:
# S'eliminen totes les observacions amb algun registre nul
df_2 = df_2.dropna(axis=0, how='any')
df_2.isna().sum()

Year                 0
Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
CRSDepTime           0
ArrTime              0
CRSArrTime           0
UniqueCarrier        0
FlightNum            0
TailNum              0
ActualElapsedTime    0
CRSElapsedTime       0
AirTime              0
ArrDelay             0
DepDelay             0
Origin               0
Dest                 0
Distance             0
TaxiIn               0
TaxiOut              0
Cancelled            0
CancellationCode     0
Diverted             0
dtype: int64

In [9]:
# Dimensions de la base de dades
df_2.shape

(1928368, 24)

<span style='color:blue;font-size:18px'> <b> Eliminació de variables </b> </span>

In [10]:
# Descriptiu de les variables numèriques
df_2.describe().round(3)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
count,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0,1928368.0
mean,2008.0,6.108,15.752,3.985,1518.648,1467.717,1610.242,1634.196,2184.292,133.306,134.198,108.277,42.2,43.092,764.949,6.811,18.217,0.0,0.0
std,0.0,3.481,8.777,1.996,450.436,424.728,548.001,464.629,1944.448,72.06,71.233,68.643,56.784,53.266,573.886,5.268,14.308,0.0,0.0
min,2008.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,14.0,-21.0,0.0,-109.0,6.0,11.0,0.0,0.0,0.0,0.0
25%,2008.0,3.0,8.0,2.0,1203.0,1135.0,1316.0,1325.0,611.0,80.0,82.0,58.0,9.0,12.0,338.0,4.0,10.0,0.0,0.0
50%,2008.0,6.0,16.0,4.0,1545.0,1510.0,1715.0,1705.0,1543.0,116.0,116.0,90.0,24.0,24.0,606.0,6.0,14.0,0.0,0.0
75%,2008.0,9.0,23.0,6.0,1900.0,1815.0,2030.0,2014.0,3423.0,165.0,165.0,137.0,56.0,53.0,997.0,8.0,21.0,0.0,0.0
max,2008.0,12.0,31.0,7.0,2400.0,2359.0,2400.0,2359.0,9741.0,1114.0,660.0,1091.0,2461.0,2467.0,4962.0,240.0,422.0,0.0,0.0


In [11]:
# Descriptiu de les variables categòriques
df_2.describe(include='object')

Unnamed: 0,UniqueCarrier,TailNum,Origin,Dest,CancellationCode
count,1928368,1928368,1928368,1928368,1928368
unique,20,5360,303,302,1
top,WN,N325SW,ATL,ORD,N
freq,376201,961,131213,108265,1928368


In [12]:
# La variable 'FlightNum' es tracta d'una variable categòrica encara que s'hagi importat com a numèrica.
# Es calcula el nombre de categories de la variable
len(df_2['FlightNum'].unique())

7498

In [12]:
# S'eliminen les variables categòriques DayofMonth, TailNum, Origin, Dest i FlightNum per tenir moltes categories.
df_2 = df_2.drop(['TailNum', 'Origin', 'Dest', 'DayofMonth','FlightNum'], axis=1)

In [13]:
# S'eliminen les variables Cancelled, Diverted, CancellationCode i Year porquè només tenen un únic valor.
df_2 = df_2.drop(['Year', 'Cancelled', 'Diverted', 'CancellationCode'], axis=1)

In [14]:
# S'eliminen les variables DepTime, CRSDepTime, ArrTime i CRSArrTime perquè indiquen l'hora d'un esdeveniment.
df_2 = df_2.drop(['DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime'], axis=1)

In [15]:
# S'eliminen les variables CRSElapsedTime i DayOfWeek perquè es considera que no serveixen per predir la variable objectiu 'ArrDelay'.
df_2 = df_2.drop(['DayOfWeek', 'CRSElapsedTime'], axis=1)

In [16]:
# Correlació lineal entre ArrDelay i ActualElapsedTime
df_2['ArrDelay'].corr(df_2['ActualElapsedTime'])

0.06813024884504432

In [17]:
# S'elimina la variable ActualElapsedTime perquè no està correlacionada linealment amb la variable objectiu ArrDelay
df_2 = df_2.drop(['ActualElapsedTime'], axis=1)

In [18]:
# Es crea una copia
df_3 = df_2.copy()

In [19]:
# Dataset fins el moment
df_3.head()

Unnamed: 0,Month,UniqueCarrier,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut
0,1,WN,116.0,-14.0,8.0,810,4.0,8.0
1,1,WN,113.0,2.0,19.0,810,5.0,10.0
2,1,WN,76.0,14.0,8.0,515,3.0,17.0
3,1,WN,77.0,34.0,34.0,515,3.0,10.0
4,1,WN,87.0,11.0,25.0,688,4.0,10.0


## Selecció de la mostra

In [20]:
# Dimensions
df_3.shape

(1928368, 8)

<span style='color:blue;font-size:15px'> Com que la base de dades és massa gran, es decideix obtenir una mostra de 10000 observacions. El métode de mostreig escollit és l'aleatori simple, encara que caldria estudiar si alguna de les variables restants serviria per a estratificar la mostra. </span>

In [21]:
# Mostra aleatoria simple de 10000 observacions
k = 10000
dfs = df_3.sample(k,random_state=1234)

## Creació de noves variables

In [22]:
# Variable Velocitat Mitja, que és la divisió entre la distancia (en milles) i el temps (en hores).
dfs['VelMitja'] = round(dfs['Distance']/(dfs['AirTime']/60),2)
dfs.head()

Unnamed: 0,Month,UniqueCarrier,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,VelMitja
276255,2,US,98.0,21.0,31.0,728,7.0,12.0,445.71
1253643,7,CO,191.0,0.0,6.0,1417,8.0,24.0,445.13
607829,4,XE,30.0,72.0,47.0,143,4.0,65.0,286.0
1059724,6,AS,33.0,45.0,34.0,95,3.0,17.0,172.73
407269,3,WN,72.0,18.0,21.0,397,3.0,12.0,330.83


In [23]:
# S'eliminen les variables AirTime i Distance.
dfs = dfs.drop(['AirTime','Distance'],axis=1)

In [24]:
# Es crea la variable Trimestre, que dividieix els mesos en quartre parts.
dfs['Trimestre'] = np.where((dfs['Month'] == 1) | (dfs['Month'] == 2) | (dfs['Month'] == 3), 'Trim_1', 
                             np.where((dfs['Month'] == 4) | (dfs['Month'] == 5) | (dfs['Month'] == 6), 'Trim_2', 
                                      np.where((dfs['Month'] == 7) | (dfs['Month'] == 8) | (dfs['Month'] == 9), 'Trim_3', 'Trim_4')))
dfs[['Month','Trimestre']]

Unnamed: 0,Month,Trimestre
276255,2,Trim_1
1253643,7,Trim_3
607829,4,Trim_2
1059724,6,Trim_2
407269,3,Trim_1
...,...,...
1136053,7,Trim_3
1389963,8,Trim_3
1668722,11,Trim_4
1726767,11,Trim_4


In [25]:
# S'elimina la variable Month
dfs = dfs.drop(['Month'],axis=1)

In [26]:
# Es creen les variables dummies per a Trimestre i UniqueCarrier
dummies = pd.get_dummies(dfs[['Trimestre','UniqueCarrier']], prefix=[None, 'UC'])
dummies

Unnamed: 0,Trim_1,Trim_2,Trim_3,Trim_4,UC_9E,UC_AA,UC_AQ,UC_AS,UC_B6,UC_CO,UC_DL,UC_EV,UC_F9,UC_FL,UC_HA,UC_MQ,UC_NW,UC_OH,UC_OO,UC_UA,UC_US,UC_WN,UC_XE,UC_YV
276255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1253643,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
607829,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1059724,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
407269,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136053,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1389963,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1668722,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1726767,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
# Es crea la variable objectiu 'AD_label', que val 1 si ArrDelay > 0 i 0 en cas contrari
dfs['AD_label'] = np.where((dfs['ArrDelay'] > 0), 1, 0)
dfs['AD_label'][:10]

276255     1
1253643    0
607829     1
1059724    1
407269     1
1211467    1
1157579    1
1770029    0
1085831    1
1003736    1
Name: AD_label, dtype: int32

In [28]:
# S'elimina la variable ArrDelay
dfs = dfs.drop('ArrDelay',axis=1)

In [29]:
# Dataset final
df_final = dfs.join(dummies)
df_final = df_final.drop(['UniqueCarrier','Trimestre'],axis=1)
df_final.head()

Unnamed: 0,DepDelay,TaxiIn,TaxiOut,VelMitja,AD_label,Trim_1,Trim_2,Trim_3,Trim_4,UC_9E,UC_AA,UC_AQ,UC_AS,UC_B6,UC_CO,UC_DL,UC_EV,UC_F9,UC_FL,UC_HA,UC_MQ,UC_NW,UC_OH,UC_OO,UC_UA,UC_US,UC_WN,UC_XE,UC_YV
276255,31.0,7.0,12.0,445.71,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1253643,6.0,8.0,24.0,445.13,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
607829,47.0,4.0,65.0,286.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1059724,34.0,3.0,17.0,172.73,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
407269,21.0,3.0,12.0,330.83,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


<span style='color:blue;font-size:20px'> <b> Descriptiu del dataset final </b> </span>

In [30]:
# Dimensió del dataset final
df_final.shape

(10000, 29)

In [31]:
# Descriptiu de les dades
df_final.describe().round()

Unnamed: 0,DepDelay,TaxiIn,TaxiOut,VelMitja,AD_label,Trim_1,Trim_2,Trim_3,Trim_4,UC_9E,UC_AA,UC_AQ,UC_AS,UC_B6,UC_CO,UC_DL,UC_EV,UC_F9,UC_FL,UC_HA,UC_MQ,UC_NW,UC_OH,UC_OO,UC_UA,UC_US,UC_WN,UC_XE,UC_YV
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,44.0,7.0,18.0,396.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,54.0,6.0,15.0,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,6.0,0.0,1.0,85.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12.0,4.0,10.0,349.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,24.0,6.0,14.0,404.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,54.0,8.0,21.0,448.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1090.0,240.0,269.0,1296.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Definició de la variable objectiu i dels predictors 

In [32]:
# Variable objectiu
y = df_final['AD_label']
# Predictors
X = df_final.drop(['AD_label'], axis = 1)
# LLista amb el nom dels predictors
X_list = list(X.columns)

## Mostra d'entrenament i mostra de prova

In [33]:
# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1234)
print('Training Features Shape:', X_train.shape)
print('Testing Features Shape:', X_test.shape)

Training Features Shape: (8000, 28)
Testing Features Shape: (2000, 28)


In [34]:
# Percentatge de positius en la mostra d'entrenament
y_train.sum()/y_train.count()

0.896125

In [35]:
# Percentatge de positius en la mostra de prova
y_test.sum()/y_test.count()

0.894

## Exercici 1
__Crea almenys tres models de classificació diferents per intentar predir el millor possible l’endarreriment dels vols (ArrDelay) de DelayedFlights.csv. Considera si el vol ha arribat tard o no (ArrDelay > 0).__

<span style='color:blue;font-size:18px'> <b> Model 1: Regressió Logística </b> </span>

In [36]:
# Creació del model logístic
logreg = LogisticRegression(max_iter=1000)

# Ajust del model
logreg.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [37]:
# Prediccions amb la mostra d'entrenament
pred_train_RL = logreg.predict(X_train)

<span style='color:blue;font-size:18px'> <b> Model 2: Xarxes neuronals </b> </span>

In [38]:
# Es normalitzen les variables numèriques
norm = MinMaxScaler()
norm.fit(X_train)

# S'aplica la transformació
X_train_2 = norm.transform(X_train)
X_test_2 = norm.transform(X_test)

In [39]:
# Construcció i ajust del model
mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500, random_state=1234)
mlp.fit(X_train_2,y_train)

MLPClassifier(hidden_layer_sizes=(8, 8, 8), max_iter=500, random_state=1234)

In [40]:
# Prediccions
pred_train_XN = mlp.predict(X_train_2)

<span style='color:blue;font-size:18px'> <b> Model 3: Extreme Gradient Boosting  </b> </span>

In [41]:
# Creació i ajust del model
xg_reg = xgb.XGBClassifier(objective ='binary:logistic',use_label_encoder=False,random_state=1234)
xg_reg.fit(X_train,y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              random_state=1234, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [42]:
# Prediccions
pred_train_XGB = xg_reg.predict(X_train)

## Exercici 2
__Compara els models de classificació utilitzant la precisió (accuracy), una matriu de confiança i d’altres mètriques més avançades.__

<span style='color:blue;font-size:18px'> <b> Accuracy </b> </span>

In [43]:
# Model 1: Regressió Logística
print(metrics.accuracy_score(y_train, pred_train_RL))

0.917625


In [44]:
# Model 2: Xarxes neuronals
print(metrics.accuracy_score(y_train, pred_train_XN))

0.9195


In [45]:
# Model 3: XGBoost
print(metrics.accuracy_score(y_train, pred_train_XGB))

0.9765


<span style='color:blue;font-size:15px'> El model amb millor Accuracy és el model 3 XGBoost. </span>

<span style='color:blue;font-size:18px'> <b> Matriz de Confusión </b> </span>

In [46]:
# Model 1: Regressió Logística
confusion_RL = metrics.confusion_matrix(y_train, pred_train_RL)
print(confusion_RL)

[[ 315  516]
 [ 143 7026]]


In [47]:
# Model 2: Xarxes neuronals
confusion_XN = metrics.confusion_matrix(y_train, pred_train_XN)
print(confusion_XN)

[[ 359  472]
 [ 172 6997]]


In [48]:
# Model 1: XGBoost
confusion_XGB = metrics.confusion_matrix(y_train, pred_train_XGB)
print(confusion_XGB)

[[ 676  155]
 [  33 7136]]


<span style='color:blue;font-size:15px'> La Matriux del model 3 (XGBoost) indica que hi ha més TP i TN que en la resta de models </span>

<span style='color:blue;font-size:18px'> <b> Puntuación F1 </b> </span>

In [49]:
# Model 1: Regressió Logística
f1_RL = metrics.f1_score(y_train, pred_train_RL)
print(f1_RL)

0.9552035891509755


In [50]:
# Model 2: Xarxes neuronals
f1_XN = metrics.f1_score(y_train, pred_train_XN)
print(f1_XN)

0.956004918704741


In [51]:
# Model 3: XGBoost
f1_XGB = metrics.f1_score(y_train, pred_train_XGB)
print(f1_XGB)

0.9869986168741356


<span style='color:blue;font-size:15px'> El model amb millor F1 és el model 3 XGBoost. </span>

<span style='color:blue;font-size:18px'> <b> Precisión </b> </span>

In [52]:
# Model 1: Regressió Logística
pres_RL = metrics.precision_score(y_train, pred_train_RL)
print(pres_RL)

0.9315831344470963


In [53]:
# Model 2: Xarxes 
pres_XN = metrics.precision_score(y_train, pred_train_XN)
print(pres_XN)

0.9368054625786585


In [54]:
# Model 3: XGBoost
pres_XGB = metrics.precision_score(y_train, pred_train_XGB)
print(pres_XGB)

0.9787409134549444


<span style='color:blue;font-size:15px'> El model amb millor presició és el model 3 XGBoost. </span>

<span style='color:blue;font-size:18px'> <b> Área bajo la curva ROC </b> </span>

In [55]:
# Model 1: Regressió Logística
AUC_RL = metrics.roc_auc_score(y_train, pred_train_RL)
print(AUC_RL)

0.6795571889196013


In [56]:
# Model 2: Xarxes Neuronals
AUC_XN = metrics.roc_auc_score(y_train, pred_train_XN)
print(AUC_XN)

0.704008719182857


In [57]:
# Model 3: XGBoost
AUC_XGB = metrics.roc_auc_score(y_train, pred_train_XGB)
print(AUC_XGB)

0.904437292601737


<span style='color:blue;font-size:15px'> El model amb millor AUC és el del model 3 XGBoost. </span>

## Exercici 3
__Entrena’ls utilitzant els diferents paràmetres que admeten.__

<span style='color:blue;font-size:18px'> <b> Model 1: Regressió Logística </b> </span>

In [58]:
# Selecció de diversos paràmetres i valors
tuned_parameters = {
    'penalty': ['l2', 'none'],
    'C': [0.5,1.0,1.5],
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
}

# Selecció de la mètrica a optimitzar
#scores = ['precision', 'recall', 'f1']
#scores = [ 'f1_weighted','f1_micro','f1_macro','accuracy','roc_auc']
score='f1'

print("# Afinació dels hiper-paràmetres de %s" % score)
print()
clf_RL = GridSearchCV(estimator= LogisticRegression(max_iter=1000), param_grid=tuned_parameters, cv=10,scoring='%s' % score,n_jobs= -1)
clf_RL.fit(X_train, y_train)
    
print("Millors paràmetres trobats:")
print()
print(clf_RL.best_params_)
print()
print("Puntuacions de les combinacions d'hiper-paràmetres:")
print()
means = clf_RL.cv_results_['mean_test_score']
stds = clf_RL.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf_RL.cv_results_['params']):
     print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

# Afinació dels hiper-paràmetres de f1

Millors paràmetres trobats:

{'C': 0.5, 'penalty': 'l2', 'solver': 'newton-cg'}

Puntuacions de les combinacions d'hiper-paràmetres:

0.954 (+/-0.011) for {'C': 0.5, 'penalty': 'l2', 'solver': 'newton-cg'}
0.953 (+/-0.011) for {'C': 0.5, 'penalty': 'l2', 'solver': 'lbfgs'}
0.952 (+/-0.008) for {'C': 0.5, 'penalty': 'l2', 'solver': 'sag'}
0.952 (+/-0.009) for {'C': 0.5, 'penalty': 'l2', 'solver': 'saga'}
0.953 (+/-0.011) for {'C': 0.5, 'penalty': 'none', 'solver': 'newton-cg'}
0.954 (+/-0.012) for {'C': 0.5, 'penalty': 'none', 'solver': 'lbfgs'}
0.952 (+/-0.008) for {'C': 0.5, 'penalty': 'none', 'solver': 'sag'}
0.952 (+/-0.009) for {'C': 0.5, 'penalty': 'none', 'solver': 'saga'}
0.954 (+/-0.011) for {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.953 (+/-0.011) for {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.952 (+/-0.008) for {'C': 1.0, 'penalty': 'l2', 'solver': 'sag'}
0.952 (+/-0.009) for {'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}

<span style='color:blue;font-size:15px'> La millor combinació de paràmetres del model de Regressió Logística que optimitza f1  és C=0.5, penalty=l2 i solver=newton-cg. </span>

<span style='color:blue;font-size:18px'> <b> Model 2: Xarxes neuronals </b> </span>

In [59]:
# Selecció de diversos paràmetres i valors
tuned_parameters = {
    'hidden_layer_sizes': [[8,8],[8,8,8],[8,8,8,8]],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
}

# Selecció de la mètrica a optimitzar
#scores = ['precision', 'recall', 'f1']
#scores = [ 'f1_weighted','f1_micro','f1_macro','accuracy','roc_auc']
score='f1'

print("# Afinació dels hiper-paràmetres de %s" % score)
print()
clf_XN = GridSearchCV(estimator= MLPClassifier(max_iter=500,random_state=1234), param_grid=tuned_parameters, cv=10,scoring='%s' % score,n_jobs= -1)
clf_XN.fit(X_train_2, y_train)
    
print("Millors paràmetres trobats:")
print()
print(clf_XN.best_params_)
print()
print("Puntuacions de les combinacions d'hiper-paràmetres:")
print()
means = clf_XN.cv_results_['mean_test_score']
stds = clf_XN.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf_XN.cv_results_['params']):
     print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

# Afinació dels hiper-paràmetres de f1

Millors paràmetres trobats:

{'activation': 'tanh', 'hidden_layer_sizes': [8, 8], 'solver': 'adam'}

Puntuacions de les combinacions d'hiper-paràmetres:

0.953 (+/-0.011) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8], 'solver': 'lbfgs'}
0.945 (+/-0.001) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8], 'solver': 'sgd'}
0.953 (+/-0.010) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8], 'solver': 'adam'}
0.954 (+/-0.011) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8, 8], 'solver': 'lbfgs'}
0.945 (+/-0.000) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8, 8], 'solver': 'sgd'}
0.953 (+/-0.011) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8, 8], 'solver': 'adam'}
0.954 (+/-0.012) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8, 8, 8], 'solver': 'lbfgs'}
0.945 (+/-0.000) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8, 8, 8], 'solver': 'sgd'}
0.952 (+/-0.014) 

<span style='color:blue;font-size:15px'> La millor combinació de paràmetres del model de Xarxes Neuronals que optimitza f1 és activation=tanh, hidden_layer_size=[8,8] i solver=adam. </span>

<span style='color:blue;font-size:18px'> <b> Model 3: Extreme Gradient Boosting  </b> </span>

In [60]:
# Selecció de diversos paràmetres i valors
tuned_parameters = {
    'booster': ['gbtree', 'gblinear', 'dart'],
}

# Selecció de la mètrica a optimitzar
#scores = ['precision', 'recall', 'f1']
#scores = [ 'f1_weighted','f1_micro','f1_macro','accuracy','roc_auc']
score='f1'

print("# Afinació dels hiper-paràmetres de %s" % score)
print()
clf_XGB = GridSearchCV(estimator= xgb.XGBClassifier(objective ='binary:logistic',use_label_encoder=False,random_state=1234), param_grid=tuned_parameters, cv=10,scoring='%s' % score,n_jobs= -1)
clf_XGB.fit(X_train, y_train)
    
print("Millors paràmetres trobats:")
print()
print(clf_XGB.best_params_)
print()
print("Puntuacions de les combinacions d'hiper-paràmetres:")
print()
means = clf_XGB.cv_results_['mean_test_score']
stds = clf_XGB.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf_XGB.cv_results_['params']):
     print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

# Afinació dels hiper-paràmetres de f1

Millors paràmetres trobats:

{'booster': 'gblinear'}

Puntuacions de les combinacions d'hiper-paràmetres:

0.952 (+/-0.010) for {'booster': 'gbtree'}
0.953 (+/-0.009) for {'booster': 'gblinear'}
0.952 (+/-0.010) for {'booster': 'dart'}


<span style='color:blue;font-size:15px'> La millor combinació de paràmetres del model de XGBoost que optimitza f1 és booster=gblinear. </span>

## Exercici 4
__Compara el seu rendiment utilitzant l’aproximació traint/test o utilitzant totes les dades (validació interna)__

<span style='color:blue;font-size:18px'> <b> Model 1: Regressió Logística </b> </span>

In [61]:
pred_test_RL = clf_RL.predict(X_test)
print(classification_report(y_test, pred_test_RL))
print("Accuracy score: ", metrics.accuracy_score(y_test, pred_test_RL))
print("AUC: ", metrics.roc_auc_score(y_test, pred_test_RL))

              precision    recall  f1-score   support

           0       0.68      0.35      0.47       212
           1       0.93      0.98      0.95      1788

    accuracy                           0.91      2000
   macro avg       0.80      0.67      0.71      2000
weighted avg       0.90      0.91      0.90      2000

Accuracy score:  0.914
AUC:  0.667099320417036


<span style='color:blue;font-size:18px'> <b> Model 2: Xarxes Neuronals </b> </span>

In [62]:
pred_test_XN = clf_XN.predict(X_test_2)
print(classification_report(y_test, pred_test_XN))
print("Accuracy score: ", metrics.accuracy_score(y_test, pred_test_XN))
print("AUC: ", metrics.roc_auc_score(y_test, pred_test_XN))

              precision    recall  f1-score   support

           0       0.67      0.37      0.48       212
           1       0.93      0.98      0.95      1788

    accuracy                           0.91      2000
   macro avg       0.80      0.67      0.71      2000
weighted avg       0.90      0.91      0.90      2000

Accuracy score:  0.914
AUC:  0.6733358659406525


<span style='color:blue;font-size:18px'> <b> Model 3: XGBoost </b> </span>

In [63]:
pred_test_XGB = clf_XGB.predict(X_test)
print(classification_report(y_test, pred_test_XGB))
print("Accuracy score: ", metrics.accuracy_score(y_test, pred_test_XGB))
print("AUC: ", metrics.roc_auc_score(y_test, pred_test_XGB))

              precision    recall  f1-score   support

           0       0.68      0.29      0.41       212
           1       0.92      0.98      0.95      1788

    accuracy                           0.91      2000
   macro avg       0.80      0.64      0.68      2000
weighted avg       0.90      0.91      0.89      2000

Accuracy score:  0.9105
AUC:  0.6381167954075387


<span style='color:blue;font-size:15px'> 
    <p> El model amb més Accuracy és el de Regressió Logística i Xarxes neuronals </p> 
    <p> El model amb més AUC és el de Xarxes neuronals </p>
</span>

## Exercici 5
__Realitza algun procés d’enginyeria de variables per millorar-ne la predicció.__

<span style='color:blue;font-size:15px'> <b> Aquest exercici s'ha realitzat a l'apartat "Creació de variables", abans de modelitzar. </b> </span>

<span style='color:blue;font-size:15px'> S'ha creat una nova variable 'VelMitja' que és una combinació de 'Distance' i 'AirTime' (dues variables molt correlaciones). </span>

<span style='color:blue;font-size:15px'> S'ha creat la variable 'Trimestre', que agrupa els mesos del any </span>

<span style='color:blue;font-size:15px'> S'han creat les variables dummy per 'Trimestre' i 'UniqueCarrier' </span>

<span style='color:blue;font-size:15px'> Per a Xarxes Neuronals s'ha aplicat una escala MinMax a les variables: (Valor - Mínim) / (Máxim - Mínim), que transforma els valors en nombres entre 0 i 1 aproximadament. Aquesta transformació millora el rendiment del model </span>

In [64]:
X_train_2

array([[0.03030303, 0.0375    , 0.06343284, ..., 0.        , 0.        ,
        0.        ],
       [0.15656566, 0.05416667, 0.03731343, ..., 0.        , 0.        ,
        0.        ],
       [0.01515152, 0.03333333, 0.02985075, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.01767677, 0.02916667, 0.05223881, ..., 0.        , 0.        ,
        0.        ],
       [0.02020202, 0.00833333, 0.02238806, ..., 1.        , 0.        ,
        0.        ],
       [0.01767677, 0.025     , 0.07462687, ..., 1.        , 0.        ,
        0.        ]])

## Exercici 6
__No utilitzis la variable DepDelay a l’hora de fer prediccions.__

In [65]:
# S'elimina la variable DepDelay de les mostres d'entrenament i prova
X_train_DD = X_train.drop('DepDelay',axis=1)
X_test_DD = X_test.drop('DepDelay',axis=1)

<span style='color:blue;font-size:18px'> <b> Model 1: Regressió Logística </b> </span>

In [66]:
# Selecció de diversos paràmetres i valors
tuned_parameters = {
    'penalty': ['l2', 'none'],
    'C': [0.5,1.0,1.5],
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
}

# Selecció de la mètrica a optimitzar
#scores = ['precision', 'recall', 'f1']
#scores = [ 'f1_weighted','f1_micro','f1_macro','accuracy','roc_auc']
score='f1'

print("# Afinació dels hiper-paràmetres de %s" % score)
print()
clf_RL_DD = GridSearchCV(estimator= LogisticRegression(max_iter=1000), param_grid=tuned_parameters, cv=10,scoring='%s' % score,n_jobs= -1)
clf_RL_DD.fit(X_train_DD, y_train)
    
print("Millors paràmetres trobats:")
print()
print(clf_RL_DD.best_params_)
print()
print("Puntuacions de les combinacions d'hiper-paràmetres:")
print()
means = clf_RL_DD.cv_results_['mean_test_score']
stds = clf_RL_DD.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf_RL_DD.cv_results_['params']):
     print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

# Afinació dels hiper-paràmetres de f1

Millors paràmetres trobats:

{'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}

Puntuacions de les combinacions d'hiper-paràmetres:

0.945 (+/-0.003) for {'C': 0.5, 'penalty': 'l2', 'solver': 'newton-cg'}
0.945 (+/-0.003) for {'C': 0.5, 'penalty': 'l2', 'solver': 'lbfgs'}
0.945 (+/-0.000) for {'C': 0.5, 'penalty': 'l2', 'solver': 'sag'}
0.945 (+/-0.000) for {'C': 0.5, 'penalty': 'l2', 'solver': 'saga'}
0.945 (+/-0.003) for {'C': 0.5, 'penalty': 'none', 'solver': 'newton-cg'}
0.945 (+/-0.003) for {'C': 0.5, 'penalty': 'none', 'solver': 'lbfgs'}
0.945 (+/-0.000) for {'C': 0.5, 'penalty': 'none', 'solver': 'sag'}
0.945 (+/-0.000) for {'C': 0.5, 'penalty': 'none', 'solver': 'saga'}
0.945 (+/-0.003) for {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.945 (+/-0.003) for {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.945 (+/-0.000) for {'C': 1.0, 'penalty': 'l2', 'solver': 'sag'}
0.945 (+/-0.000) for {'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}
0.9

In [67]:
# Rendiment del model 1
pred_test_RL_DD = clf_RL_DD.predict(X_test_DD)
print(classification_report(y_test, pred_test_RL_DD))
print("Accuracy score: ", metrics.accuracy_score(y_test, pred_test_RL_DD))
print("AUC: ", metrics.roc_auc_score(y_test, pred_test_RL_DD))

              precision    recall  f1-score   support

           0       0.80      0.02      0.04       212
           1       0.90      1.00      0.94      1788

    accuracy                           0.90      2000
   macro avg       0.85      0.51      0.49      2000
weighted avg       0.89      0.90      0.85      2000

Accuracy score:  0.8955
AUC:  0.5091543202059854


<span style='color:blue;font-size:18px'> <b> Model 2: Xarxes Neuronals </b> </span>

In [68]:
# S'elimina la variable DepDelay de la mostra d'entrenament i de prova
X_train_2_DD = np.delete(X_train_2,0,axis=1)
X_test_2_DD = np.delete(X_test_2,0,axis=1)

In [69]:
# Selecció de diversos paràmetres i valors
tuned_parameters = {
    'hidden_layer_sizes': [[8,8],[8,8,8],[8,8,8,8]],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
}

# Selecció de la mètrica a optimitzar
#scores = ['precision', 'recall', 'f1']
#scores = [ 'f1_weighted','f1_micro','f1_macro','accuracy','roc_auc']
score='f1'

print("# Afinació dels hiper-paràmetres de %s" % score)
print()
clf_XN_DD = GridSearchCV(estimator= MLPClassifier(max_iter=500,random_state=1234), param_grid=tuned_parameters, cv=10,scoring='%s' % score,n_jobs= -1)
clf_XN_DD.fit(X_train_2_DD, y_train)
    
print("Millors paràmetres trobats:")
print()
print(clf_XN_DD.best_params_)
print()
print("Puntuacions de les combinacions d'hiper-paràmetres:")
print()
means = clf_XN_DD.cv_results_['mean_test_score']
stds = clf_XN_DD.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf_XN_DD.cv_results_['params']):
     print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

# Afinació dels hiper-paràmetres de f1

Millors paràmetres trobats:

{'activation': 'logistic', 'hidden_layer_sizes': [8, 8], 'solver': 'adam'}

Puntuacions de les combinacions d'hiper-paràmetres:

0.945 (+/-0.003) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8], 'solver': 'lbfgs'}
0.945 (+/-0.000) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8], 'solver': 'sgd'}
0.945 (+/-0.003) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8], 'solver': 'adam'}
0.945 (+/-0.003) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8, 8], 'solver': 'lbfgs'}
0.945 (+/-0.000) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8, 8], 'solver': 'sgd'}
0.945 (+/-0.002) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8, 8], 'solver': 'adam'}
0.945 (+/-0.003) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8, 8, 8], 'solver': 'lbfgs'}
0.945 (+/-0.000) for {'activation': 'identity', 'hidden_layer_sizes': [8, 8, 8, 8], 'solver': 'sgd'}
0.945 (+/-0.0

In [70]:
# Rendiment del model 2
pred_test_XN_DD = clf_XN_DD.predict(X_test_2_DD)
print(classification_report(y_test, pred_test_XN_DD))
print("Accuracy score: ", metrics.accuracy_score(y_test, pred_test_XN_DD))
print("AUC: ", metrics.roc_auc_score(y_test, pred_test_XN_DD))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       212
           1       0.89      1.00      0.94      1788

    accuracy                           0.89      2000
   macro avg       0.45      0.50      0.47      2000
weighted avg       0.80      0.89      0.84      2000

Accuracy score:  0.894
AUC:  0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<span style='color:blue;font-size:18px'> <b> Model 3: XGBoost </b> </span>

In [71]:
# Selecció de diversos paràmetres i valors
tuned_parameters = {
    'booster': ['gbtree', 'gblinear', 'dart'],
}

# Selecció de la mètrica a optimitzar
#scores = ['precision', 'recall', 'f1']
#scores = [ 'f1_weighted','f1_micro','f1_macro','accuracy','roc_auc']
score='f1'

print("# Afinació dels hiper-paràmetres de %s" % score)
print()
clf_XGB_DD = GridSearchCV(estimator= xgb.XGBClassifier(objective ='binary:logistic',use_label_encoder=False,random_state=1234), param_grid=tuned_parameters, cv=10,scoring='%s' % score,n_jobs= -1)
clf_XGB_DD.fit(X_train_DD, y_train)
    
print("Millors paràmetres trobats:")
print()
print(clf_XGB_DD.best_params_)
print()
print("Puntuacions de les combinacions d'hiper-paràmetres:")
print()
means = clf_XGB_DD.cv_results_['mean_test_score']
stds = clf_XGB_DD.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf_XGB_DD.cv_results_['params']):
     print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

# Afinació dels hiper-paràmetres de f1

Millors paràmetres trobats:

{'booster': 'gblinear'}

Puntuacions de les combinacions d'hiper-paràmetres:

0.939 (+/-0.007) for {'booster': 'gbtree'}
0.945 (+/-0.001) for {'booster': 'gblinear'}
0.939 (+/-0.007) for {'booster': 'dart'}


In [72]:
# Rendiment del model 3
pred_test_XGB_DD = clf_XGB_DD.predict(X_test_DD)
print(classification_report(y_test, pred_test_XGB_DD))
print("Accuracy score: ", metrics.accuracy_score(y_test, pred_test_XGB_DD))
print("AUC: ", metrics.roc_auc_score(y_test, pred_test_XGB_DD))

              precision    recall  f1-score   support

           0       1.00      0.01      0.02       212
           1       0.89      1.00      0.94      1788

    accuracy                           0.90      2000
   macro avg       0.95      0.50      0.48      2000
weighted avg       0.91      0.90      0.85      2000

Accuracy score:  0.895
AUC:  0.5047169811320755


<span style='color:blue;font-size:15px'> <b> El rendiment dels tres model ha empitjorat. </b> </span>

<span style='color:blue;font-size:15px'> El model de Xarxes Neuronals ha considertat totes les prediccions com el valor de la classe majoritaria (1) (Null Accuracy) </span>

In [73]:
pred_test_XN_DD.sum()

2000