# Data Mining Project: Flights delays prediction 

The purpose of the task is:

> **Problem statement**: Given information on US commercial flights in January 2020, predict whether a flight is delayed, where a delay means that the flight arrives at its destination at least 15 minutes after its scheduled arrival time.

---

## Import and settings

In [None]:
from pandas import DataFrame, Series
import numpy as np
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

def describe(a):
    """Print a short, type-aware summary of ``a``.

    NumPy arrays, pandas Series and pandas DataFrames are printed together
    with their shape and type; arrays and Series also show their dtype, and a
    Series additionally shows its name and index details. Any other object is
    printed followed by its type.

    Parameters
    ----------
    a : object
        The value to summarise.
    """
    # isinstance() instead of ``type(a) is ...`` so that subclasses of the
    # three container types are summarised like their base type.
    if isinstance(a, np.ndarray):
        print("data:\n{}\nshape:{}\ndtype:{}\ntype: {}".format(a, a.shape, a.dtype, type(a)))
    elif isinstance(a, pd.Series):
        print("data:\n{}\nshape:{}\ndtype:{}\nname:{}\nindex-name:{}\nindex-type:{}\ntype:{}".format(a, a.shape, a.dtype, a.name, a.index.name,type(a.index), type(a)))
    elif isinstance(a, pd.DataFrame):
        print("data:\n{}\nshape:{}\ntype:{}".format(a, a.shape,type(a)))
    else:
        print("{}, type:{}".format(a, type(a)))

### Datasets loading

- **Jan_2020_ontime**
- **airports_data**
- **airlines_data**

**Jan_2020_ontime** is composed of 607346 rows and 21 columns.

**airports_data** and **airlines_data** are composed of 342 rows and 7 columns and of 20 rows and 2 columns, respectively.


In [None]:
# Load the three CSV inputs from the local "dataset" directory.
dataset_1 = pd.read_csv("dataset/Jan_2020_ontime.csv")
airports_data = pd.read_csv("dataset/airports.csv")
airlines_data = pd.read_csv("dataset/airlines.csv")

# Print the column names, dtypes and non-null counts of each frame.
dataset_1.info()
airports_data.info()
airlines_data.info()   

Originally **airports_data** contained some null values: specifically, only 4 rows had null values in **CITY** and **STATE**.

![](images/update%20airports.png).

The problem was resolved by manually filling in the missing information.

---

In [None]:
delays_data = dataset_1.copy()

In [None]:
delays_data.describe()

## Data cleaning

### Cleaning

A first exploration of **delays_data** revealed that the last column is useless, so it will be deleted. Furthermore, there are 36 airports that appear in **delays_data** but not in **airports_data**; the IATA codes of the missing airports are listed below.

    'ECP', 'SWO', 'HVN', 'HHH', 'STS', 'PBG', 'IAG', 'LBE', 'XWA', 'SLN', 'CKB', 'OGS', 'LBL', 'EAR', 'UIN', 'LBF', 'BFF', 'CGI', 'ATY', 'SCK', 'PRC', 'PAE', 'PGD', 'AZA', 'PVU', 'SFB', 'USA', 'BLV', 'LCK', 'HGR', 'BFM', 'PSM', 'OWB', 'OGD', 'SHR', 'RIW'.


Since these airports cannot be plotted on a map because data such as LATITUDE and LONGITUDE are missing, it has been decided to delete the flights that involve them.


In [None]:
def data_cleaning(delays_data, airports=None):
    """Drop, in place, every flight whose origin or destination airport is unknown.

    An airport is unknown when its code does not occur in the ``IATA`` column
    of the airports table. The unknown codes are printed before and after the
    clean-up (the second report is expected to be empty), followed by
    ``delays_data.info()``.

    Parameters
    ----------
    delays_data : pandas.DataFrame
        Flights table with ``ORIGIN`` and ``DEST`` columns. It is modified in
        place and its index is rebuilt as 0..n-1 afterwards.
    airports : pandas.DataFrame, optional
        Airports table with an ``IATA`` column. Defaults to the notebook-level
        ``airports_data`` frame.

    Returns
    -------
    None
    """
    if airports is None:
        airports = airports_data
    known_codes = set(airports["IATA"])

    def _report_missing():
        # Codes are listed in first-seen order: origins first, then the
        # destinations that did not already appear as an origin.
        seen = pd.unique(
            pd.concat([delays_data["ORIGIN"], delays_data["DEST"]], ignore_index=True)
        )
        missing = [code for code in seen if code not in known_codes]
        print("Total of missing airports:", len(missing))
        print("Details:\n", missing)
        return missing

    missing_airports = _report_missing()

    # A boolean mask works for any index and for airports that occur only
    # once, only as an origin or only as a destination (label lookups on a
    # helper Series raised KeyError / AttributeError in those cases).
    affected = (
        delays_data["ORIGIN"].isin(missing_airports)
        | delays_data["DEST"].isin(missing_airports)
    )
    delays_data.drop(index=delays_data.index[affected], inplace=True)
    delays_data.reset_index(drop=True, inplace=True)

    # Second report: shows that no unknown airport is left.
    _report_missing()
    delays_data.info()
    print("-------------end data_cleaning\n")
    
#it prove that missing airports no longer contained in cleaned data
data_cleaning(delays_data)

### Dimensionality reduction
In order to decrease the dimension of dataset, it has been decided to delete the following columns:

    1. DEST_AIRPORT_ID
    2. ORIGIN_AIRPORT_ID
    3. OP_CARRIER_AIRLINE_ID
    4. OP_UNIQUE_CARRIER
    5. ORIGIN_AIRPORT_SEQ_ID
    6. DEST_AIRPORT_SEQ_ID

because they encode redundant information.
 
More specifically:
1. **OP_UNIQUE_CARRIER** and **OP_CARRIER_AIRLINE_ID** are repetitions of **OP_CARRIER**,
2. **ORIGIN_AIRPORT_SEQ_ID** and **ORIGIN_AIRPORT_ID** are repetitions of **ORIGIN**,
3. **DEST_AIRPORT_SEQ_ID** and **DEST_AIRPORT_ID** are repetitions of **DEST**.

Furthermore, as mentioned before, the **Unnamed: 21** column has been deleted too.

In [None]:
def remove_useless_attributes(delays_data):
    """Drop the redundant identifier columns from ``delays_data`` in place.

    ``delays_data.info()`` is printed before and after the removal. The
    function returns ``None``.
    """
    redundant_columns = [
        "DEST_AIRPORT_ID",
        "ORIGIN_AIRPORT_ID",
        "OP_CARRIER_AIRLINE_ID",
        "OP_UNIQUE_CARRIER",
        "ORIGIN_AIRPORT_SEQ_ID",
        "DEST_AIRPORT_SEQ_ID",
        "Unnamed: 21",
    ]

    print("before")
    delays_data.info()

    delays_data.drop(labels=redundant_columns, axis="columns", inplace=True)

    print("after")
    delays_data.info()
    print("------------end remove_useless_attributs \n")
remove_useless_attributes(delays_data)

In [None]:
delays_data[delays_data.DIVERTED==1]

Notice that only 1106 flights out of 598841 have the **DIVERTED** status.

### Null values

These attributes contain null values:

    1. TAIL_NUM    (type object)
    2. DEP_TIME    (type float64)
    3. DEP_DEL15   (type float64)
    4. ARR_TIME    (type float64)
    5. ARR_DEL15   (type float64)

It has been decided to delete all records that contain a null value in at least one column. This choice has been made considering that the usual automatic imputation techniques for null values (e.g. replacing them with the mean) cannot be applied to this particular problem. Moreover, the size of the dataset allows the deletion of rows without loss of generality.

Finally, it should be noted that eliminating all rows with null values in the **ARR_TIME** and **ARR_DEL15** columns entails the total loss of the information about flight cancellation, as all cancelled flights have null values in those columns. For this reason the **CANCELLED** feature is deleted. This elimination does not cause any problem: cancelled flights never took off, so a delay is not definable for these records.

In [None]:
def remove_null(delays_data):
    """Remove, in place, the rows that have a null in any of the key columns.

    The key columns are TAIL_NUM, DEP_TIME, DEP_DEL15, ARR_TIME and ARR_DEL15.
    The index is rebuilt afterwards and ``delays_data.info()`` is printed
    before and after the removal. The function returns ``None``.
    """
    required_columns = ["TAIL_NUM", "DEP_TIME", "DEP_DEL15", "ARR_TIME", "ARR_DEL15"]

    print("before")
    delays_data.info()

    delays_data.dropna(axis=0, how="any", subset=required_columns, inplace=True)

    # reset_index() stores the old index in an "index" column; discard it.
    delays_data.reset_index(inplace=True)
    del delays_data["index"]

    print("after")
    delays_data.info()
    print("-----------end remove_null\n")    
remove_null(delays_data)

In [None]:
delays_data[delays_data.DIVERTED==1]

As regards the **DIVERTED** attribute (diverted flight), the elimination of rows containing null values in the columns listed above brings the number of flights with this status to zero.

For this reason, in addition to the **CANCELLED** attribute, **DIVERTED** is also deleted.

In [None]:
# Drop the CANCELLED and DIVERTED columns in place.
delays_data.drop(columns=["CANCELLED","DIVERTED"],inplace=True)
# Rebuild the index; reset_index() stores the old index in an "index" column,
# which is removed again on the next line.
delays_data.reset_index(inplace=True)
delays_data.drop(columns = "index",inplace=True)  

### Time preprocessing

Define block structure for arrival time, such as **DEP_TIME_BLK**.

In [None]:
def arr_time(x):
    """Return the label of the arrival-time block that contains ``x``.

    ``x`` is a clock value written as an hhmm number (e.g. ``1430``). Blocks
    are one hour wide (minutes 00-59), except ``'0001-0559'`` and
    ``'2300-2400'``. ``None`` is returned when ``x`` lies in no block, for
    example ``0`` or ``660``.
    """
    # (lowest value, highest value, label); the ranges do not overlap.
    blocks = (
        (1, 559, '0001-0559'),
        (600, 659, '0600-0659'),
        (700, 759, '0700-0759'),
        (800, 859, '0800-0859'),
        (900, 959, '0900-0959'),
        (1000, 1059, '1000-1059'),
        (1100, 1159, '1100-1159'),
        (1200, 1259, '1200-1259'),
        (1300, 1359, '1300-1359'),
        (1400, 1459, '1400-1459'),
        (1500, 1559, '1500-1559'),
        (1600, 1659, '1600-1659'),
        (1700, 1759, '1700-1759'),
        (1800, 1859, '1800-1859'),
        (1900, 1959, '1900-1959'),
        (2000, 2059, '2000-2059'),
        (2100, 2159, '2100-2159'),
        (2200, 2259, '2200-2259'),
        (2300, 2400, '2300-2400'),
    )
    for lowest, highest, label in blocks:
        if lowest <= x <= highest:
            return label
    return None

In [None]:
# Label every flight with its arrival-time block: ARR_TIME is cast to int and mapped through arr_time().
delays_data['ARR_TIME_BLK'] = delays_data["ARR_TIME"].astype(int).apply(lambda x :arr_time(x))

The exploratory analysis shows that there are numerical values that cannot be interpreted as times, such as the one shown below.

In [None]:
# Cast DEP_TIME to text so that the values can be inspected by string length.
delays_data['DEP_TIME'] = delays_data['DEP_TIME'].astype(str)
array_dep = delays_data['DEP_TIME'].to_numpy()

# Print the first departure time whose text is 2, 3 or 4 characters long, then stop.
for s in array_dep:
    if(len(s)==2):
        print(s)
        break
    if(len(s)==3):
        print(s)
        break
    if(len(s)==4):
        print(s)
        break

In [None]:
def remove_useless_times(delays_data):
    """Drop, in place, the rows whose DEP_TIME or ARR_TIME text is 2-4 characters long.

    Both columns are first converted to strings (and stay strings). Rows are
    selected by position and removed using that position as index label, so
    the frame is expected to carry a 0..n-1 index. The index is rebuilt
    afterwards. The function returns ``None``.
    """
    short_lengths = [2, 3, 4]
    flagged = np.zeros(len(delays_data), dtype=bool)

    for column in ('DEP_TIME', 'ARR_TIME'):
        delays_data[column] = delays_data[column].astype(str)
        text_lengths = delays_data[column].map(len)
        flagged |= text_lengths.isin(short_lengths).to_numpy()

    # Row positions are used as the labels to drop.
    positions = np.flatnonzero(flagged).tolist()
    delays_data.drop(positions, inplace=True)

    # reset_index() stores the old index in an "index" column; discard it.
    delays_data.reset_index(inplace=True)
    del delays_data['index']
remove_useless_times(delays_data)

In [None]:
# Sanity check: count the DEP_TIME and ARR_TIME strings that are 3 or 4
# characters long (length 2 is not counted here) and print the total.
array = delays_data['DEP_TIME'].to_numpy()
array2 = delays_data['ARR_TIME'].to_numpy()

i = 0
# Departure times.
for s in array:
    if(len(s)==3):
        i+=1
        #print(s)    
    if(len(s)==4):
        i+=1
        #print(s)

# Arrival times, accumulated into the same counter.
for s in array2:
    if(len(s)==3):
        i+=1
        #print(s)    
    if(len(s)==4):
        i+=1
        #print(s) 
print(i)

In [None]:
def time_preprocessing(time):
    """Convert an hhmm clock value given as text into an ``'HH:MM'`` string.

    Parameters
    ----------
    time : str
        Text form of an hhmm number, with or without a trailing ``'.0'``
        (for example ``'934.0'``, ``'1234.0'`` or ``'1234'``).

    Returns
    -------
    str
        Zero-padded ``'HH:MM'``. Hour 24 is written as ``00`` so that the
        result can be parsed with the ``%H:%M`` format.

    Raises
    ------
    ValueError
        If ``time`` is not the text of a finite number.
    """
    # Parsing the number (instead of slicing by string length) also handles
    # values with fewer than three digits, e.g. '5.0' -> '00:05'.
    hours, minutes = divmod(int(float(time)), 100)
    if hours == 24:
        hours = 0
    return "{:02d}:{:02d}".format(hours, minutes)

def refactor(time):
    """Return ``time`` with the character at index 4 replaced by ``'0'``.

    For an ``'HH:MM:SS'`` string this zeroes the units digit of the minutes
    (``'01:15:00'`` becomes ``'01:10:00'``). A string shorter than five
    characters simply gets ``'0'`` appended.
    """
    head, tail = time[:4], time[5:]
    return '0'.join((head, tail))

from datetime import datetime
def convert(time):
    """Parse an ``'HH:MM'`` string into a ``datetime`` object.

    Raises ``ValueError`` (from ``strptime``) when ``time`` does not match
    the ``%H:%M`` format.
    """
    # Imported locally under a private name: a later cell runs
    # ``import datetime``, which rebinds the global name ``datetime`` to the
    # module and would make a global ``datetime.strptime`` lookup fail.
    from datetime import datetime as _datetime
    return _datetime.strptime(time,"%H:%M")

We continue with a pre-processing of the **ARR_TIME** and **DEP_TIME** columns, in order to give these features the format:
    **%H:%M**.

Subsequently, in order to generate a new feature representing the flight duration, the values of the aforementioned columns are converted into the datetime format so that the difference between the arrival time and the departure time can be calculated.
The new attribute (**DUR**) then undergoes a pre-processing phase that removes some non-useful data, such as the days elapsed between departure and arrival. This information is deleted because all flights certainly last less than 24 hours.
Note that, in order to avoid generating overly specific and hardly reusable durations, it was decided to discard the units digit of the minutes. For example, a flight with duration **01:15** will have a new duration equal to **01:10**.

In [None]:
# Rewrite both time columns as 'HH:MM' text.
delays_data['ARR_TIME'] = delays_data["ARR_TIME"].astype(str).apply(lambda x : time_preprocessing(x))
delays_data['DEP_TIME'] = delays_data["DEP_TIME"].astype(str).apply(lambda x : time_preprocessing(x))

# Parse the 'HH:MM' text into datetime objects.
delays_data['ARR_TIME'] = delays_data['ARR_TIME'].apply(lambda x : convert(x))
delays_data['DEP_TIME'] = delays_data['DEP_TIME'].apply(lambda x : convert(x))

# Duration = arrival time minus departure time.
delays_data["DUR"] = (delays_data["ARR_TIME"]-delays_data["DEP_TIME"])
# Keep the text from character 7 onwards of the difference's string form
# (presumably this strips a leading "0 days " prefix — confirm against the pandas version in use).
delays_data['DUR'] = delays_data['DUR'].astype(str).map(lambda x : x[7:])
# refactor() overwrites the character at index 4 with '0'.
delays_data['DUR'] = delays_data['DUR'].apply(lambda x : refactor(x))

# Collect the positions of the durations whose text contains '+'.
# NOTE(review): presumably these are negative differences (arrival clock time
# earlier than the departure clock time) — confirm.
dur = delays_data['DUR'].to_numpy()
i = 0
ret = []
for s in dur:
    if('+' in s):
        ret.append(i)
    i+=1

# Drop those rows (positions are used as index labels) and rebuild the index.
delays_data.drop(ret,inplace=True)
delays_data.reset_index(inplace=True)
delays_data.drop(columns='index',inplace=True)
# Number of distinct duration values.
delays_data['DUR'].nunique()

As evidenced by the output (**65**), the new attribute has only 65 unique values out of a total of more than 500,000 rows.
Note that the choice to minimize the number of distinct values is due to the application of the **OneHotEncoder** in the **Train and Test set generation** section: this encoding technique generates a new feature for each distinct value present in each encoded column, so it could expose the problem to the curse of dimensionality if the number of features generated by the encoding were high.

In [None]:
delays_data.head()

---

## Feature selection

A step prior to generating Train and Test sets is the process of identifying core attributes.

To perform this analysis, the dataset is transformed by a label encoder, so using a random forest classifier we will get some information that could guide us on how to choose the main attributes.

In [None]:
delays_data[delays_data.DEP_DEL15!=delays_data.ARR_DEL15]

In [None]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): Encoder_df is not used in this cell; the apply() below builds its own encoder.
Encoder_df = LabelEncoder() 
# Work on a copy so that delays_data itself is not label-encoded.
feature_importance_df = delays_data.copy()

# Replace every column with integer codes: fit_transform of one LabelEncoder
# instance is applied to each column in turn.
feature_importance_df = feature_importance_df.apply(LabelEncoder().fit_transform)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Split the encoded frame into predictors (x) and the ARR_DEL15 target (y).
feature_importance_df_x = feature_importance_df.drop('ARR_DEL15', axis = 1)
feature_importance_df_y = feature_importance_df['ARR_DEL15']

# Fit a 40-tree random forest regressor on all rows and read its feature importances.
rnd_reg = RandomForestRegressor(n_estimators=40, n_jobs=-1, random_state=40)
rnd_reg.fit(feature_importance_df_x, feature_importance_df_y)
attributes = feature_importance_df_x.columns
importances = rnd_reg.feature_importances_
# Positions of the attributes sorted by ascending importance.
index = np.argsort(importances)

# Horizontal bar chart, one bar per attribute, in ascending order of importance.
plt.figure(figsize=(20,15))
plt.title("Attribute importance")
p = plt.barh(range(len(index)), importances[index], color='r', align='center')
plt.yticks(range(len(index)), attributes[index])
plt.xlabel("Relative importance")
plt.show()

Note that the fraction of records that have different values in the **DEP_DEL15** and **ARR_DEL15** attributes is 0.06%, which means that the class attribute (**ARR_DEL15**) is strongly correlated with the **DEP_DEL15** attribute. In order to avoid training the model incorrectly by giving excessive weight to the **DEP_DEL15** attribute, it was decided to delete this column because, after the data cleaning, almost every flight with a delayed departure reaches its destination late.

It would be incorrect to train the model by taking this information strongly into consideration as the delay is affected by a multiplicity of variables such as meteorological conditions that are not taken into consideration in solving this specific problem.

The feature importance step is repeated in order to verify the stability of the results previously obtained.

In [None]:
delays_data.drop(columns = "DEP_DEL15",inplace=True)

In [None]:
delays_data.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): Encoder_df is not used in this cell; the apply() below builds its own encoder.
Encoder_df = LabelEncoder() 
# Re-encode a fresh copy of delays_data (DEP_DEL15 has been dropped in a previous cell).
feature_importance_df = delays_data.copy()

# Replace every column with integer codes: fit_transform of one LabelEncoder
# instance is applied to each column in turn.
feature_importance_df = feature_importance_df.apply(LabelEncoder().fit_transform)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Split the encoded frame into predictors (x) and the ARR_DEL15 target (y).
feature_importance_df_x = feature_importance_df.drop('ARR_DEL15', axis = 1)
feature_importance_df_y = feature_importance_df['ARR_DEL15']

# Same model and seed as the first feature-importance run, so the two charts are comparable.
rnd_reg = RandomForestRegressor(n_estimators=40, n_jobs=-1, random_state=40)
rnd_reg.fit(feature_importance_df_x, feature_importance_df_y)
attributes = feature_importance_df_x.columns
importances = rnd_reg.feature_importances_
# Positions of the attributes sorted by ascending importance.
index = np.argsort(importances)

# Horizontal bar chart, one bar per attribute, in ascending order of importance.
plt.figure(figsize=(20,15))
plt.title("Attribute importance")
p = plt.barh(range(len(index)), importances[index], color='r', align='center')
plt.yticks(range(len(index)), attributes[index])
plt.xlabel("Relative importance")
plt.show()

In [None]:
delays_data.info()

---

## Correlation

The analysis of the relations of the attributes is of fundamental importance to select a set of attributes representing the maximum possible information. To do this, statistical measures such as correlation are used.

Correlation is a measure that expresses the relationship between two variables and is widely used to describe simple relationships since it does not depend on a cause-effect relationship but on the tendency of one variable to change as a function of another.

Note how the correlation measure depends on the type of features that characterize the dataset, in other words categorical attributes will employ correlation measures different from those expected for numeric attributes

#### Cramer's V

Cramer's V is a measure of the strength of the association between two categorical variables.
Its value ranges from 0 to 1 where:

    - 0 indicates no association between the two variables
    - 1 indicates a perfect association between the two variables

In [None]:
import scipy.stats as ss

def cramers_v(confusion_matrix):
    """Return Cramér's V for a two-way table of counts.

    The chi-square statistic comes from ``scipy.stats.chi2_contingency``; the
    ``phi2corr`` / ``rcorr`` / ``kcorr`` terms apply a correction that depends
    on the table size and on the total count.

    Parameters
    ----------
    confusion_matrix : numpy.ndarray
        2-D array of counts (rows x columns).

    Returns
    -------
    float
        The association strength; ``0.0`` for degenerate tables (a total
        count of at most 1, or a corrected dimension that leaves a
        non-positive denominator, e.g. a single row or a single column).
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    if n <= 1:
        # The correction terms divide by (n - 1).
        return 0.0
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        # A single-row or single-column table would otherwise give 0/0 (NaN).
        return 0.0
    return np.sqrt(phi2corr / denominator)

def corr_cramers_matrix(df_cat):
    """Build the square table of pairwise ``cramers_v`` values for ``df_cat``.

    For every ordered pair of columns the two columns are cross-tabulated and
    the resulting counts are passed to ``cramers_v``. Rows and columns of the
    returned float DataFrame are both labelled with ``df_cat.columns``.
    """
    labels = df_cat.columns
    matrix = pd.DataFrame(index=labels, dtype='float64')
    for outer in labels:
        scores = [
            cramers_v(pd.crosstab(df_cat[outer], df_cat[inner]).values)
            for inner in labels
        ]
        matrix[outer] = pd.Series(scores, index=labels, dtype='float64')
    return matrix

corr_cramer = corr_cramers_matrix(delays_data)

In [None]:
# Draw the Cramér's V table as an annotated heatmap with the colour scale capped at 1.
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(15,12))
    ax = sns.heatmap(corr_cramer,vmax=1,cmap="Blues",square=True,annot=True)

From the analysis of the correlation between the attributes, it emerges that there is a significant correlation between the following pairs:

    1. DAY_OF_MONTH and DAY_OF_WEEK,
    2. OP_CARRIER and TAIL_NUM,
    3. the departure time and its time block


---

## Attribute deletion

Due to the strong correlation between the **OP_CARRIER** and **TAIL_NUM** attribute, it was decided to delete **TAIL_NUM**.

The **OP_CARRIER_FL_NUM**, **ARR_TIME** and **DEP_TIME** features are also deleted as:
    
    1. OP_CARRIER_FL_NUM, like TAIL_NUM, does not provide meaningful information. Furthermore, being a feature that expresses a highly specific property, there would be a risk of the classifier memorizing its values instead of exploiting its generalization ability.
    
    2. ARR_TIME and DEP_TIME, are replaced by information about the arrival and departure time blocks.



In [None]:
delays_data.drop(columns=["TAIL_NUM","OP_CARRIER_FL_NUM","ARR_TIME","DEP_TIME"],inplace = True)

In [None]:
delays_data.info()

---

## Outlier detection

In [None]:
delays_data.describe()

In [None]:
delays_data.loc[delays_data["DISTANCE"]==31].head()

Note the existence of a 31-mile route from WRG airport (https://en.wikipedia.org/wiki/Wrangell_Airport) to PSG airport (https://en.wikipedia.org/wiki/Petersburg_James_A._Johnson_Airport).

Therefore, contrary to what one might think, 31 miles does not represent an outlier.



---

## Train and Test set generation

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 3% of the rows as the test set; random_state fixes the split.
train_set, test_set = train_test_split(delays_data, test_size=0.03, random_state=42)
# Rebuild the index of both subsets; reset_index() stores the old index in an
# "index" column, which is dropped right after.
train_set.reset_index(inplace=True)
test_set.reset_index(inplace=True)
train_set.drop(columns="index",inplace = True)
test_set.drop(columns="index",inplace = True)
# Display the (rows, columns) shape of both subsets.
train_set.shape, test_set.shape

A new variable is defined, **train_set_graph**, which represents a copy of the train set generated in the previous step. The definition of this variable allows us to generate graphs in the **DATA_UNDERSTANDING** section, making use of attributes that will be eliminated from the **train_set** used in the definition of the models.

In [None]:
train_set_graph = train_set.copy()

### Is the train set balanced?

In [None]:
print(len(train_set[train_set["ARR_DEL15"]==0.0]))

In [None]:
print(len(train_set[train_set["ARR_DEL15"]==1.0]))

We can eliminate the target attribute on the **train_set** and on the **test_set**, not before having saved the information relating to the aforementioned attribute in appropriate variables, such as **train_set_labels** and **test_set_labels**. This information will later be used in the model training phase.

In [None]:
# Keep the target column aside before removing it from the feature frames.
train_set_labels = np.squeeze(train_set["ARR_DEL15"])
test_set_labels = np.squeeze(test_set["ARR_DEL15"])

# Remove the target column from both frames in place.
for set_ in (train_set, test_set):
    # NOTE(review): axis=1 looks redundant next to columns= — confirm and simplify.
    set_.drop(columns = ["ARR_DEL15"], axis=1, inplace=True)

It is evident that the dataset is unbalanced as the distribution of the class attribute in the training set is not uniform. To confirm what has been said, the results obtained from the two previous prints can be observed, which express a clear prevalence of the "flight not delayed" class attribute encoded by the value 0, compared to the value 1, or "flight delayed".
Note that if the dataset was not adequately balanced, following training, classifiers would tend to prefer the majority class as they are unable to generalize.

### Balance

In order to balance the train set, we made use of the RandomUnderSampler class specifying the number of records for each value taken by the class attribute. This method refers to the UnderSampling technique which envisages reducing the number of records pertaining to the majority class.
Note that different methodologies could have been applied, such as oversampling (opposite approach to undersampling), in order to resolve the imbalance, but given the large size of the dataset it was decided to reduce the number of records of the class negative.

In [None]:
from imblearn.under_sampling import RandomUnderSampler 
# Ask for 15000 records of class 0 and 15000 of class 1; random_state fixes the sampling.
rus = RandomUnderSampler(    
      sampling_strategy={
          0: 15000,
          1: 15000
      }, random_state = 42)
    
# Resample the training features together with their labels.
train_set_res, train_set_labels_res = rus.fit_resample(train_set, train_set_labels)

# Display the shapes of the resampled features and labels.
train_set_res.shape,train_set_labels_res.shape

In [None]:
train_set_res.info()

### Columns update:

Two new categorical attributes are defined, respectively **DAY_TYPE** and **TYPE_OF_FLIGHT**

**Note that the operations carried out aim to considerably reduce the number of unique values of the features.**

Considering the high correlation between the **DAY_OF_WEEK** and **DAY_OF_MONTH** attributes, and that the **DAY_OF_MONTH** attribute alone gives clear information about the flight date, it was decided to delete **DAY_OF_WEEK** in favor of a new categorical attribute that indicates whether the day is a working day or a weekend day (working day: DAY_OF_WEEK < 5; weekend day: DAY_OF_WEEK >= 5). This choice also depends on the **OneHotEncoder**: by eliminating the **DAY_OF_WEEK** attribute, the number of columns produced by this transformation is reduced.

Due to the high specificity of the distances characterizing the flights, it was decided to transform this feature into an ordinal categorical attribute that classifies the type of flight based on the distance traveled. Specifically, the flight can be of short, medium or long type.

Furthermore, again considering the output of the **OneHotEncoder**, in order to avoid the curse of dimensionality, the origin and destination airports are replaced with the states they belong to.




In [None]:
def search_state(x):
    """Return the ``STATE`` of the first ``airports_data`` row whose ``IATA`` equals ``x``.

    ``IndexError`` is raised (by ``.iloc[0]``) when no row matches.
    """
    matching_states = airports_data.loc[airports_data["IATA"] == x, "STATE"]
    return matching_states.iloc[0]

In [None]:
from sklearn.preprocessing import FunctionTransformer
def update_columns(dataset):
    """Derive the coarse categorical features used by the encoding pipeline.

    ``dataset`` is modified in place:

    * ``DAY_TYPE`` is "Working day" when ``DAY_OF_WEEK`` < 5, otherwise
      "Week end day";
    * ``TYPE_OF_FLIGHT`` is "Short flight" below 700 miles, "Long flight"
      above 3000 miles and "Medium flight" otherwise;
    * ``ORIGIN`` and ``DEST`` are replaced by the value ``search_state``
      returns for each airport code;
    * ``DAY_OF_WEEK`` and ``DISTANCE`` are dropped.

    The resulting columns and ``info()`` are printed, and a DataFrame built
    from ``dataset`` is returned.
    """
    weekday = dataset["DAY_OF_WEEK"].astype(int)
    dataset["DAY_TYPE"] = np.where(weekday < 5, "Working day", "Week end day")

    miles = dataset["DISTANCE"].astype(float)
    dataset["TYPE_OF_FLIGHT"] = np.where(
        miles < 700,
        "Short flight",
        np.where(miles > 3000, "Long flight", "Medium flight"),
    )

    for airport_column in ("ORIGIN", "DEST"):
        dataset[airport_column] = dataset[airport_column].apply(search_state)

    dataset.drop(columns=["DAY_OF_WEEK", "DISTANCE"], inplace=True)

    print(dataset.columns)

    pd.DataFrame(dataset).info()

    print("------------end update_columns\n")

    return pd.DataFrame(dataset)
    
   

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Two steps: update_columns() rewrites the columns it receives, then the
# result is one-hot encoded (categories unseen during fit are ignored at transform time).
pipeline = Pipeline([
    ("upd_col",FunctionTransformer(update_columns,validate=False)),
    ("cat", OneHotEncoder(handle_unknown='ignore',)),
])


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns that are one-hot encoded directly.
cat_attribs = ["ARR_TIME_BLK","DEP_TIME_BLK","OP_CARRIER","DUR","DAY_OF_MONTH"]
# Columns that first go through update_columns() inside `pipeline`.
pipe_cat = ["DAY_OF_WEEK","ORIGIN","DEST","DISTANCE"]

full_pipeline = ColumnTransformer([
    ("pipe", pipeline, pipe_cat),
    ("cat", OneHotEncoder(handle_unknown='ignore',), cat_attribs),
    ])

# Fit on the resampled train set only; the test set is transformed with the fitted encoders.
train_set_prepared = full_pipeline.fit_transform(train_set_res)
test_set_prepared = full_pipeline.transform(test_set)

In [None]:
train_set_prepared,test_set_prepared

In [None]:
#from sklearn.decomposition import IncrementalPCA
#s_IPCA = IncrementalPCA(n_components=200)
#train_set_prepared = s_IPCA.fit_transform(train_set_prepared)
#test_set_prepared = s_IPCA.fit_transform(test_set_prepared)

In [None]:
#train_set_prepared.shape,test_set_prepared.shape

---

## Data understanding

Performed on the **train set**.

**Bar plot of aggregate_op per op_carrier**

In [None]:
sns.set_theme(style="whitegrid")
sns.set(rc={'figure.figsize':(11.7,8.27)})
#Some settings 
# Count the flights per OP_CARRIER and join the counts with airlines_data on its IATA_CODE column.
df_carieer = pd.DataFrame(train_set_graph['OP_CARRIER'].value_counts().reset_index().values, columns=["OP_CARRIER", "AGGREGATE_OP"]).merge(airlines_data,left_on="OP_CARRIER",right_on="IATA_CODE")
df_carieer = df_carieer.drop(columns="OP_CARRIER")
# Convert the counts to a numeric dtype before plotting.
df_carieer["AGGREGATE_OP"] = pd.to_numeric(df_carieer["AGGREGATE_OP"])
df_carieer.info()

#sns.scatterplot(x='IATA_CODE', y='AGGREGATE_OP',data=df_carieer,palette=sns.blend_palette(['blue','red'],18),hue="AIRLINE")

# One marker per airline; marker size and colour both encode the flight count.
px.scatter(df_carieer,x='AIRLINE', y='AGGREGATE_OP',size="AGGREGATE_OP",color="AGGREGATE_OP", color_continuous_scale="Dense",title="Aggregate_op per op_carrier")


In [None]:
# Number of flights per origin airport and per destination airport.
df_origin = pd.DataFrame(train_set_graph['ORIGIN'].value_counts().reset_index().values, columns=["ORIGIN", "ORIGIN_AGGREGATE"])
df_dest = pd.DataFrame(train_set_graph['DEST'].value_counts().reset_index().values, columns=["DEST", "DEST_AGGREGATE"])
#print(df_origin)
#print(df_dest)
# Match the two counts on the airport code; merge() defaults to an inner join,
# so only airports present in both tables are kept.
df_tot = df_origin.merge(df_dest,left_on="ORIGIN",right_on="DEST")
# Total traffic per airport = origin count + destination count.
df_tot['TOT_ARR_DEP'] = df_tot.loc[:,["ORIGIN_AGGREGATE","DEST_AGGREGATE"]].sum(axis=1)
# Attach the airport details (matched on IATA) and drop the two code columns used for joining.
df_tot = df_tot.merge(airports_data,left_on="ORIGIN",right_on="IATA")
df_tot.drop(columns=["DEST","ORIGIN"],inplace=True)
print(df_tot)



Note that df_origin contains one row that is not in df_dest. Without loss of generality, we consider only the flights whose origin and destination airports are both known.


**Top 50 Airports Arrival & Destination**

In [None]:
# Plot the 50 airports with the highest combined origin + destination traffic.
# df_tot keeps the row order produced by the merges, so it is sorted by
# TOT_ARR_DEP first; head(50) then takes exactly 50 rows (label slicing with
# .loc[:50] is inclusive and selected 51).
top_airports = df_tot.sort_values("TOT_ARR_DEP", ascending=False).head(50)
px.line(top_airports,x="IATA",y="TOT_ARR_DEP",markers=True,title="Top 50 airports Arrival & Destination")

**Top 50 Airports Arrival**

In [None]:
px.line(df_origin[:50],x="ORIGIN",y="ORIGIN_AGGREGATE",markers=True,title="Top 50 Airports Arrival")

**Top 50 Airports Destination**

In [None]:
px.line(df_dest[:50],x="DEST",y="DEST_AGGREGATE",markers=True,title="Top 50 Airports Destination")

**Flights per date**

In [None]:
# Build a full date from DAY_OF_MONTH by prefixing '2020-01-', then derive the weekday name.
train_set_graph['DATE'] = pd.to_datetime('2020-01-'+train_set_graph['DAY_OF_MONTH'].astype(object).apply(str))
train_set_graph['DAY_NAME'] = train_set_graph['DATE'].dt.day_name()

In [None]:
# Number of flights for every (DATE, DAY_NAME) pair.
df_date = pd.DataFrame(train_set_graph.loc[:,["DATE","DAY_NAME"]].value_counts().reset_index().values,columns=["DATE","DAY_NAME","AGGREGATE_DATE"])
# Order chronologically and convert the counts to a numeric dtype.
df_date = df_date.sort_values(by="DATE")
df_date["AGGREGATE_DATE"] = pd.to_numeric(df_date["AGGREGATE_DATE"])
print(df_date)
#sns.lineplot(data=df_date,x="DATE",y="AGGREGATE_DATE")
# One marker per date; marker size and colour both encode the flight count.
px.scatter(df_date,x="DATE",y="AGGREGATE_DATE",color="AGGREGATE_DATE",size="AGGREGATE_DATE",color_continuous_scale="Dense",title="Flights per date")

**Departures for blocks**

In [None]:
# Number of flights per departure-time block, converted to a numeric dtype for plotting.
df_dep_time_blk = pd.DataFrame(train_set_graph["DEP_TIME_BLK"].value_counts().reset_index().values,columns=["BLOCK","AGGREGATE"])
df_dep_time_blk["AGGREGATE"] = pd.to_numeric(df_dep_time_blk["AGGREGATE"])
px.bar(df_dep_time_blk,x="BLOCK",y="AGGREGATE",color="AGGREGATE",color_continuous_scale="Dense",title="Departures for blocks")
#sns.barplot(data=df_dep_time_blk,x="BLOCK",y="AGGREGATE",palette=sns.blend_palette(['blue','red'],18)).set_title("Departures for block")


**Arrivals for blocks**

In [None]:
# Number of flights per arrival-time block, converted to a numeric dtype for plotting.
df_arr_time_blk = pd.DataFrame(train_set_graph["ARR_TIME_BLK"].value_counts().reset_index().values,columns=["BLOCK","AGGREGATE"])
df_arr_time_blk["AGGREGATE"] = pd.to_numeric(df_arr_time_blk["AGGREGATE"])
px.bar(df_arr_time_blk,x="BLOCK",y="AGGREGATE",color="AGGREGATE",color_continuous_scale="Dense",title="Arrivals for blocks")
#sns.barplot(data=df_dep_time_blk,x="BLOCK",y="AGGREGATE",palette=sns.blend_palette(['blue','red'],18)).set_title("Departures for block")


**Flights per days in January**

In [None]:
# Sum the per-date counts by weekday name and sort the totals in ascending order.
df_name = pd.DataFrame(df_date.loc[:,["DAY_NAME","AGGREGATE_DATE"]])
df_name = pd.DataFrame((df_name.groupby("DAY_NAME",as_index=False)["AGGREGATE_DATE"]).sum()).sort_values("AGGREGATE_DATE")
print(df_name)
# One marker per weekday; marker size and colour both encode the total.
px.scatter(df_name,x="DAY_NAME",y="AGGREGATE_DATE",color="AGGREGATE_DATE",size="AGGREGATE_DATE",color_continuous_scale="Dense",title="Flights per date")

**Airports map**

In [None]:
# Map of the airports in df_tot, positioned by LATITUDE/LONGITUDE and
# coloured by total traffic (TOT_ARR_DEP); hovering shows name, city and state.
px.scatter_geo(
    data_frame= df_tot, 
    lat= df_tot["LATITUDE"], 
    lon=df_tot["LONGITUDE"],
    # loc
    color = df_tot["TOT_ARR_DEP"],
    hover_name="AIRPORT",
    hover_data=[
        "CITY",
        "STATE"
    ],
    scope="north america",
    title="US airports"
)

**Types of flights**

A graph is defined about the types of flights present in the dataset

        1. < 700 miles: Short flight

        2. >= 700 miles and <= 3000 miles: Medium flight

        3. > 3000 miles: Long flight

For the choice of the correct range of values, the size of the American continent and the current classifications of airlines operating in the USA were taken into account, as reported on https://en.wikipedia.org/wiki/Flight_length.

In [None]:
# First pass: below 700 miles is "Short flight", everything else "Medium flight".
train_set_graph["TYPE_OF_FLIGHT"] = np.where((train_set_graph["DISTANCE"].astype(float)<700),"Short flight","Medium flight")
# Second pass: non-short flights above 3000 miles become "Long flight".
train_set_graph["TYPE_OF_FLIGHT"] = np.where(((train_set_graph["TYPE_OF_FLIGHT"]!="Short flight")  & (train_set_graph["DISTANCE"].astype(float)>3000)),"Long flight",train_set_graph["TYPE_OF_FLIGHT"])

In [None]:
# Number of flights per flight type, printed and shown as a bar chart.
df_type_of_flight = pd.DataFrame(train_set_graph['TYPE_OF_FLIGHT'].value_counts().reset_index().values, columns=["TYPE_OF_FLIGHT", "TYPE_OF_FLIGHT_AGGREGATE"])
print(df_type_of_flight)
px.bar(df_type_of_flight,x="TYPE_OF_FLIGHT",y="TYPE_OF_FLIGHT_AGGREGATE")

---

## Select and train a model

#### Settings

In [None]:
import pickle
from sklearn.metrics import classification_report
import datetime

dt = datetime.datetime.now()
str_date_time = dt.strftime("%d_%m_%Y_%H_%M_%S")
# A fixed name makes every run append to the same results file; switch to the
# timestamped variant below to start a new file for each session.
#result_filename = "results_"+str_date_time
result_filename = 'results_2'
print(result_filename)

# Empty preview of the results table. The column names match the keys that
# upload() writes to the CSV (note the capital A in 'Balanced Accuracy').
dataResults = pd.DataFrame(columns=[
    'Classifier', 'Type of test',
    'Accuracy', 'Precision', 'Recall', 'F1',
    'Balanced Accuracy', 'Weighted Precision', 'Weighted Recall', 'Weighted F1',
])

print(dataResults.head())

def upload(classifier,type_of_test,
  accuracy, precision,recall,f1,
  balanced_accuracy, weighted_precision, weighted_recall,weighted_f1):
  """Append one row of scores to the results CSV and return the whole table.

  The table lives in ``./Result/<result_filename>.csv``. When the file is
  missing or empty a new table is started; otherwise the row is added after
  the existing ones and the file is rewritten.

  Args:
    classifier: Display name of the model (e.g. ``'Random forest'``).
    type_of_test: Label of the evaluation (e.g. ``'Train'``, ``'Test Grid'``).
    accuracy, precision, recall, f1: Plain scores.
    balanced_accuracy: Balanced accuracy score.
    weighted_precision, weighted_recall, weighted_f1: Weighted-average scores.

  Returns:
    The updated ``pandas.DataFrame``, including the new row.
  """
  results_path = "./Result/"+result_filename+".csv"

  try:
    previous = pd.read_csv(results_path)
  except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable history yet: start from an empty table.
    print('CSV empty')
    previous = pd.DataFrame()

  new_row = pd.DataFrame([{
    "Classifier" : classifier,
    "Type of test" : type_of_test,
    "Accuracy" : accuracy,
    "Precision" : precision,
    "Recall" : recall,
    "F1" : f1,
    "Balanced Accuracy" : balanced_accuracy,
    "Weighted Precision":weighted_precision,
    "Weighted Recall":weighted_recall,
    "Weighted F1":weighted_f1
    }])

  # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new
  # versions alike. A table with no columns is skipped so it cannot influence
  # the resulting dtypes.
  if previous.columns.empty:
    dataResults = new_row
  else:
    dataResults = pd.concat([previous, new_row], ignore_index=True)

  dataResults.to_csv(results_path,index=False)
  return dataResults



Note that, after each prediction, the following measures are saved:
    
    1. Accuracy,
    2. Precision,
    3. Recall,
    4. F1,

together with their balanced and weighted counterparts:

    1. Balanced Accuracy,
    2. Weighted Precision,
    3. Weighted Recall,
    4. Weighted F1


The **accuracy** metric evaluates the fraction of samples correctly labeled on the total of records present in the dataset. This metric is not adequate if the classes contain a highly different number of records, or if the dataset is unbalanced. Considering that balancing operations only make sense if applied to the train set, it was necessary to introduce another metric that takes into account the unbalancing of the test set in the evaluation of accuracy. For this reason the **balanced_accuracy_score** was introduced.

In addition to the accuracy metric, and its version for unbalanced datasets, metrics such as **Precision**, **Recall** and **F1** were introduced, respectively used for:

    1. Evaluate the ratio between the records correctly classified as positive and all the records classified as positive.
    2. Evaluate the ratio between the records correctly classified as positive and all the records that are actually positive.
    3. Evaluate the harmonic mean between precision and recall.

As discussed above for accuracy, the Precision, Recall and F1 metrics were also adapted so that they properly evaluate the scores on an unbalanced dataset (the test set).
This adaptation consists of passing the **average** parameter to each metric function, with the value **weighted**.



Note that due to the imbalance of the test set it was decided to train the classifiers with respect to the F1 metric (F-measure). Specifically, this choice is due to the desire to take into good consideration both the Precision and the Recall metrics.

In [None]:
# Show the dimensions of the prepared training data before any model is fitted.
train_set_prepared.shape

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score

def get_metrics(pred,labels):
    """Compute the plain, balanced and weighted scores for a set of predictions.

    Args:
        pred: Labels predicted by the classifier.
        labels: Ground-truth labels for the same samples.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``balanced_accuracy``, ``balanced_precision``, ``balanced_recall`` and
        ``balanced_f1`` (the last three are the ``average='weighted'`` scores).
    """
    # scikit-learn metrics take (y_true, y_pred). Passing the predictions first
    # swaps precision with recall and bases the balanced/weighted scores on the
    # predicted class counts instead of the real ones.
    ret = {
        "accuracy" : accuracy_score(labels,pred),
        "precision" : precision_score(labels,pred),
        "recall" : recall_score(labels,pred),
        "f1" : f1_score(labels,pred),
        "balanced_accuracy" : balanced_accuracy_score(labels,pred),
        "balanced_precision" : precision_score(labels,pred,average = 'weighted'),
        "balanced_recall" : recall_score(labels,pred,average = 'weighted'),
        "balanced_f1" : f1_score(labels,pred,average = 'weighted')
    }
    return ret


### 1. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
filename = './Model/rnd_clf.sav'

# Reuse the cached model when it exists; otherwise train it and cache it.
# The files are opened with `with` so the handles are closed (and the dump is
# flushed to disk) as soon as the block ends.
try:
    with open(filename, 'rb') as model_file:
        rnd_clf = pickle.load(model_file)
    print(rnd_clf)
except FileNotFoundError:
    rnd_clf = RandomForestClassifier()
    rnd_clf.fit(train_set_prepared,train_set_labels_res)
    with open(filename, 'wb') as model_file:
        pickle.dump(rnd_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the baseline random forest on the training data and append the scores
# to the results CSV. The per-class text report is disabled for this cell.
y_pred_rnd = rnd_clf.predict(train_set_prepared)
report = get_metrics(y_pred_rnd, train_set_labels_res)
score_keys = (
    'accuracy', 'precision', 'recall', 'f1',
    'balanced_accuracy', 'balanced_precision', 'balanced_recall', 'balanced_f1',
)
# The trailing semicolon keeps the notebook from echoing the returned table.
upload('Random forest', 'Train', *[report[key] for key in score_keys]);

Performance check on the Test set:

In [None]:
# Score the baseline random forest on the test set and log the results.
y_pred_rnd = rnd_clf.predict(test_set_prepared)
report = get_metrics(y_pred_rnd, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_rnd))
upload(
    'Random forest', 'Test',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)


#### Fine tuning 
The goal is to improve the performance of the classifier by tuning the parameters

##### Grid Search
It was decided to use the GridSearchCV class provided by Scikit-Learn in order to find the best configuration.

In [None]:
from sklearn.model_selection import GridSearchCV
filename = './Model/best_rnd_clf.sav'

# Reuse the cached tuned model when it exists; otherwise run the grid search
# and cache the best estimator. Files are opened with `with` so the handles
# are closed and the dump is flushed to disk.
try:
    with open(filename, 'rb') as model_file:
        best_rnd_clf = pickle.load(model_file)
    print(best_rnd_clf)
except FileNotFoundError:
    # hyperparameters
    param_grid = [
        {'n_estimators': [10, 20, 35], 'max_features': [10, 15, 20]},
    ]
    # define grid search: 5-fold cross-validation scored on F1
    best_rnd_clf = RandomForestClassifier()
    grid_search_rnd = GridSearchCV(best_rnd_clf, param_grid, cv=5,scoring='f1', return_train_score=True, n_jobs=4)
    # fit
    grid_search_rnd.fit(train_set_prepared, train_set_labels_res)

    print("Best Param:", grid_search_rnd.best_params_)
    best_rnd_clf = grid_search_rnd.best_estimator_

    # save the model to disk
    with open(filename, 'wb') as model_file:
        pickle.dump(best_rnd_clf, model_file)


Performance check on the Train set:

In [None]:
# Score the tuned random forest on the training data and log the results.
y_pred_rnd = best_rnd_clf.predict(train_set_prepared)
report = get_metrics(y_pred_rnd, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_rnd))
upload(
    'Random forest', 'Train Grid',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the tuned random forest on the test set and log the results.
y_pred_rnd = best_rnd_clf.predict(test_set_prepared)
report = get_metrics(y_pred_rnd, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_rnd))
upload(
    'Random forest', 'Test Grid',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

### 2. SGD

In [None]:
from sklearn.linear_model import SGDClassifier
filename = './Model/sgd_clf.sav'

# Reuse the cached model when it exists; otherwise train it and cache it.
# The files are opened with `with` so the handles are closed (and the dump is
# flushed to disk) as soon as the block ends.
try:
    with open(filename, 'rb') as model_file:
        sgd_clf = pickle.load(model_file)
    print(sgd_clf)
except FileNotFoundError:
    sgd_clf = SGDClassifier()
    sgd_clf.fit(train_set_prepared, train_set_labels_res)
    with open(filename, 'wb') as model_file:
        pickle.dump(sgd_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the baseline SGD classifier on the training data and log the results.
y_pred_sgd = sgd_clf.predict(train_set_prepared)
report = get_metrics(y_pred_sgd, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_sgd))
upload(
    'SGD', 'Train',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the baseline SGD classifier on the test set and log the results.
y_pred_sgd = sgd_clf.predict(test_set_prepared)
report = get_metrics(y_pred_sgd, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_sgd))
upload(
    'SGD', 'Test',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

#### Fine-Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
filename = './Model/best_sgd_clf.sav'

# Reuse the cached tuned model when it exists; otherwise run the grid search
# and cache the best estimator. Files are opened with `with` so the handles
# are closed and the dump is flushed to disk.
try:
    with open(filename, 'rb') as model_file:
        best_sgd_clf = pickle.load(model_file)
    print(best_sgd_clf)
except FileNotFoundError:
    # hyperparameters
    # NOTE(review): recent scikit-learn releases renamed the 'log' loss to
    # 'log_loss' - confirm against the installed version before re-running.
    params = {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'penalty': ['l2','elasticnet', 'l1']
    }
    # define grid search: 5-fold cross-validation scored on F1
    best_sgd_clf = SGDClassifier()
    grid_search_sgd = GridSearchCV(best_sgd_clf, param_grid = params, cv=5,scoring='f1',n_jobs=4)
    # fit
    grid_search_sgd.fit(train_set_prepared, train_set_labels_res)
    print("Best Param:", grid_search_sgd.best_params_)
    best_sgd_clf = grid_search_sgd.best_estimator_

    # save the model to disk
    with open(filename, 'wb') as model_file:
        pickle.dump(best_sgd_clf, model_file)


Performance check on the Train set:

In [None]:
# Score the tuned SGD classifier on the training data and log the results.
y_pred_sgd = best_sgd_clf.predict(train_set_prepared)
report = get_metrics(y_pred_sgd, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_sgd))
upload(
    'SGD', 'Train Grid',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the tuned SGD classifier on the test set and log the results.
y_pred_sgd = best_sgd_clf.predict(test_set_prepared)
report = get_metrics(y_pred_sgd, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_sgd))
upload(
    'SGD', 'Test Grid',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

### 3. Linear SVC

In [None]:
from sklearn.svm import LinearSVC
filename = './Model/l_svc_clf.sav'

# Reuse the cached model when it exists; otherwise train it and cache it.
# The files are opened with `with` so the handles are closed (and the dump is
# flushed to disk) as soon as the block ends.
try:
    with open(filename, 'rb') as model_file:
        l_svc_clf = pickle.load(model_file)
    print(l_svc_clf)
except FileNotFoundError:
    l_svc_clf = LinearSVC()
    l_svc_clf.fit(train_set_prepared, train_set_labels_res)
    with open(filename, 'wb') as model_file:
        pickle.dump(l_svc_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the baseline linear SVC on the training data and log the results.
y_pred_l_svc = l_svc_clf.predict(train_set_prepared)
report = get_metrics(y_pred_l_svc, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_l_svc))
upload(
    'Linear SVC', 'Train',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the baseline linear SVC on the test set and log the results.
y_pred_l_svc = l_svc_clf.predict(test_set_prepared)
report = get_metrics(y_pred_l_svc, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_l_svc))
upload(
    'Linear SVC', 'Test',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

#### Fine-tuning

In [None]:
from sklearn.model_selection import GridSearchCV
filename = './Model/best_l_svc_clf.sav'

# Reuse the cached tuned model when it exists; otherwise run the grid search
# and cache the best estimator. Files are opened with `with` so the handles
# are closed and the dump is flushed to disk.
try:
    with open(filename, 'rb') as model_file:
        best_l_svc_clf = pickle.load(model_file)
    print(best_l_svc_clf)
except FileNotFoundError:
    # hyperparameters (a single candidate: the search only cross-validates it)
    params = {
        'loss': ['squared_hinge'],
    }
    # define grid search: 5-fold cross-validation scored on F1
    best_l_svc_clf = LinearSVC(max_iter = 100,dual=False)
    grid_search_l_svc = GridSearchCV(best_l_svc_clf, param_grid = params, cv=5,scoring='f1',n_jobs=4)
    # fit
    grid_search_l_svc.fit(train_set_prepared, train_set_labels_res)
    print("Best Param:", grid_search_l_svc.best_params_)
    best_l_svc_clf = grid_search_l_svc.best_estimator_

    # save the model to disk
    with open(filename, 'wb') as model_file:
        pickle.dump(best_l_svc_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the tuned linear SVC on the training data and log the results.
y_pred_l_svc = best_l_svc_clf.predict(train_set_prepared)
report = get_metrics(y_pred_l_svc, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_l_svc))
upload(
    'Linear SVC', 'Train Grid',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the tuned linear SVC on the test set and log the results.
y_pred_l_svc = best_l_svc_clf.predict(test_set_prepared)
report = get_metrics(y_pred_l_svc, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_l_svc))
upload(
    'Linear SVC', 'Test Grid',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

### 4. Logistic-Regressor

In [None]:
from sklearn.linear_model import LogisticRegression
filename = './Model/logreg_clf.sav'

# Reuse the cached model when it exists; otherwise train it and cache it.
# The files are opened with `with` so the handles are closed (and the dump is
# flushed to disk) as soon as the block ends.
try:
    with open(filename, 'rb') as model_file:
        logreg_clf = pickle.load(model_file)
    print(logreg_clf)
except FileNotFoundError:
    logreg_clf = LogisticRegression(solver='lbfgs', max_iter=1000)
    logreg_clf.fit(train_set_prepared, train_set_labels_res)

    with open(filename, 'wb') as model_file:
        pickle.dump(logreg_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the baseline logistic regression on the training data and log the
# results.
y_pred_logreg = logreg_clf.predict(train_set_prepared)
report = get_metrics(y_pred_logreg, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_logreg))
upload(
    'Logistic Regressor', 'Train',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the baseline logistic regression on the test set and log the results.
y_pred_logreg = logreg_clf.predict(test_set_prepared)
report = get_metrics(y_pred_logreg, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_logreg))
upload(
    'Logistic Regressor', 'Test',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

#### Fine-tuning

In [None]:
# Reuse the cached tuned model when it exists; otherwise run the grid search
# and cache the best estimator. Files are opened with `with` so the handles
# are closed and the dump is flushed to disk.
filename = './Model/best_logreg_clf.sav'

try:
    with open(filename, 'rb') as model_file:
        best_logreg_clf = pickle.load(model_file)
    print(best_logreg_clf)
except FileNotFoundError:
    # hyperparameters
    solvers = ['newton-cg', 'lbfgs', 'liblinear']
    penalty = ['l2']
    c_values = [100, 10, 1.0, 0.1, 0.01]
    # define grid search: 10-fold cross-validation scored on F1
    grid = dict(solver=solvers,penalty=penalty,C=c_values)
    best_logreg_clf = LogisticRegression()
    grid_search_logreg = GridSearchCV(best_logreg_clf, param_grid=grid, cv=10, scoring='f1', return_train_score=True, n_jobs=4)
    # fit
    grid_search_logreg.fit(train_set_prepared, train_set_labels_res)
    print("Best Param:", grid_search_logreg.best_params_)
    best_logreg_clf = grid_search_logreg.best_estimator_

    # save the model to disk
    with open(filename, 'wb') as model_file:
        pickle.dump(best_logreg_clf, model_file)



Performance check on the Train set:

In [None]:
# Score the tuned logistic regression on the training data and log the results.
y_pred_logreg = best_logreg_clf.predict(train_set_prepared)
report = get_metrics(y_pred_logreg, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_logreg))
upload(
    'Logistic Regressor', 'Train Grid',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the tuned logistic regression on the test set and log the results.
y_pred_logreg = best_logreg_clf.predict(test_set_prepared)
report = get_metrics(y_pred_logreg, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_logreg))
upload(
    'Logistic Regressor', 'Test Grid',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)


### 5. KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
filename = './Model/knn_clf.sav'

# Reuse the cached model when it exists; otherwise train it and cache it.
# The files are opened with `with` so the handles are closed (and the dump is
# flushed to disk) as soon as the block ends.
try:
    with open(filename, 'rb') as model_file:
        knn_clf = pickle.load(model_file)
    print(knn_clf)
except FileNotFoundError:
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(train_set_prepared, train_set_labels_res)

    with open(filename, 'wb') as model_file:
        pickle.dump(knn_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the KNN classifier on the training data and log the results.
y_pred_KNN = knn_clf.predict(train_set_prepared)
report = get_metrics(y_pred_KNN, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_KNN))
upload(
    'KNN', 'Train',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the KNN classifier on the test set and log the results.
y_pred_KNN = knn_clf.predict(test_set_prepared)
report = get_metrics(y_pred_KNN, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_KNN))
upload(
    'KNN', 'Test',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Alternative:

The radius-based approach to selecting neighbors is more appropriate for sparse data, preventing records that are distant from contributing to the prediction.

In [None]:
from sklearn.neighbors import RadiusNeighborsClassifier
filename = './Model/r_knn_clf.sav'

# Reuse the cached model when it exists; otherwise train it and cache it.
# The files are opened with `with` so the handles are closed (and the dump is
# flushed to disk) as soon as the block ends.
try:
    with open(filename, 'rb') as model_file:
        r_knn_clf = pickle.load(model_file)
    print(r_knn_clf)
except FileNotFoundError:
    # Samples with no neighbour inside the radius get the most frequent label.
    r_knn_clf = RadiusNeighborsClassifier(radius=1.8,outlier_label='most_frequent')
    r_knn_clf.fit(train_set_prepared, train_set_labels_res)

    with open(filename, 'wb') as model_file:
        pickle.dump(r_knn_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the radius-based KNN classifier on the training data and log the
# results.
y_pred_r_KNN = r_knn_clf.predict(train_set_prepared)
report = get_metrics(y_pred_r_KNN, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_r_KNN))
upload(
    'Radius KNN', 'Train',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the radius-based KNN classifier on the test set and log the results.
y_pred_r_KNN = r_knn_clf.predict(test_set_prepared)
report = get_metrics(y_pred_r_KNN, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_r_KNN))
upload(
    'Radius KNN', 'Test',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

### 6. MLP

In [None]:
from sklearn.neural_network import *
filename = './Model/mpl_clf.sav'

# Reuse the cached model when it exists; otherwise train it and cache it.
# The files are opened with `with` so the handles are closed (and the dump is
# flushed to disk) as soon as the block ends.
try:
    with open(filename, 'rb') as model_file:
        mpl_clf = pickle.load(model_file)
    print(mpl_clf)
except FileNotFoundError:
    mpl_clf = MLPClassifier(max_iter=500)
    mpl_clf.fit(train_set_prepared, train_set_labels_res)

    with open(filename, 'wb') as model_file:
        pickle.dump(mpl_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the MLP classifier on the training data and log the results.
y_pred_MLP = mpl_clf.predict(train_set_prepared)
report = get_metrics(y_pred_MLP, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_MLP))
upload(
    'MLP', 'Train',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the MLP classifier on the test set and log the results.
y_pred_MLP = mpl_clf.predict(test_set_prepared)
report = get_metrics(y_pred_MLP, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_MLP))
upload(
    'MLP', 'Test',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

### 7. Decision-Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
filename = './Model/dc_clf.sav'

# Reuse the cached model when it exists; otherwise train it and cache it.
# The files are opened with `with` so the handles are closed (and the dump is
# flushed to disk) as soon as the block ends.
try:
    with open(filename, 'rb') as model_file:
        dc_clf = pickle.load(model_file)
    print(dc_clf)
except FileNotFoundError:
    dc_clf = DecisionTreeClassifier()
    dc_clf.fit(train_set_prepared, train_set_labels_res)

    with open(filename, 'wb') as model_file:
        pickle.dump(dc_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the decision tree on the training data and log the results.
y_pred_DC = dc_clf.predict(train_set_prepared)
report = get_metrics(y_pred_DC, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_DC))
upload(
    'Decision Tree', 'Train',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the decision tree on the test set and log the results.
y_pred_DC = dc_clf.predict(test_set_prepared)
report = get_metrics(y_pred_DC, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_DC))
upload(
    'Decision Tree', 'Test',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

### 8. Ada-Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
filename = './Model/adaBoost_clf.sav'

# Reuse the cached model when it exists; otherwise train it and cache it.
# The files are opened with `with` so the handles are closed (and the dump is
# flushed to disk) as soon as the block ends.
try:
    with open(filename, 'rb') as model_file:
        adaBoost_clf = pickle.load(model_file)
    print(adaBoost_clf)
except FileNotFoundError:
    adaBoost_clf = AdaBoostClassifier()
    adaBoost_clf.fit(train_set_prepared, train_set_labels_res)

    with open(filename, 'wb') as model_file:
        pickle.dump(adaBoost_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the AdaBoost classifier on the training data and log the results.
y_pred_ADABOOST = adaBoost_clf.predict(train_set_prepared)
report = get_metrics(y_pred_ADABOOST, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_ADABOOST))
upload(
    'Ada Boost', 'Train',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the AdaBoost classifier on the test set and log the results.
y_pred_ADABOOST = adaBoost_clf.predict(test_set_prepared)
report = get_metrics(y_pred_ADABOOST, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_ADABOOST))
upload(
    'Ada Boost', 'Test',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

---

## Evaluation of classifiers


In [None]:
# Display the archived scores stored in results_original.csv. To inspect the
# file written by upload() during this session instead, use:
#dataResults = pd.read_csv("./Result/"+result_filename+".csv")
dataResults = pd.read_csv("./Result/results_original.csv")

# Show up to the first 40 rows of the table.
dataResults.head(40)

---

## Voting classifier

In [None]:
from sklearn.ensemble import VotingClassifier
filename = './Model/voting_clf.sav'

# Reuse the cached ensemble when it exists; otherwise build, fit and cache it.
# The files are opened with `with` so the handles are closed (and the dump is
# flushed to disk) as soon as the block ends.
try:
    with open(filename, 'rb') as model_file:
        voting_clf = pickle.load(model_file)
    print(voting_clf)
except FileNotFoundError:
    # Hard (majority) vote over the tuned logistic regression, the MLP and the
    # baseline linear SVC.
    voting_clf = VotingClassifier(
        estimators=[('log_reg', best_logreg_clf), ('mlp', mpl_clf), ('l_svc', l_svc_clf)],
        voting='hard',
    )
    voting_clf.fit(train_set_prepared, train_set_labels_res)

    with open(filename, 'wb') as model_file:
        pickle.dump(voting_clf, model_file)

Performance check on the Train set:

In [None]:
# Score the voting ensemble on the training data and log the results.
y_pred_VOTING = voting_clf.predict(train_set_prepared)
report = get_metrics(y_pred_VOTING, train_set_labels_res)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(train_set_labels_res, y_pred_VOTING))
upload(
    'Voting', 'Train',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

Performance check on the Test set:

In [None]:
# Score the voting ensemble on the test set and log the results.
y_pred_VOTING = voting_clf.predict(test_set_prepared)
report = get_metrics(y_pred_VOTING, test_set_labels)
# classification_report takes (y_true, y_pred): ground truth first, so the
# per-class precision/recall and the support refer to the real labels.
print(classification_report(test_set_labels, y_pred_VOTING))
upload(
    'Voting', 'Test',
    report['accuracy'], report['precision'], report['recall'], report['f1'],
    report['balanced_accuracy'], report['balanced_precision'],
    report['balanced_recall'], report['balanced_f1'],
)

---

# Conclusions

As defined in the initial section, the aim of this analysis was to determine whether or not a flight was delayed, where a delayed flight means a flight that arrives at its destination at least 15 minutes late. Note that, instead of considering one of **DEP_DEL15** or **ARR_DEL15** as the target attribute, a new categorical attribute could have been defined, for example **delayed_flight**, as the logical **or** or **and** of the two previous attributes.

Considering the high number of features, about 260, two different dimensionality reduction techniques were applied in order to verify the existence of any subset of features on which to train the classifiers.
Specifically, since the output of the encoding is a sparse matrix, the Truncated-SVD and Incremental-PCA classes were used, as their methods operate efficiently on inputs of this type.
Both classes perform linear reductions, with the aim of identifying the main features present in a data set.

Following the application of the methods made available by the aforementioned classes, two sets of reduced-size features have been identified, each characterized by 200 features. Once the identification of the main components was completed, the classifiers were trained on these features. The scores obtained following the prediction activities are shown below.

In [None]:
# Scores stored in results_IPCA_200.csv (up to the first 40 rows are shown).
dataResults_IPCA = pd.read_csv("./Result/results_IPCA_200.csv")
dataResults_IPCA.head(40)

In [None]:
# Scores stored in results_TSVD_200.csv (up to the first 40 rows are shown).
dataResults_TSVD = pd.read_csv("./Result/results_TSVD_200.csv")
dataResults_TSVD.head(40)

In [None]:
# Scores stored in results_original.csv (up to the first 40 rows are shown).
dataResults_original = pd.read_csv("./Result/results_original.csv")
dataResults_original.head(40)

As highlighted by the previous tables, the scores obtained on a reduced version of the number of components produced worse results than the scores obtained by classifiers trained on the original number of components. To understand the reason for these results, consider the following example:

Suppose a simple case with 3 independent variables x1,x2,x3
and the output y and suppose now that x3=y

and so you should be able to get a 0 error model.

Suppose now that in the training set the variation of y
is very small and so also the variation of x3.

Now if you run PCA and you decide to select only 2 variables you will obtain a combination of x1
and x2. So the information of x3 that was the only variable able to explain y is lost. 

Note that the excessive number of features is due to the application of the OneHotEncoder. In order to reduce the number of features generated by the encoding, it could have been replaced with a process that transforms categorical attributes into numeric ones based on the frequency of each value. In this way, rare values would be encoded with the same frequency, thus reducing the number of components.

Considering the imbalance of the test set, different tests were performed in order to identify the best metrics to correctly evaluate the results. If this fundamental phase had not been carried out, the scores produced would have been valid for the majority class but not very significant for the minority class. Specifically, good Precision values would have been obtained at the expense of low Recall values, as the algorithms would have had difficulty capturing the relationships between the attributes and the minority class.

Note that for some classifiers a Fine Tuning phase was carried out, applying the grid search. This procedure has not been applied to all classifiers as it is computationally burdensome.

Classifiers:

    1. Random Forest
    2. SGD
    3. Linear SVC
    4. Logistic Regressor
    5. KNN
    6. Radius KNN
    7. MLP
    8. Decision Tree
    9. AdaBoost
    10. Voting

In [None]:
# Training-set rows only, ordered by the metric that is plotted.
# sort_values() returns a new frame, which avoids the SettingWithCopyWarning
# raised when a filtered slice is sorted in place.
a = (
    dataResults_original[dataResults_original['Type of test']=='Train']
    .sort_values(by=['Balanced Accuracy'])
)
plt.figure(figsize=(15,5))
fig = sns.barplot(x=a['Classifier'], y=a['Balanced Accuracy'], palette='deep')
plt.xticks(rotation=90)
plt.ylabel('Balanced accuracy')
plt.xlabel('Classifier')
plt.title('Performance on balanced accuracy (train set)', size=20)
plt.show()
# sns.barplot returns the Axes; keep a handle on its parent Figure.
figure = fig.get_figure()
#save_figure(figure,"Performance accuracy classifiers TRAIN")

In [None]:
# Test-set rows only, ordered by the metric that is plotted.
# sort_values() returns a new frame, which avoids the SettingWithCopyWarning
# raised when a filtered slice is sorted in place.
b = (
    dataResults_original[dataResults_original['Type of test']=='Test']
    .sort_values(by=['Balanced Accuracy'])
)
plt.figure(figsize=(15,5))
fig = sns.barplot(x=b['Classifier'], y=b['Balanced Accuracy'], palette='deep')
plt.xticks(rotation=90)
# Label the axis with the metric actually shown.
plt.ylabel('Balanced accuracy')
plt.xlabel('Classifier')
plt.title('Performance on balanced accuracy (test set)', size=20)
plt.show()
# sns.barplot returns the Axes; keep a handle on its parent Figure.
figure = fig.get_figure()

In [None]:
# Test-set rows only, ordered by the metric that is plotted.
# sort_values() returns a new frame, which avoids the SettingWithCopyWarning
# raised when a filtered slice is sorted in place.
c = (
    dataResults_original[dataResults_original['Type of test']=='Test']
    .sort_values(by=['Weighted F1'])
)
plt.figure(figsize=(15,5))
fig = sns.barplot(x=c['Classifier'], y=c['Weighted F1'], palette='deep')
plt.xticks(rotation=90)
plt.ylabel('Weighted F1')
plt.xlabel('Classifier')
plt.title('Performance on Weighted F1 (test set)', size=20)
plt.show()
# sns.barplot returns the Axes; keep a handle on its parent Figure.
figure = fig.get_figure()

In [None]:
# Test-set rows only, ordered by the metric that is plotted.
# sort_values() returns a new frame, which avoids the SettingWithCopyWarning
# raised when a filtered slice is sorted in place.
d = (
    dataResults_original[dataResults_original['Type of test']=='Test']
    .sort_values(by=['Weighted Precision'])
)
plt.figure(figsize=(15,5))
fig = sns.barplot(x=d['Classifier'], y=d['Weighted Precision'], palette='deep')
plt.xticks(rotation=90)
plt.ylabel('Weighted Precision')
plt.xlabel('Classifier')
plt.title('Performance on Weighted Precision (test set)', size=20)
plt.show()
# sns.barplot returns the Axes; keep a handle on its parent Figure.
figure = fig.get_figure()

In [None]:
# Test-set rows only, ordered by the metric that is plotted.
# sort_values() returns a new frame, which avoids the SettingWithCopyWarning
# raised when a filtered slice is sorted in place.
e = (
    dataResults_original[dataResults_original['Type of test']=='Test']
    .sort_values(by=['Weighted Recall'])
)
plt.figure(figsize=(15,5))
fig = sns.barplot(x=e['Classifier'], y=e['Weighted Recall'], palette='deep')
plt.xticks(rotation=90)
plt.ylabel('Weighted Recall')
plt.xlabel('Classifier')
plt.title('Performance on Weighted Recall (test set)', size=20)
plt.show()
# sns.barplot returns the Axes; keep a handle on its parent Figure.
figure = fig.get_figure()

In conclusion, the application of the K fold cross validation technique could allow the identification of the best possible division of the dataset into train and test set at the expense of computation complexity as k different classifiers would be generated.

---