# cancellation reason
- (A) Air Carrier
- (B) Extreme Weather
- (C) National Aviation System (NAS); and
- (D) Security

# 1. Getting the data

In [3]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, BaggingRegressor, GradientBoostingClassifier,BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.under_sampling import RandomUnderSampler

# performance metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix


%matplotlib inline

In [4]:
mylist = []

for chunk in  pd.read_csv('flights.csv', sep=',', chunksize=20000):
    mylist.append(chunk)

df_flights = pd.concat(mylist, axis= 0)
del mylist

In [5]:
df_airlines =pd.read_csv('airlines.csv')
df_airports = pd.read_csv('airports.csv')

# Exploratory Data Analysis (EDA)

In [6]:
df_airports.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


estado y df_fl
ORIGIN_AIRPORT en df_fl = IATA_CODE en df_airports

In [7]:
df_flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [8]:
df_iata_code_state = df_airports[['IATA_CODE', 'STATE']]
df_flights = df_flights.join(df_iata_code_state.set_index('IATA_CODE'), on='ORIGIN_AIRPORT')


In [9]:
df_flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,STATE
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,-22.0,0,0,,,,,,,AK
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,-9.0,0,0,,,,,,,CA
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,5.0,0,0,,,,,,,CA
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,-9.0,0,0,,,,,,,CA
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,-21.0,0,0,,,,,,,WA


### Flights filtered by Florida State

In [10]:
df_fl_flights = df_flights[df_flights['STATE'] == 'FL']

In [11]:
df_fl_flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,STATE
64,2015,1,1,4,AA,1323,N3CXAA,MCO,MIA,510,...,-11.0,0,0,,,,,,,FL
77,2015,1,1,4,EV,5583,N882AS,VPS,ATL,520,...,-15.0,0,0,,,,,,,FL
110,2015,1,1,4,EV,4160,N11150,JAX,EWR,540,...,-14.0,0,0,,,,,,,FL
144,2015,1,1,4,B6,2228,N231JB,MCO,EWR,550,...,-7.0,0,0,,,,,,,FL
153,2015,1,1,4,B6,860,N656JB,FLL,BDL,553,...,9.0,0,0,,,,,,,FL


In [12]:
df_fl_flights.describe()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,...,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
count,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0,411142.0,411142.0,411029.0,411029.0,...,415586.0,410705.0,409611.0,415586.0,415586.0,80338.0,80338.0,80338.0,80338.0,80338.0
mean,2015.0,6.083321,15.704109,3.992353,1556.593242,1340.761412,1349.491254,10.961079,14.678903,1371.893981,...,1519.673841,1492.296427,5.57288,0.003318,0.011059,15.304401,0.104919,18.502527,25.386144,3.181135
std,0.0,3.472485,8.79117,2.002831,1141.071567,467.808764,485.118879,40.11382,6.921939,486.516164,...,500.329446,529.526736,42.027297,0.057508,0.104579,32.506344,2.70077,49.114804,46.50306,18.23228
min,2015.0,1.0,1.0,1.0,1.0,55.0,1.0,-46.0,1.0,1.0,...,1.0,1.0,-66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2015.0,3.0,8.0,2.0,709.0,930.0,933.0,-5.0,11.0,947.0,...,1123.0,1104.0,-14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2015.0,6.0,16.0,4.0,1355.0,1335.0,1343.0,-2.0,13.0,1356.0,...,1529.0,1517.0,-5.0,0.0,0.0,2.0,0.0,3.0,6.0,0.0
75%,2015.0,9.0,23.0,6.0,2077.0,1745.0,1757.0,9.0,17.0,1811.0,...,1940.0,1934.0,8.0,0.0,0.0,18.0,0.0,17.0,31.0,0.0
max,2015.0,12.0,31.0,7.0,6898.0,2359.0,2400.0,1536.0,158.0,2400.0,...,2400.0,2400.0,1556.0,1.0,1.0,784.0,241.0,1536.0,1010.0,932.0


In [13]:
df_fl_flights.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 415586 entries, 64 to 5819077
Data columns (total 32 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   YEAR                 415586 non-null  int64  
 1   MONTH                415586 non-null  int64  
 2   DAY                  415586 non-null  int64  
 3   DAY_OF_WEEK          415586 non-null  int64  
 4   AIRLINE              415586 non-null  object 
 5   FLIGHT_NUMBER        415586 non-null  int64  
 6   TAIL_NUMBER          414610 non-null  object 
 7   ORIGIN_AIRPORT       415586 non-null  object 
 8   DESTINATION_AIRPORT  415586 non-null  object 
 9   SCHEDULED_DEPARTURE  415586 non-null  int64  
 10  DEPARTURE_TIME       411142 non-null  float64
 11  DEPARTURE_DELAY      411142 non-null  float64
 12  TAXI_OUT             411029 non-null  float64
 13  WHEELS_OFF           411029 non-null  float64
 14  SCHEDULED_TIME       415582 non-null  float64
 15  ELAPSED_TIME   

### We inspect the null values from the Dataframe

In [14]:
df_fl_flights.isnull().sum()

YEAR                        0
MONTH                       0
DAY                         0
DAY_OF_WEEK                 0
AIRLINE                     0
FLIGHT_NUMBER               0
TAIL_NUMBER               976
ORIGIN_AIRPORT              0
DESTINATION_AIRPORT         0
SCHEDULED_DEPARTURE         0
DEPARTURE_TIME           4444
DEPARTURE_DELAY          4444
TAXI_OUT                 4557
WHEELS_OFF               4557
SCHEDULED_TIME              4
ELAPSED_TIME             5975
AIR_TIME                 5975
DISTANCE                    0
WHEELS_ON                4881
TAXI_IN                  4881
SCHEDULED_ARRIVAL           0
ARRIVAL_TIME             4881
ARRIVAL_DELAY            5975
DIVERTED                    0
CANCELLED                   0
CANCELLATION_REASON    410990
AIR_SYSTEM_DELAY       335248
SECURITY_DELAY         335248
AIRLINE_DELAY          335248
LATE_AIRCRAFT_DELAY    335248
WEATHER_DELAY          335248
STATE                       0
dtype: int64

In [15]:
df_fl_flights = df_fl_flights.fillna(0)
# df_fl_flights = df_fl_flights.dropna(axis=0,how='any')
df_fl_flights.isnull().sum()

YEAR                   0
MONTH                  0
DAY                    0
DAY_OF_WEEK            0
AIRLINE                0
FLIGHT_NUMBER          0
TAIL_NUMBER            0
ORIGIN_AIRPORT         0
DESTINATION_AIRPORT    0
SCHEDULED_DEPARTURE    0
DEPARTURE_TIME         0
DEPARTURE_DELAY        0
TAXI_OUT               0
WHEELS_OFF             0
SCHEDULED_TIME         0
ELAPSED_TIME           0
AIR_TIME               0
DISTANCE               0
WHEELS_ON              0
TAXI_IN                0
SCHEDULED_ARRIVAL      0
ARRIVAL_TIME           0
ARRIVAL_DELAY          0
DIVERTED               0
CANCELLED              0
CANCELLATION_REASON    0
AIR_SYSTEM_DELAY       0
SECURITY_DELAY         0
AIRLINE_DELAY          0
LATE_AIRCRAFT_DELAY    0
WEATHER_DELAY          0
STATE                  0
dtype: int64

### We inspect the dataframe after dropping all null values

In [16]:
df_fl_flights.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 415586 entries, 64 to 5819077
Data columns (total 32 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   YEAR                 415586 non-null  int64  
 1   MONTH                415586 non-null  int64  
 2   DAY                  415586 non-null  int64  
 3   DAY_OF_WEEK          415586 non-null  int64  
 4   AIRLINE              415586 non-null  object 
 5   FLIGHT_NUMBER        415586 non-null  int64  
 6   TAIL_NUMBER          415586 non-null  object 
 7   ORIGIN_AIRPORT       415586 non-null  object 
 8   DESTINATION_AIRPORT  415586 non-null  object 
 9   SCHEDULED_DEPARTURE  415586 non-null  int64  
 10  DEPARTURE_TIME       415586 non-null  float64
 11  DEPARTURE_DELAY      415586 non-null  float64
 12  TAXI_OUT             415586 non-null  float64
 13  WHEELS_OFF           415586 non-null  float64
 14  SCHEDULED_TIME       415586 non-null  float64
 15  ELAPSED_TIME   

We saw that I we drop all null values we will be left with no rows, so we fill all null values with 0

In [17]:
df_fl_flights.describe(exclude = ['object'])

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,...,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
count,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0,...,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0,415586.0
mean,2015.0,6.083321,15.704109,3.992353,1556.593242,1340.761412,1335.060693,10.843869,14.517946,1356.850835,...,1519.673841,1474.769612,5.492757,0.003318,0.011059,2.958533,0.020282,3.576771,4.907461,0.614953
std,0.0,3.472485,8.79117,2.002831,1141.071567,467.808764,502.085121,39.914692,7.05157,504.493682,...,500.329446,550.412018,41.729357,0.057508,0.104579,15.517404,1.188173,22.796983,22.771463,8.11404
min,2015.0,1.0,1.0,1.0,1.0,55.0,0.0,-46.0,0.0,0.0,...,1.0,0.0,-66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2015.0,3.0,8.0,2.0,709.0,930.0,926.0,-5.0,11.0,939.0,...,1123.0,1053.0,-13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2015.0,6.0,16.0,4.0,1355.0,1335.0,1338.0,-2.0,13.0,1350.0,...,1529.0,1511.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2015.0,9.0,23.0,6.0,2077.0,1745.0,1755.0,9.0,17.0,1809.0,...,1940.0,1930.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2015.0,12.0,31.0,7.0,6898.0,2359.0,2400.0,1536.0,158.0,2400.0,...,2400.0,2400.0,1556.0,1.0,1.0,784.0,241.0,1536.0,1010.0,932.0


### Our dependet variable is Delay, overall delay, whether if it's an arriving delay or a departure delay, so we will mix both variables

In [18]:
df_fl_flights.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'STATE'],
      dtype='object')

In [29]:
SUM_DELAYS= ['DEPARTURE_DELAY','ARRIVAL_DELAY','AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY','LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
df_fl_flights['DELAY'] = df_fl_flights[SUM_DELAYS].sum(axis=1)
print(df_fl_flights[SUM_DELAYS])
print(df_fl_flights['DELAY'])

         DEPARTURE_DELAY  ARRIVAL_DELAY  AIR_SYSTEM_DELAY  SECURITY_DELAY  \
64                  -3.0          -11.0               0.0             0.0   
77                  -6.0          -15.0               0.0             0.0   
110                 -9.0          -14.0               0.0             0.0   
144                 -2.0           -7.0               0.0             0.0   
153                 -2.0            9.0               0.0             0.0   
...                  ...            ...               ...             ...   
5819011             44.0           47.0               3.0             0.0   
5819017             -8.0          -16.0               0.0             0.0   
5819023             21.0           20.0               0.0             0.0   
5819057             13.0           28.0              15.0             0.0   
5819077             -6.0          -10.0               0.0             0.0   

         AIRLINE_DELAY  LATE_AIRCRAFT_DELAY  WEATHER_DELAY  
64            

In [15]:
df_fl = df_fl_flights[['MONTH', 'DAY', 'AIRLINE', 'SCHEDULED_DEPARTURE', 'STATE', 'WEATHER_DELAY', 'CANCELLATION_REASON']]

In [10]:
dependentVar1= (df_fl['WEATHER_DELAY'])


dependentVar= np.where(( (df_fl['CANCELLATION_REASON']=='B') ), 1,0)
dependentVar= np.where(( (df_fl['WEATHER_DELAY'].notna()) ), 1,0)
#assign the dependent variable to the key 'WEATHER_CONDITION' in the dataframe df_fl
df_fl['WEATHER_CONDITION'] = dependentVar

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fl['WEATHER_CONDITION'] = dependentVar


In [11]:
df_fl.head()

Unnamed: 0,MONTH,DAY,AIRLINE,SCHEDULED_DEPARTURE,STATE,WEATHER_DELAY,CANCELLATION_REASON,WEATHER_CONDITION
0,1,1,AS,5,AK,,,0
1,1,1,AA,10,CA,,,0
2,1,1,US,20,CA,,,0
3,1,1,AA,20,CA,,,0
4,1,1,AS,25,WA,,,0


In [12]:
df_fl.drop(['SCHEDULED_DEPARTURE'], axis=1, inplace=True)
df_fl.drop(['WEATHER_DELAY'], axis=1, inplace=True)
df_fl.drop(['CANCELLATION_REASON'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fl.drop(['SCHEDULED_DEPARTURE'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fl.drop(['WEATHER_DELAY'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fl.drop(['CANCELLATION_REASON'], axis=1, inplace=True)


In [13]:
df_fl

Unnamed: 0,MONTH,DAY,AIRLINE,STATE,WEATHER_CONDITION
0,1,1,AS,AK,0
1,1,1,AA,CA,0
2,1,1,US,CA,0
3,1,1,AA,CA,0
4,1,1,AS,WA,0
...,...,...,...,...,...
5819074,12,31,B6,CA,0
5819075,12,31,B6,NY,0
5819076,12,31,B6,NY,0
5819077,12,31,B6,FL,0


In [17]:
df_fl.dropna(subset=['STATE'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fl.dropna(subset=['STATE'], inplace=True)


In [18]:
states=df_fl['STATE'].unique()

In [19]:
states = states.tolist()

In [20]:
fl_number = states.index('FL')

In [21]:
states_index = np.arange(0, len(states))

In [22]:
df_fl['STATE'].replace(states, states_index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fl['STATE'].replace(states, states_index, inplace=True)


In [23]:
df_fl

Unnamed: 0,MONTH,DAY,AIRLINE,STATE,WEATHER_CONDITION
0,1,1,AS,0,0
1,1,1,AA,1,0
2,1,1,US,1,0
3,1,1,AA,1,0
4,1,1,AS,2,0
...,...,...,...,...,...
5819074,12,31,B6,1,0
5819075,12,31,B6,10,0
5819076,12,31,B6,10,0
5819077,12,31,B6,13,0


In [24]:
airline_code = df_fl['AIRLINE'].unique()

airline_index = np.arange(0, len(airline_code))

df_fl['AIRLINE'].replace(airline_code, airline_index, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fl['AIRLINE'].replace(airline_code, airline_index, inplace=True)


In [26]:
df_fl

Unnamed: 0,MONTH,DAY,AIRLINE,STATE,WEATHER_CONDITION
0,1,1,0,0,0
1,1,1,1,1,0
2,1,1,2,1,0
3,1,1,1,1,0
4,1,1,0,2,0
...,...,...,...,...,...
5819074,12,31,7,1,0
5819075,12,31,7,10,0
5819076,12,31,7,10,0
5819077,12,31,7,13,0


In [27]:
df_fl = df_fl[(df_fl['MONTH'] == 1) | (df_fl['MONTH'] == 12)]

In [None]:
corr = df_fl.corr()

In [None]:
plt.figure(figsize=(12,10))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

In [28]:
dependentVar = df_fl['WEATHER_CONDITION']
df_fl.drop(['WEATHER_CONDITION'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fl.drop(['WEATHER_CONDITION'], axis=1, inplace=True)


In [None]:
corr = df_fl.corr()

In [None]:
plt.figure(figsize=(12,10))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

In [29]:
X_train, X_test, y_train, y_test = train_test_split( df_fl, dependentVar, test_size = 0.3, random_state = 746)

In [30]:
print("Before Undersampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("Before Undersampling, counts of label '0': {} \n".format(sum(y_train == 0))) 
  
# apply near miss 

rd = RandomUnderSampler() 
  
X_train_rd, y_train_rd = rd.fit_resample(X_train, y_train.ravel()) 
X_test_rd, y_test_rd = rd.fit_resample(X_test, y_test.ravel()) 

  
print('After Undersampling, the shape of X_train: {}'.format(X_train_rd.shape)) 
print('After Undersampling, the shape of y_train: {} \n'.format(y_train_rd.shape)) 

print('After Undersampling, the shape of X_test: {}'.format(X_test_rd.shape)) 
print('After Undersampling, the shape of y_test: {} \n'.format(y_test_rd.shape))
  
print("After Undersampling, counts of label '1': {}".format(sum(y_train_rd == 1))) 
print("After Undersampling, counts of label '0': {}".format(sum(y_train_rd == 0)))

Before Undersampling, counts of label '1': 134812
Before Undersampling, counts of label '0': 529626 

After Undersampling, the shape of X_train: (269624, 4)
After Undersampling, the shape of y_train: (269624,) 

After Undersampling, the shape of X_test: (115822, 4)
After Undersampling, the shape of y_test: (115822,) 

After Undersampling, counts of label '1': 134812
After Undersampling, counts of label '0': 134812


In [31]:
#create list of model and accuracy dicts
perform_list = []

# list to keep track of models
models = []

# list to keep track of proba scores
proba_score = []

In [32]:
dt = DecisionTreeClassifier()

# calculate accuracy of train
dt_acc = cross_val_score(dt, X_train_rd, y_train_rd, cv=10).mean()
dt_std = cross_val_score(dt, X_train_rd, y_train_rd, cv=10).std()
print('Accuracy of Decision Tree (TRAIN): {} +/- {}'.format(round(dt_acc, 2), round(dt_std, 2)))

# fit model
dt.fit(X_train_rd, y_train_rd)


Accuracy of Decision Tree (TRAIN): 0.66 +/- 0.0


DecisionTreeClassifier()

In [33]:

# get predictions and probabilities
dt_pred = dt.predict(X_test_rd)
dt_proba = dt.predict_proba(X_test_rd)

# calculate accuracy of test
dt_acc_test = round(accuracy_score(y_test_rd, dt_pred), 2)
print(f'Accuracy of Decision Tree (TEST): {dt_acc_test}')

# get precision, recall, f1-score
precision, recall, fscore, support = score(y_test_rd, dt_pred, average='macro')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {fscore}')

# add model and accuracy dict to list
perform_list.append(dict([
    ('Model', 'Decision Tree'),
    ('Train Accuracy', round(dt_acc, 2)),
    ('Test Accuracy', dt_acc),
    ('Precision', round(precision, 2)),
    ('Recall', round(recall, 2)),
    ('F1', round(fscore, 2))
     ]))

# add model to list
models.append('Decision Tree')
# add proba socre to list
proba_score.append(dt_proba)

Accuracy of Decision Tree (TEST): 0.66
Precision : 0.6637214558578505
Recall    : 0.6630778263196975
F-score   : 0.662746369460046


In [34]:
rf = RandomForestClassifier(n_estimators=100)

In [35]:
# fit model
rf.fit(X_train_rd, y_train_rd)


RandomForestClassifier()

In [36]:
# calculate accuracy of train
rf_acc = cross_val_score(rf, X_train_rd, y_train_rd, cv=10).mean()
rf_std = cross_val_score(rf, X_train_rd, y_train_rd, cv=10).std()
print('Accuracy of Random Forest (TRAIN): {} +/- {}'.format(round(rf_acc, 2), round(rf_std, 2)))

# get predictions and probabilities
rf_pred = rf.predict(X_test_rd)
rf_proba= rf.predict_proba(X_test_rd)

# calculate accuracy of test
rf_acc_test = round(accuracy_score(y_test_rd, rf_pred), 2)
print(f'Accuracy of Random Forest (TEST): {rf_acc_test}')

# get precision, recall, f1-score
precision, recall, fscore, support = score(y_test_rd, rf_pred, average='macro')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {fscore}')

# add model and accuracy dict to list
perform_list.append(dict([
    ('Model', 'Random Forest'),
    ('Train Accuracy', round(rf_acc, 2)),
    ('Test Accuracy', rf_acc_test),
    ('Precision', round(precision, 2)),
    ('Recall', round(recall, 2)),
    ('F1', round(fscore, 2))
     ]))

# add model to list
models.append('Random Forest')

# add proba socre to list
proba_score.append(rf_proba)

Accuracy of Random Forest (TRAIN): 0.66 +/- 0.0
Accuracy of Random Forest (TEST): 0.66
Precision : 0.662558468704715
Recall    : 0.6624648167014902
F-score   : 0.6624161951196138


### Logistic regression

In [37]:
lr = LogisticRegression(random_state=1)

# calculate accuracy of train
lr_acc = cross_val_score(lr, X_train_rd, y_train_rd, cv=10).mean()
lr_std = cross_val_score(lr, X_train_rd, y_train_rd, cv=10).std()
print('Accuracy of Logistic Regression (TRAIN): {} +/- {}'.format(round(lr_acc, 2), round(lr_std, 2)))

# fit model
lr.fit(X_train_rd, y_train_rd)

# get predictions and probabilities
lr_pred = lr.predict(X_test_rd)
lr_proba = lr.predict_proba(X_test_rd)

# calculate accuracy of test
lr_acc_test = round(accuracy_score(y_test_rd, lr_pred), 2)
print(f'Accuracy of Logistic Regression (TEST): {lr_acc_test}')

print()

# get precision, recall, f1-score
precision, recall, fscore, support = score(y_test_rd, lr_pred, average='macro')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {fscore}')

# add model and accuracy dict to list
perform_list.append(dict([
    ('Model', 'Logistic Regression'),
    ('Train Accuracy', round(lr_acc, 2)),
    ('Test Accuracy', lr_acc_test),
    ('Precision', round(precision, 2)),
    ('Recall', round(recall, 2)),
    ('F1', round(fscore, 2))
     ]))

# add model to list
models.append('Logistic Regression')

# add proba score to list
proba_score.append(lr_proba)

Accuracy of Logistic Regression (TRAIN): 0.53 +/- 0.0
Accuracy of Logistic Regression (TEST): 0.53

Precision : 0.5302059599641626
Recall    : 0.5301497124898551
F-score   : 0.5299308798423393


### Navie bayes

In [38]:
nb = GaussianNB()

# calculate accuracy of train
nb_acc = cross_val_score(nb, X_train_rd, y_train_rd, cv=10).mean()
nb_std = cross_val_score(nb, X_train_rd, y_train_rd, cv=10).std()
print('Accuracy of Naive Bayes Classifier (TRAIN): {} +/- {}'.format(round(nb_acc, 2), round(nb_std, 2)))

# cit model
nb.fit(X_train_rd, y_train_rd)

# get predictions and probabilities
nb_pred = nb.predict(X_test_rd)
nb_proba = nb.predict_proba(X_test_rd)

# calculate accuracy of test
nb_acc_test = round(accuracy_score(y_test_rd, nb_pred), 2)
print(f'Accuracy of Naive Bayes Classifier (TEST): {nb_acc_test}')

# get precision, recall, f1-score
precision, recall, fscore, support = score(y_test_rd, nb_pred, average='macro')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {fscore}')

# add model and accuracy dict to list
perform_list.append(dict([
    ('Model', 'Naive Bayes'),
    ('Train Accuracy', round(nb_acc, 2)),
    ('Test Accuracy', nb_acc_test),
    ('Precision', round(precision, 2)),
    ('Recall', round(recall, 2)),
    ('F1', round(fscore, 2))
     ]))

# add model to list
models.append('Naive Bayes')

# add proba score to list
proba_score.append(nb_proba)

Accuracy of Naive Bayes Classifier (TRAIN): 0.54 +/- 0.0
Accuracy of Naive Bayes Classifier (TEST): 0.54
Precision : 0.5358605391934395
Recall    : 0.5356236293622973
F-score   : 0.5348553940503726


In [39]:
model_performance = pd.DataFrame(data=perform_list)
model_performance = model_performance[['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1']]
model_performance

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1
0,Decision Tree,0.66,0.660197,0.66,0.66,0.66
1,Random Forest,0.66,0.66,0.66,0.66,0.66
2,Logistic Regression,0.53,0.53,0.53,0.53,0.53
3,Naive Bayes,0.54,0.54,0.54,0.54,0.53
