# 0.0 - Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble as en
from sklearn import metrics as mt
from matplotlib import pyplot as plt



# 1.0 - Load Dataset

## 1.1 - Training Dataset

In [2]:
df1 = pd.read_csv('/home/aderaldo/estudos/comunidade_ds/fundamentos_de_machine_learning/projeto/Datasets/classificação/X_training.csv')

In [3]:
df2 = pd.read_csv('/home/aderaldo/estudos/comunidade_ds/fundamentos_de_machine_learning/projeto/Datasets/classificação/y_training.csv')

## 1.3 - Test Dataset

In [4]:
df3 = pd.read_csv('/home/aderaldo/estudos/comunidade_ds/fundamentos_de_machine_learning/projeto/Datasets/classificação/X_test.csv')

In [5]:
df4 = pd.read_csv('/home/aderaldo/estudos/comunidade_ds/fundamentos_de_machine_learning/projeto/Datasets/classificação/y_test.csv')

## 1.2 - Valdiation Dataset

In [6]:
df5 = pd.read_csv('/home/aderaldo/estudos/comunidade_ds/fundamentos_de_machine_learning/projeto/Datasets/classificação/X_validation.csv')

In [7]:
df6 = pd.read_csv('/home/aderaldo/estudos/comunidade_ds/fundamentos_de_machine_learning/projeto/Datasets/classificação/y_validation.csv')

## 1.4 - Feature Selection

In [8]:
features = ['customer_type', 'age', 'class', 'flight_distance',
       'inflight_wifi_service', 'departure_arrival_time_convenient',
       'ease_of_online_booking', 'gate_location', 'food_and_drink',
       'online_boarding', 'seat_comfort', 'inflight_entertainment',
       'on_board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'inflight_service', 'cleanliness',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'gender_Female', 'gender_Male', 'type_of_travel_business_travel',
       'type_of_travel_personal_travel']

In [9]:
x_train = df1.loc[:, features]

In [10]:
y_train = df2['0'].values

In [11]:
x_test = df3.loc[:, features]

In [12]:
y_test = df4['0'].values

In [13]:
x_val = df5.loc[:, features]

In [14]:
y_val = df6['0'].values

# 2.0 - Training

## 2.1 - Fine Tunning

In [15]:
depth_values = np.arange( 2, 30, 2 )
accuracy_values = []

for i in depth_values:
    print("valor do max_depth: {}".format( i ))
    model = en.RandomForestClassifier(n_estimators=100, max_depth = i, random_state=0)
    model.fit(x_train, y_train)
    y_pred_test = model.predict( x_val )
    accuracy = mt.accuracy_score( y_val, y_pred_test )
    accuracy_values.append( accuracy )
    print( "accuracy: {} \n".format( accuracy ))

valor do max_depth: 2
accuracy: 0.8651179252871714 

valor do max_depth: 4
accuracy: 0.9074938061070176 

valor do max_depth: 6
accuracy: 0.929373531966923 

valor do max_depth: 8
accuracy: 0.9378036616364748 

valor do max_depth: 10
accuracy: 0.9472634254641398 

valor do max_depth: 12
accuracy: 0.9536986389523473 

valor do max_depth: 14
accuracy: 0.9582354644615335 

valor do max_depth: 16
accuracy: 0.9601660285079957 

valor do max_depth: 18
accuracy: 0.9611634865986679 

valor do max_depth: 20
accuracy: 0.9619357122172528 

valor do max_depth: 22
accuracy: 0.9621609446893401 

valor do max_depth: 24
accuracy: 0.9624505292963094 

valor do max_depth: 26
accuracy: 0.9626435857009557 

valor do max_depth: 28
accuracy: 0.9626757617683966 



In [16]:
results = {'depth_values': depth_values, "accuracy_values" : accuracy_values}
results_df = pd.DataFrame(results)

In [17]:
results_df.sort_values(by = "accuracy_values", ascending = False)

Unnamed: 0,depth_values,accuracy_values
13,28,0.962676
12,26,0.962644
11,24,0.962451
10,22,0.962161
9,20,0.961936
8,18,0.961163
7,16,0.960166
6,14,0.958235
5,12,0.953699
4,10,0.947263


## 2.2 - Houldout Validation

In [18]:
# Model definition
model = en.RandomForestClassifier(n_estimators=100, max_depth=28, random_state=0)
# Fit
model.fit ( np.concatenate( ( x_train, x_val) ) , np.concatenate( ( y_train, y_val ) ) )
# Predict
y_pred_final = model.predict( x_test )
# Performance
mt.accuracy_score( y_test, y_pred_final )



0.9628084810566563

# 3.0 - Metrics

In [37]:
mt.precision_score(y_test, y_pred_final)

0.9721314451706609

In [38]:
mt.recall_score(y_test, y_pred_final)

0.9422789265288165

In [39]:
mt.f1_score(y_test, y_pred_final)

0.956972431973549

In [24]:
feature_names = [f"Feature name: {i}" for i in range(x_train.shape[1])]
importances = model.feature_importances_
pd.Series( importances, index = feature_names)

Feature name: 0     0.044728
Feature name: 1     0.030653
Feature name: 2     0.079007
Feature name: 3     0.033061
Feature name: 4     0.135725
Feature name: 5     0.014394
Feature name: 6     0.036559
Feature name: 7     0.015507
Feature name: 8     0.010568
Feature name: 9     0.177593
Feature name: 10    0.039382
Feature name: 11    0.052208
Feature name: 12    0.027247
Feature name: 13    0.031681
Feature name: 14    0.028878
Feature name: 15    0.024973
Feature name: 16    0.026627
Feature name: 17    0.029128
Feature name: 18    0.010804
Feature name: 19    0.012158
Feature name: 20    0.003052
Feature name: 21    0.003054
Feature name: 22    0.077680
Feature name: 23    0.055330
dtype: float64