In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>Loading train and test data:</b> We have the actual data and validation data separated for us. We will load them to two separate dataframes. 
</div>

In [2]:
path_of_input_file = 'D:\\kaggle_trials\\forest-cover-type-prediction\\train.csv'
df                 = pd.read_csv(path_of_input_file)
df.head(4)

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2


In [3]:
path_of_input_file_valid      = 'D:\\kaggle_trials\\forest-cover-type-prediction\\test.csv'
df_validation                 = pd.read_csv(path_of_input_file_valid)
df_validation.head(3)

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,15121,2680,354,14,0,0,2684,196,214,156,...,0,0,0,0,0,0,0,0,0,0
1,15122,2683,0,13,0,0,2654,201,216,152,...,0,0,0,0,0,0,0,0,0,0
2,15123,2713,16,15,0,0,2980,206,208,137,...,0,0,0,0,0,0,0,0,0,0


<div class="alert alert-block alert-info">
<b>Feature selection and Preprocessing:</b> We will select all features except for ID. Also we will do standard scaling to bring all the features in some particular range. All the features are numerical and hence applying MinMax Scalar will be easy
</div>

In [4]:
cols        = df.columns.tolist()
cols_needed = cols[1:len(cols)-1]
X           = df[cols_needed].values
y           = df[cols[-1]].values
X_validate  = df_validation[cols[1:len(cols)-1]].values

In [5]:
num_labels = df['Cover_Type'].unique()
print('The number of labels are ',len(num_labels))

The number of labels are  7


In [6]:
for i in range(len(num_labels)):
    print('The number of ', num_labels[i] ,' labels are :- ',len(df[df['Cover_Type']==num_labels[i]]))
print('We have a pretty balanced dataset and hence we wont need to perform any imbalanced dataset handling')

The number of  5  labels are :-  2160
The number of  2  labels are :-  2160
The number of  1  labels are :-  2160
The number of  7  labels are :-  2160
The number of  3  labels are :-  2160
The number of  6  labels are :-  2160
The number of  4  labels are :-  2160
We have a pretty balanced dataset and hence we wont need to perform any imbalanced dataset handling


<div class="alert alert-block alert-info">
<b>Encoding and Scaling the data:</b> We will use one hot encoding for the categorical data present(we don't have anyright now) and perform Min Max Scaling for the Numerical data.
</div>

In [7]:
columns_to_encode = cols_needed[10:]
columns_to_scale  = cols_needed[:10]

scaler            = MinMaxScaler()

scaled_columns    = scaler.fit_transform(df[columns_to_scale]) 
encoded_columns   = df[columns_to_encode].values
X_processed_data  = np.concatenate([scaled_columns, encoded_columns], axis=1)

<div class="alert alert-block alert-info">
<b>Train Test Split:</b> We will now split the data to train and test sets to perform classification
</div>

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_processed_data, y, test_size=0.33, random_state=42)

<div class="alert alert-block alert-info">
<b>Hyperparameter Grid:</b> We now create a hyperparameter grid for only one hyper parameter(we could have selected other hyper parameters to tune as well but it will take time to execute and we were getting fair results from the default parameters as well)
</div>

In [9]:
random_forest_grid = {'n_estimators' : hp.choice('n_estimators',range(100,150))}

<div class="alert alert-block alert-info">
<b>Tuning :</b> We tune our model and eventually get a dictionary of hyper-parameters tuned
</div>

In [10]:
def hyperopt_train_test(params):
    clf = RandomForestClassifier(**params)
    return cross_val_score(clf, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, random_forest_grid, algo=tpe.suggest, max_evals=200, trials=trials)
best_parameters = space_eval(random_forest_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████| 200/200 [10:25<00:00,  3.10s/it, best loss: -0.8405692645610724]
The best parameter tuned on training set is given by :-  {'n_estimators': 148}


<div class="alert alert-block alert-info">
<b>Model Fitting and analysis:</b> We finally fit the model and then evaluate some metrics for performance analysis
</div>

In [11]:
rfclf = RandomForestClassifier(n_estimators       = best_parameters['n_estimators'])
rfclf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=148,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
y_hat = rfclf.predict(X_test)
print(classification_report(y_hat,y_test))

              precision    recall  f1-score   support

           1       0.77      0.76      0.77       682
           2       0.68      0.81      0.74       605
           3       0.81      0.82      0.81       692
           4       0.98      0.92      0.95       781
           5       0.94      0.88      0.91       769
           6       0.86      0.84      0.85       738
           7       0.97      0.94      0.95       723

    accuracy                           0.86      4990
   macro avg       0.86      0.85      0.85      4990
weighted avg       0.86      0.86      0.86      4990



<div class="alert alert-block alert-info">
<b>Added part:</b> Since the data selected was a part of some competition, they provided a separate dataset for validation purposes. The final step of the analysis was done in the previous step itself. This output is just there for the examiner to grade. (PS:- I haven't uploaded anything there because I used the data just to demonstrate the usage of Random Forests only)
</div>

In [15]:
scaled_columns    = scaler.fit_transform(df_validation[columns_to_scale]) 
encoded_columns   = df_validation[columns_to_encode].values
X_validation_data = np.concatenate([scaled_columns, encoded_columns], axis=1)

In [17]:
predicted_output = rfclf.predict(X_validation_data)

In [23]:
df_validation['predict'] = predicted_output
df_validation[['Id','predict']]

Unnamed: 0,Id,predict
0,15121,2
1,15122,1
2,15123,2
3,15124,2
4,15125,2
5,15126,2
6,15127,2
7,15128,2
8,15129,2
9,15130,2
