## Forest Cover Type Prediction
## Junfei Ma
### Data source: <https://www.kaggle.com/c/forest-cover-type-prediction>

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import time

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import ExtraTreesClassifier
import lazypredict
from lazypredict.Supervised import LazyClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import plotly.express as px

In [2]:
# Read the training and test tables from CSV files in the working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [3]:
# Show the training frame exactly as loaded (the cell's last expression is rendered).
df_train

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15115,15116,2607,243,23,258,7,660,170,251,214,...,0,0,0,0,0,0,0,0,0,3
15116,15117,2603,121,19,633,195,618,249,221,91,...,0,0,0,0,0,0,0,0,0,3
15117,15118,2492,134,25,365,117,335,250,220,83,...,0,0,0,0,0,0,0,0,0,3
15118,15119,2487,167,28,218,101,242,229,237,119,...,0,0,0,0,0,0,0,0,0,3


In [4]:
# Show the test frame exactly as loaded (the cell's last expression is rendered).
df_test

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,15121,2680,354,14,0,0,2684,196,214,156,...,0,0,0,0,0,0,0,0,0,0
1,15122,2683,0,13,0,0,2654,201,216,152,...,0,0,0,0,0,0,0,0,0,0
2,15123,2713,16,15,0,0,2980,206,208,137,...,0,0,0,0,0,0,0,0,0,0
3,15124,2709,24,17,0,0,2950,208,201,125,...,0,0,0,0,0,0,0,0,0,0
4,15125,2706,29,19,0,0,2920,210,195,115,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565887,581008,2396,153,20,85,17,108,240,237,118,...,0,0,0,0,0,0,0,0,0,0
565888,581009,2391,152,19,67,12,95,240,237,119,...,0,0,0,0,0,0,0,0,0,0
565889,581010,2386,159,17,60,7,90,236,241,130,...,0,0,0,0,0,0,0,0,0,0
565890,581011,2384,170,15,60,5,90,230,245,143,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Drop the Id column from both frames; it is only needed later as the key of
# the submission files, where it is read back from test.csv.
# The column is removed by name rather than by position so the cell is safe
# to re-run: a positional slice would strip a real feature (Elevation) on a
# second execution, while this version is a no-op once Id is gone.
df_train = df_train.drop(columns=["Id"], errors="ignore")
df_test = df_test.drop(columns=["Id"], errors="ignore")

In [6]:
# Data type of every column in the training frame, printed as plain text.
column_dtypes = df_train.dtypes
print(column_dtypes)

Elevation                             int64
Aspect                                int64
Slope                                 int64
Horizontal_Distance_To_Hydrology      int64
Vertical_Distance_To_Hydrology        int64
Horizontal_Distance_To_Roadways       int64
Hillshade_9am                         int64
Hillshade_Noon                        int64
Hillshade_3pm                         int64
Horizontal_Distance_To_Fire_Points    int64
Wilderness_Area1                      int64
Wilderness_Area2                      int64
Wilderness_Area3                      int64
Wilderness_Area4                      int64
Soil_Type1                            int64
Soil_Type2                            int64
Soil_Type3                            int64
Soil_Type4                            int64
Soil_Type5                            int64
Soil_Type6                            int64
Soil_Type7                            int64
Soil_Type8                            int64
Soil_Type9                      

In [7]:
# Number of missing values per training column (isna is the same check as isnull).
df_train.isna().sum()

Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area1                      0
Wilderness_Area2                      0
Wilderness_Area3                      0
Wilderness_Area4                      0
Soil_Type1                            0
Soil_Type2                            0
Soil_Type3                            0
Soil_Type4                            0
Soil_Type5                            0
Soil_Type6                            0
Soil_Type7                            0
Soil_Type8                            0
Soil_Type9                            0
Soil_Type10                           0
Soil_Type11                           0


In [8]:
# Distinct-value count per column; a count of 1 marks a constant column.
df_train.nunique()

Elevation                             1665
Aspect                                 361
Slope                                   52
Horizontal_Distance_To_Hydrology       400
Vertical_Distance_To_Hydrology         423
Horizontal_Distance_To_Roadways       3250
Hillshade_9am                          176
Hillshade_Noon                         141
Hillshade_3pm                          247
Horizontal_Distance_To_Fire_Points    2710
Wilderness_Area1                         2
Wilderness_Area2                         2
Wilderness_Area3                         2
Wilderness_Area4                         2
Soil_Type1                               2
Soil_Type2                               2
Soil_Type3                               2
Soil_Type4                               2
Soil_Type5                               2
Soil_Type6                               2
Soil_Type7                               1
Soil_Type8                               2
Soil_Type9                               2
Soil_Type10

In [9]:
# Remove Soil_Type7 and Soil_Type15 from the training frame.
# Soil_Type7 shows a single unique value in the nunique() listing; Soil_Type15
# is presumably constant too (its row is cut off in the saved output — verify).
# Re-binding the name with errors="ignore" instead of mutating in place keeps
# the cell re-runnable (a second in-place drop raises KeyError) and avoids
# modifying a frame that was produced by slicing.
df_train = df_train.drop(columns=["Soil_Type7", "Soil_Type15"], errors="ignore")

In [10]:
# Remove Soil_Type7 and Soil_Type15 from the test frame so its feature
# columns stay aligned with the training frame, which drops the same pair.
# Re-binding the name with errors="ignore" instead of mutating in place keeps
# the cell re-runnable (a second in-place drop raises KeyError).
df_test = df_test.drop(columns=["Soil_Type7", "Soil_Type15"], errors="ignore")

In [11]:
# Re-check distinct-value counts after the two soil-type columns were dropped.
df_train.nunique()

Elevation                             1665
Aspect                                 361
Slope                                   52
Horizontal_Distance_To_Hydrology       400
Vertical_Distance_To_Hydrology         423
Horizontal_Distance_To_Roadways       3250
Hillshade_9am                          176
Hillshade_Noon                         141
Hillshade_3pm                          247
Horizontal_Distance_To_Fire_Points    2710
Wilderness_Area1                         2
Wilderness_Area2                         2
Wilderness_Area3                         2
Wilderness_Area4                         2
Soil_Type1                               2
Soil_Type2                               2
Soil_Type3                               2
Soil_Type4                               2
Soil_Type5                               2
Soil_Type6                               2
Soil_Type8                               2
Soil_Type9                               2
Soil_Type10                              2
Soil_Type11

In [58]:
# Keep only the first ten columns for the correlation plot. Per the dtypes
# listing these run from Elevation through Horizontal_Distance_To_Fire_Points,
# i.e. everything before the Wilderness_Area / Soil_Type indicator columns.
df_corr = df_train.iloc[:,:10]
df_corr

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points
0,2596,51,3,258,0,510,221,232,148,6279
1,2590,56,2,212,-6,390,220,235,151,6225
2,2804,139,9,268,65,3180,234,238,135,6121
3,2785,155,18,242,118,3090,238,238,122,6211
4,2595,45,2,153,-1,391,220,234,150,6172
...,...,...,...,...,...,...,...,...,...,...
15115,2607,243,23,258,7,660,170,251,214,1282
15116,2603,121,19,633,195,618,249,221,91,1325
15117,2492,134,25,365,117,335,250,220,83,1187
15118,2487,167,28,218,101,242,229,237,119,932


In [59]:
# Heat map of the pairwise correlations between the columns of df_corr,
# with each coefficient written inside its cell.
corr_matrix = df_corr.corr()
fig = px.imshow(
    corr_matrix,
    text_auto=True,
    aspect="auto",
    color_continuous_scale="redor",
)
fig.show()

In [14]:
# Model on the labelled training file only: 70% of its rows are used for
# fitting and the remaining 30% are held out as a local test set.
X = df_train.iloc[:, :-1]  # every column except the last one
y = df_train.iloc[:, -1]   # the last column (Cover_Type) is the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
)


In [15]:
# Baseline sweep with LazyClassifier (classifiers='all') to see which model
# families perform well on this dataset before tuning any of them.
lazy_settings = {
    "verbose": 0,
    "ignore_warnings": True,
    "custom_metric": None,
    "predictions": False,
    "random_state": 12,
    "classifiers": 'all',
}
clf = LazyClassifier(**lazy_settings)

# fit() returns the score table and a second object that is not used below.
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|██████████| 29/29 [00:56<00:00,  1.94s/it]


In [16]:
# Score table returned by LazyClassifier.fit; the top-ranked models
# (Extra Trees, Random Forest) are analysed further in the sections below.
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,0.86,0.86,,0.86,0.94
RandomForestClassifier,0.86,0.86,,0.85,1.15
LGBMClassifier,0.85,0.85,,0.85,0.85
BaggingClassifier,0.84,0.84,,0.84,0.59
LabelPropagation,0.8,0.8,,0.8,5.23
LabelSpreading,0.8,0.8,,0.8,6.27
KNeighborsClassifier,0.78,0.79,,0.78,0.8
DecisionTreeClassifier,0.79,0.79,,0.78,0.11
ExtraTreeClassifier,0.74,0.74,,0.73,0.04
SVC,0.73,0.73,,0.73,4.3


### Extra Trees Classifier	

In [17]:
# Untuned Extra Trees estimator. Its only visible consumer is the
# commented-out GridSearchCV cell; the name is rebound to the tuned model
# before any fit is run.
xt_clf = ExtraTreesClassifier(random_state=42)

In [18]:
# Hyper-parameter grid for the Extra Trees grid search.
# 'auto' is intentionally absent from max_features: for classifiers it was
# only an alias of 'sqrt' (so it doubled the work for that candidate) and
# scikit-learn >= 1.3 rejects it outright.
param_grid = {
    'n_estimators': [10, 20, 50],
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 5, 8, 13, 55, 89, None],
    'min_samples_split': [2, 5, 13, 89, 144],
    'min_samples_leaf': [1, 2, 5, 13, 89, 144],
    'max_features': ['sqrt', 'log2', 2, 5, None],
}

In [19]:
#CV_xt_clf = GridSearchCV(estimator=xt_clf, param_grid=param_grid, cv= 5,n_jobs=-1)
#CV_xt_clf.fit(X_train, y_train)

In [20]:
#CV_xt_clf.best_params_

In [21]:
#{'criterion': 'entropy',
# 'max_depth': 55,
# 'max_features': None,
# 'min_samples_leaf': 1,
# 'min_samples_split': 2,
# 'n_estimators': 50}

In [22]:
# Extra Trees model rebuilt with the parameter set recorded from the grid
# search (kept as a comment in the cell above), so the search itself does
# not have to be re-run.
xt_clf = ExtraTreesClassifier(
    n_estimators=50,
    criterion='entropy',
    max_depth=55,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    random_state=42,
)

In [23]:
# Fit the tuned Extra Trees model on the 70% training split
# (names available from the split: X_train, X_test, y_train, y_test).

xt_clf.fit(X_train,y_train)

ExtraTreesClassifier(criterion='entropy', max_depth=55, max_features=None,
                     n_estimators=50, random_state=42)

In [24]:
# Accuracy of the Extra Trees model on the 30% hold-out split.
xt_holdout_accuracy = xt_clf.score(X_test, y_test)
print("Test Accuracy:", xt_holdout_accuracy)

Test Accuracy: 0.8719135802469136


In [25]:
# Predict labels for the Kaggle test frame with the Extra Trees model.
pred=xt_clf.predict(df_test)

In [26]:
# Write the Extra Trees predictions as a submission file (Id, Cover_Type).
# df_test no longer carries Id, so it is read back from test.csv; usecols
# limits parsing to that single column instead of loading every feature
# again, and yields a standalone frame that the predictions can be attached
# to without assigning into a slice of another frame.
df_test_result = pd.read_csv("test.csv", usecols=["Id"])
df_test_result["Cover_Type"] = pred
df_test_result.to_csv('submission_1.csv', index=False)

In [27]:
# Show the test frame as used for prediction (Id and the two soil-type columns removed).
df_test

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,2680,354,14,0,0,2684,196,214,156,6645,...,0,0,0,0,0,0,0,0,0,0
1,2683,0,13,0,0,2654,201,216,152,6675,...,0,0,0,0,0,0,0,0,0,0
2,2713,16,15,0,0,2980,206,208,137,6344,...,0,0,0,0,0,0,0,0,0,0
3,2709,24,17,0,0,2950,208,201,125,6374,...,0,0,0,0,0,0,0,0,0,0
4,2706,29,19,0,0,2920,210,195,115,6404,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565887,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,0
565888,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,0
565889,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,0
565890,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,0


### Random Forest Classifier

In [28]:
# Untuned random forest. Its only visible consumer is the commented-out
# GridSearchCV cell; the name is rebound to the tuned model before any fit.
rfc=RandomForestClassifier(random_state=42)

In [29]:
# Hyper-parameter grid for the random forest grid search.
# 'auto' is intentionally absent from max_features: for classifiers it was
# only an alias of 'sqrt' (so it doubled the work for that candidate) and
# scikit-learn >= 1.3 rejects it outright.
param_grid = {
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'criterion': ['gini', 'entropy'],
}

In [30]:
#CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5,n_jobs=-1)
#CV_rfc.fit(X_train, y_train)

In [31]:
#CV_rfc.best_params_

In [32]:
#{'criterion': 'gini',
# 'max_depth': 40,
# 'max_features': 'auto',
# 'n_estimators': 800}

In [33]:
# Random forest rebuilt with the parameter set recorded from the grid search
# (kept as a comment in the cell above).
# max_features is written as 'sqrt' instead of the recorded 'auto': for
# classifiers the two select the same number of features, but 'auto' was
# deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
rfc = RandomForestClassifier(
    random_state=42,
    criterion='gini',
    max_depth=40,
    max_features='sqrt',
    n_estimators=800,
)

In [34]:
# Fit the tuned random forest on the 70% training split (all features).
rfc.fit(X_train,y_train)

RandomForestClassifier(max_depth=40, n_estimators=800, random_state=42)

In [35]:
# Accuracy of the random forest on the 30% hold-out split.
rfc_holdout_accuracy = rfc.score(X_test, y_test)
print("Test Accuracy:", rfc_holdout_accuracy)

Test Accuracy: 0.8582451499118166


In [36]:
# Predict labels for the Kaggle test frame with the random forest.
pred=rfc.predict(df_test)

In [37]:
# Write the random forest predictions as a submission file (Id, Cover_Type).
# df_test no longer carries Id, so it is read back from test.csv; usecols
# limits parsing to that single column instead of loading every feature
# again, and yields a standalone frame that the predictions can be attached
# to without assigning into a slice of another frame.
df_test_result = pd.read_csv("test.csv", usecols=["Id"])
df_test_result["Cover_Type"] = pred
df_test_result.to_csv('submission_2.csv', index=False)

### Random Forest Classifier with backward selection

In [38]:
#sf = SequentialFeatureSelector(rfc, scoring='accuracy', direction = 'backward')
#sf.fit(X,y)

In [39]:
#Result of the backward selection: False means the column is not included
#sf.get_support()

In [40]:
#array([ True, False, False,  True, False,  True,  True,  True, False,
#        True,  True, False, False,  True, False,  True, False,  True,
#       False,  True,  True,  True,  True,  True,  True,  True, False,
#       False,  True,  True, False, False, False,  True,  True, False,
#        True, False, False, False, False,  True, False, False, False,
#       False, False,  True, False,  True,  True, False])

In [41]:
# Hard-coded copy of the backward-selection result recorded in the cell
# above, so the selector does not have to be re-fitted.
# The mask has one entry per feature column (52 in total); the positions
# listed here are the 26 entries that are True, i.e. the features to keep.
SELECTED_FEATURE_POSITIONS = (
    0, 3, 5, 6, 7,
    9, 10, 13, 15, 17,
    19, 20, 21, 22, 23, 24, 25,
    28, 29, 33, 34,
    36, 41,
    47, 49, 50,
)
sf_get_support = [
    position in SELECTED_FEATURE_POSITIONS for position in range(52)
]

In [42]:
# Untuned random forest for the feature-selected data. Its only visible
# consumer is the commented-out GridSearchCV cell; the name is rebound to
# the tuned model before any fit.
rfc1=RandomForestClassifier(random_state=42)

In [43]:
# Restrict the training split to the features kept by backward selection
# (sf_get_support is the hard-coded copy of sf.get_support()).
# The columns are taken from the full feature frame X, limited to the rows
# of the training split, rather than from X_train itself. The first run
# gives the same frame as masking X_train directly, and the cell stays
# re-runnable: once X_train has been narrowed, the 52-entry mask no longer
# matches its columns and a self-referential selection would fail.
X_train = X.loc[X_train.index, sf_get_support]

In [44]:
# Hyper-parameter grid for the random forest grid search on the
# feature-selected data.
# 'auto' is intentionally absent from max_features: for classifiers it was
# only an alias of 'sqrt' (so it doubled the work for that candidate) and
# scikit-learn >= 1.3 rejects it outright.
param_grid = {
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'criterion': ['gini', 'entropy'],
}

In [45]:
#CV_rfc = GridSearchCV(estimator=rfc1, param_grid=param_grid, cv= 5,n_jobs=-1)
#CV_rfc.fit(X_train, y_train)

In [46]:
#CV_rfc.best_params_

In [47]:
#{'criterion': 'gini',
# 'max_depth': 40,
# 'max_features': 'auto',
# 'n_estimators': 600}

In [48]:
# Random forest for the feature-selected data, rebuilt with the parameter
# set recorded from the grid search (kept as a comment in the cell above).
# max_features is written as 'sqrt' instead of the recorded 'auto': for
# classifiers the two select the same number of features, but 'auto' was
# deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
rfc1 = RandomForestClassifier(
    random_state=42,
    criterion='gini',
    max_depth=40,
    max_features='sqrt',
    n_estimators=600,
)

In [49]:
# Fit on the training split, which at this point holds only the selected features.
rfc1.fit(X_train,y_train)

RandomForestClassifier(max_depth=40, n_estimators=600, random_state=42)

In [50]:
# Restrict the hold-out split to the same selected features as the training
# split. The columns are taken from the full feature frame X, limited to
# the hold-out rows, so the cell can be re-run: once X_test has been
# narrowed, the 52-entry mask no longer matches its columns and a
# self-referential selection would fail.
X_test = X.loc[X_test.index, sf_get_support]

In [51]:
# Accuracy of the feature-selected random forest on the 30% hold-out split.
rfc1_holdout_accuracy = rfc1.score(X_test, y_test)
print("Test Accuracy:", rfc1_holdout_accuracy)

Test Accuracy: 0.8628747795414462


In [52]:
# Restrict the Kaggle test frame to the selected features.
# The selection is made by column name (the names of X under the mask)
# instead of by applying the positional mask to df_test directly. That
# gives the same columns on the first run, raises a KeyError if the test
# frame lacks one of them, and keeps the cell re-runnable after df_test has
# already been narrowed.
df_test = df_test.loc[:, X.columns[sf_get_support]]

In [53]:
# Predict labels for the feature-selected Kaggle test frame.
pred=rfc1.predict(df_test)

In [54]:
# Write the feature-selected random forest predictions as a submission file
# (Id, Cover_Type).
# df_test no longer carries Id, so it is read back from test.csv; usecols
# limits parsing to that single column instead of loading every feature
# again, and yields a standalone frame that the predictions can be attached
# to without assigning into a slice of another frame.
df_test_result = pd.read_csv("test.csv", usecols=["Id"])
df_test_result["Cover_Type"] = pred
df_test_result.to_csv('submission_3.csv', index=False)