# Classify forest types based on information about the area

## Import libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold

from sklearn.ensemble import RandomForestClassifier

import seaborn as sns
import matplotlib.pyplot as plt

import catboost

In [2]:
# Fix the seed to get reproducible results
SEED = 2019

In [3]:
# Name of the label column to predict
TARGET = 'Cover_Type'

## Load data

In [5]:
# Labelled training set; the first CSV column is used as the index.
train = pd.read_csv(
    './data/train.csv',
    index_col=0,
    sep=',',
)

In [6]:
# Dimensions first, then a preview of the leading rows
print(train.shape)
train.head()

(15120, 55)


Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
2,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
3,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
4,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
5,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [7]:
# Unlabelled set to predict on; the first CSV column is used as the index.
test = pd.read_csv(
    './data/test.csv',
    index_col=0,
    sep=',',
)

In [8]:
# Dimensions first, then a preview of the leading rows
print(test.shape)
test.head()

(565892, 54)


Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15121,2680,354,14,0,0,2684,196,214,156,6645,...,0,0,0,0,0,0,0,0,0,0
15122,2683,0,13,0,0,2654,201,216,152,6675,...,0,0,0,0,0,0,0,0,0,0
15123,2713,16,15,0,0,2980,206,208,137,6344,...,0,0,0,0,0,0,0,0,0,0
15124,2709,24,17,0,0,2950,208,201,125,6374,...,0,0,0,0,0,0,0,0,0,0
15125,2706,29,19,0,0,2920,210,195,115,6404,...,0,0,0,0,0,0,0,0,0,0


## Simple baseline

Let's try a few models to see what score we can get without any feature engineering.

### Data processing

In [10]:
# Summarise the training frame (column dtypes and non-null counts)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15120 entries, 1 to 15120
Data columns (total 55 columns):
Elevation                             15120 non-null int64
Aspect                                15120 non-null int64
Slope                                 15120 non-null int64
Horizontal_Distance_To_Hydrology      15120 non-null int64
Vertical_Distance_To_Hydrology        15120 non-null int64
Horizontal_Distance_To_Roadways       15120 non-null int64
Hillshade_9am                         15120 non-null int64
Hillshade_Noon                        15120 non-null int64
Hillshade_3pm                         15120 non-null int64
Horizontal_Distance_To_Fire_Points    15120 non-null int64
Wilderness_Area1                      15120 non-null int64
Wilderness_Area2                      15120 non-null int64
Wilderness_Area3                      15120 non-null int64
Wilderness_Area4                      15120 non-null int64
Soil_Type1                            15120 non-null int64
Soil_T

In [11]:
# Is any value missing anywhere in the training frame?
train.isna().values.any()

False

In [12]:
# Split the training frame into the target vector and the feature matrix
train_labels = train[TARGET]
train_data = train.drop(columns=[TARGET])

# Report both shapes, features first, one per line
print(train_data.shape, train_labels.shape, sep='\n')

(15120, 54)
(15120,)


Some supporting functions:

In [13]:
# For prediction
def predict(model, filename, fit_model=False, X=train_data, y=train_labels, test=test):
    """Predict labels for ``test`` and write them to ``<filename>.csv``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    filename : str
        Output path without extension; ``'.csv'`` is appended.
    fit_model : bool, default False
        When truthy, fit ``model`` on ``(X, y)`` before predicting.
        Otherwise ``model`` is used as passed in.
    X, y : training features and labels
        Only used when ``fit_model`` is truthy.
    test : pandas.DataFrame
        Rows to predict; its index is written as the ``Id`` column.

    Returns
    -------
    The predictions cast to ``int64``, in the shape ``model.predict`` returned.

    Notes
    -----
    The defaults for ``X``, ``y`` and ``test`` are bound to the objects that
    exist when this cell is executed. If ``train_data``, ``train_labels`` or
    ``test`` are reassigned later, pass them explicitly or re-run this cell.
    """
    if fit_model:
        model.fit(X, y)
        print('Model {} is fitted!'.format(model))

    predicts = model.predict(test)
    predicts = predicts.astype('int64')
    print('Model {} made the prediction!'.format(model))

    # Flatten to one value per test row (raises if the sizes do not match) and
    # pin the column order explicitly instead of relying on dict ordering.
    output = pd.DataFrame(
        {'Id': test.index, TARGET: predicts.reshape(len(test))},
        columns=['Id', TARGET],
    )
    output.to_csv(filename + '.csv', index=False)
    return predicts

In [14]:
# For KFold cross-validation
def cross_val(model, X=train_data, y=train_labels, n_splits=3, scoring='accuracy',
              stratified=True, shuffle=False, verbose=True):
    """Score ``model`` with k-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``cross_val_score``.
    X, y : features and labels to split into folds.
    n_splits : int, default 3
        Number of folds.
    scoring : str, default 'accuracy'
        Scoring name forwarded to ``cross_val_score``.
    stratified : bool, default True
        Use ``StratifiedKFold`` when truthy, plain ``KFold`` otherwise.
    shuffle : bool, default False
        Whether the splitter shuffles rows before splitting. ``SEED`` is used
        as the splitter's ``random_state`` only when this is truthy.
    verbose : bool, default True
        Print the per-fold scores and their mean.

    Returns
    -------
    tuple
        ``(scores, mean_score)`` where ``scores`` is the array returned by
        ``cross_val_score``.

    Notes
    -----
    The defaults for ``X`` and ``y`` are bound to the objects that exist when
    this cell is executed; pass them explicitly after reassigning
    ``train_data`` or ``train_labels``.
    """
    splitter_cls = StratifiedKFold if stratified else KFold
    # A seed only has an effect when shuffling. scikit-learn >= 0.24 raises a
    # ValueError if random_state is set while shuffle=False, so pass None then.
    cv = splitter_cls(n_splits=n_splits, shuffle=shuffle,
                      random_state=SEED if shuffle else None)

    cv_model = cross_val_score(model, X, y, scoring=scoring, cv=cv)
    if verbose:
        print(cv_model)
        print('Mean: {0:.4f}'.format(cv_model.mean()))
    return cv_model, cv_model.mean()

Let's explore how the classes are balanced:

In [15]:
# Count the training samples per target class
train[TARGET].value_counts()

7    2160
6    2160
5    2160
4    2160
3    2160
2    2160
1    2160
Name: Cover_Type, dtype: int64

Every class has the same number of samples! Very good: we can use the classification algorithms right away and apply cross-validation to validate them.

In [17]:
# Positional indices (10..53) of the categorical feature columns, stored
# separately because they are useful for gradient boosting algorithms
cat_features = list(range(10, 54))

In [18]:
# Hold out 20% of the labelled rows for validation, stratified by class.
# NOTE(review): this split is seeded with 0 rather than the notebook-wide SEED.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=0,
)

### Classification models

Since this is a multiclass classification problem (each sample belongs to exactly one of seven cover types), let's try a Random Forest classifier:

### Random forest classifier

In [19]:
# Baseline forest: 200 trees, depth capped at 50.
# NOTE(review): seeded with 0 rather than the notebook-wide SEED.
classifier_RF = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    random_state=0,
)

In [20]:
# Fit on the full labelled set (not the X_train hold-out split)
classifier_RF.fit(train_data, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=50, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [21]:
# cross_val returns a tuple: (per-fold scores, mean score).
# Defaults used here: 3 stratified folds, accuracy scoring.
cv_rf = cross_val(classifier_RF)

[0.79345238 0.77619048 0.78789683]
Mean: 0.7858


Let's make the first submission with Random Forest (random_state = 0, max_depth = 50, n_estimators = 200).

In [22]:
# First submission with RF: fit_model defaults to False, so the classifier
# is used as already fitted and the predictions are written to RF_baseline.csv
predict(classifier_RF, 'RF_baseline')

Model RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=50, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False) made the prediction!


array([2, 1, 2, ..., 3, 3, 6])

Public score became 0.75178. Not bad!