In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

/kaggle/input/forest-cover-type-prediction/test.csv
/kaggle/input/forest-cover-type-prediction/train.csv
/kaggle/input/forest-cover-type-prediction/sampleSubmission.csv


In [2]:
from sklearn.preprocessing import normalize
from sklearn import ensemble
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

In [3]:
%%time
# Locations of the competition CSVs (input) and of the submission file (output).
train_csv = '../input/forest-cover-type-prediction/train.csv'
test_csv = '../input/forest-cover-type-prediction/test.csv'
loc_submission = 'submission.csv'

# Load train first, then test.
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

CPU times: user 1.56 s, sys: 412 ms, total: 1.97 s
Wall time: 1.98 s


# Normalization

In [4]:
%%time 
# Columns that are rescaled in place on both frames.
cols_to_normalize = [
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

# NOTE(review): normalize() is called with its defaults, which (per the
# scikit-learn docs) rescale each ROW to unit L2 norm rather than each column.
# Confirm that per-sample scaling across these mixed-unit features is intended.
for frame in (df_train, df_test):
    frame[cols_to_normalize] = normalize(frame[cols_to_normalize])

CPU times: user 920 ms, sys: 1.11 s, total: 2.03 s
Wall time: 2.02 s


# Feature Engineering

In [5]:
# Names of the engineered columns that the following cells add to both frames.
engineered_cols = [
    'binned_elevation',
    'Horizontal_Distance_To_Roadways_Log',
    'Soil_Type12_32',
    'Soil_Type23_22_32_33',
]

# Raw feature columns = everything except the target, the row id and the
# engineered columns. Excluding the engineered names keeps this cell idempotent:
# once they exist in df_train, re-running the cell would otherwise list them
# twice and df_train[feature_cols] would contain duplicated columns.
non_raw_cols = {'Cover_Type', 'Id', *engineered_cols}
feature_cols = [col for col in df_train.columns if col not in non_raw_cols] + engineered_cols

In [6]:
# Bucket Elevation into bins of width 50 (bin index = floor(Elevation / 50)).
# Vectorised: one NumPy call per frame instead of a Python-level loop over every
# row. The cast to int64 keeps the integer dtype the list-of-ints version produced.
for frame in (df_train, df_test):
    frame['binned_elevation'] = np.floor(frame['Elevation'] / 50.0).astype('int64')

In [7]:
# log(1 + distance) of the roadway distance, computed for both frames.
# Vectorised with np.log1p instead of calling math.log row by row; results agree
# with log(v + 1) up to floating-point rounding.
for frame in (df_train, df_test):
    frame['Horizontal_Distance_To_Roadways_Log'] = np.log1p(frame['Horizontal_Distance_To_Roadways'])

In [8]:
# Combined soil-type features: element-wise sums of selected Soil_Type columns
# (presumably 0/1 indicator columns -- confirm against the dataset description).
for frame in (df_train, df_test):
    frame['Soil_Type12_32'] = frame['Soil_Type32'] + frame['Soil_Type12']
    frame['Soil_Type23_22_32_33'] = (
        frame['Soil_Type23']
        + frame['Soil_Type22']
        + frame['Soil_Type32']
        + frame['Soil_Type33']
    )

# Train / Test Sets

In [9]:
def _keep_only(labels, kept_classes):
    """Return a copy of `labels` where every class outside `kept_classes` is set to 999."""
    relabelled = labels.copy()
    relabelled[~labels.isin(kept_classes)] = 999
    return relabelled


def _one_hot(labels):
    """Fit a fresh OneHotEncoder on `labels`; return (encoder, dense one-hot matrix)."""
    encoder = OneHotEncoder()
    dense = encoder.fit_transform(labels.values.reshape(-1, 1)).toarray()
    return encoder, dense


# Feature matrices share the same column list for train and test.
X_train, X_test = df_train[feature_cols], df_test[feature_cols]

# Full 7-class target plus two reduced targets in which every class outside
# the group of interest is collapsed into the placeholder label 999.
y_train = df_train['Cover_Type']
y_train_1_2 = _keep_only(df_train['Cover_Type'], [1, 2])
y_train_3_4_6 = _keep_only(df_train['Cover_Type'], [3, 4, 6])

# One encoder (and one dense one-hot target matrix) per label set.
ohe, y_train_onehot = _one_hot(y_train)
ohe_1_2, y_train_1_2_onehot = _one_hot(y_train_1_2)
ohe_3_4_6, y_train_3_4_6_onehot = _one_hot(y_train_3_4_6)

# Row ids of the test set, needed later for the submission file.
test_ids = df_test['Id']

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [10]:
# Every original class label (1..7) as a column vector, used to probe each encoder.
test_data = np.arange(1, 8).reshape(-1, 1)

# For each target encoding, show: the one-hot matrix shape, the categories the
# encoder learned, and how it encodes the seven original labels.
encodings = (
    (y_train_onehot, ohe),
    (y_train_1_2_onehot, ohe_1_2),
    (y_train_3_4_6_onehot, ohe_3_4_6),
)
for onehot_matrix, encoder in encodings:
    print(onehot_matrix.shape)
    print(encoder.categories_)
    print(encoder.transform(test_data).toarray())

(15120, 7)
[array([1., 2., 3., 4., 5., 6., 7.])]
[[1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1.]]
(15120, 3)
[array([  1.,   2., 999.])]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(15120, 4)
[array([  3.,   4.,   6., 999.])]
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 0.]]


# Define Stacking Network Class

In [11]:
from tqdm import tqdm_notebook

class TreesAggreator():
    """Fit several classifiers on the same data and stack their class probabilities.

    The ``predict_proba`` outputs of all classifiers are placed side by side, so
    the result can be used as meta-features for a next-level model.

    Args:
        clfs (list): classifiers exposing ``fit(X, y)`` and ``predict_proba(X)``.
        num_classes (int): number of probability columns every classifier is
            expected to produce.

    Attributes:
        output_dim (int): ``num_classes * len(clfs)``, the width of the
            meta-feature matrix returned by ``predict``.
    """
    def __init__(self, clfs, num_classes):
        self.clfs = clfs
        self.num_classes = num_classes

        # calculate the total number of output dim
        self.output_dim = self.num_classes * len(self.clfs)

    @staticmethod
    def _progress_bar(verbose, message):
        """Return the iterable wrapper for the classifier loop.

        When ``verbose`` is true, prints ``message`` and returns the notebook
        progress bar; otherwise returns ``list`` (a plain, silent pass-through).
        """
        if verbose:
            print(message)
            return tqdm_notebook
        return list

    def _check_proba(self, clf, proba):
        """Return ``proba`` as an array after checking it has ``num_classes`` columns.

        Raises:
            ValueError: if ``proba`` is not 2-D with exactly ``num_classes`` columns.
                Without this check a mismatching output would either be silently
                broadcast or fail with an unrelated-looking NumPy error.
        """
        proba = np.asarray(proba)
        if proba.ndim != 2 or proba.shape[1] != self.num_classes:
            raise ValueError(
                '{} returned probabilities of shape {}, expected num_classes={} columns'.format(
                    type(clf).__name__, proba.shape, self.num_classes))
        return proba

    def fit(self, X, y, verbose=True):
        """Fit every classifier in ``self.clfs`` on ``(X, y)``.

        Args:
            X: training features, passed unchanged to each ``clf.fit``.
            y: training labels, passed unchanged to each ``clf.fit``.
            verbose (bool): show a message and a progress bar while fitting.

        Returns:
            self, so that calls can be chained.
        """
        progress_bar = self._progress_bar(verbose, 'Start fitting...')

        for clf in progress_bar(self.clfs):
            # Train
            clf.fit(X, y)

        return self

    def predict(self, X, verbose=True):
        """Build the meta-feature matrix for ``X``.

        Args:
            X: features to predict on; needs a leading dimension of n_samples.
            verbose (bool): show a message and a progress bar while predicting.

        Returns:
            numpy.ndarray of shape ``(n_samples, output_dim)``. Columns
            ``[i * num_classes, (i + 1) * num_classes)`` hold the
            ``predict_proba`` output of ``self.clfs[i]``.

        Raises:
            ValueError: if a classifier does not return ``num_classes`` columns.
        """
        progress_bar = self._progress_bar(verbose, 'Start Predicting...')

        # np.shape also works for inputs without a .shape attribute (e.g. lists).
        meta_features = np.zeros((np.shape(X)[0], self.output_dim))

        for position, clf in enumerate(progress_bar(self.clfs)):
            # Each classifier owns one contiguous slice of num_classes columns.
            start = position * self.num_classes
            meta_features[:, start:start + self.num_classes] = self._check_proba(
                clf, clf.predict_proba(X))

        return meta_features

    def fit_predict(self, X, y, verbose=True, cv=None):
        """Fit the classifiers and return meta-features for the training data.

        Args:
            X: training features.
            y: training labels.
            verbose (bool): show messages and progress bars.
            cv: ``None`` (default) keeps the original behaviour: fit on ``(X, y)``
                and predict on the very same ``X``. These in-sample probabilities
                come from models that have already seen the labels, so a
                next-level model trained on them can overfit. Any other value is
                forwarded as the ``cv`` argument of scikit-learn's
                ``cross_val_predict`` to produce out-of-fold probabilities
                instead; the classifiers are then fitted on all of ``(X, y)``
                so that ``predict`` can be used on new data afterwards.

        Returns:
            numpy.ndarray of shape ``(n_samples, output_dim)``, laid out as in
            ``predict``.

        Raises:
            ValueError: if a classifier does not return ``num_classes`` columns.
        """
        if cv is None:
            self.fit(X, y, verbose)
            return self.predict(X, verbose)

        # Imported here because it is only needed on the out-of-fold path.
        from sklearn.model_selection import cross_val_predict

        progress_bar = self._progress_bar(verbose, 'Start out-of-fold predicting...')
        meta_features = np.zeros((np.shape(X)[0], self.output_dim))

        for position, clf in enumerate(progress_bar(self.clfs)):
            start = position * self.num_classes
            out_of_fold = cross_val_predict(clf, X, y, cv=cv, method='predict_proba')
            meta_features[:, start:start + self.num_classes] = self._check_proba(
                clf, out_of_fold)

        # Final fit on all rows so the aggregator is usable for test-time predict().
        self.fit(X, y, verbose)

        return meta_features
        

# Training - Level 1

In [12]:
%%time
# Network 1: four tree ensembles trained on the full 7-class target.
parallel_seeded = {'n_jobs': -1, 'random_state': 0}
trees_level_1 = [
    ensemble.ExtraTreesClassifier(n_estimators=100, **parallel_seeded),
    ensemble.RandomForestClassifier(n_estimators=200, **parallel_seeded),
    ensemble.GradientBoostingClassifier(n_estimators=100, random_state=0),
    ensemble.AdaBoostClassifier(n_estimators=50, random_state=0),
]

clfs_l1 = TreesAggreator(trees_level_1, num_classes=7)

# Fit, then predict on the same training rows to obtain the level-2 inputs
# (4 classifiers x 7 class probabilities = 28 columns).
clfs_l1.fit(X_train, y_train)
X_train_l2_all = clfs_l1.predict(X_train)

Start fitting...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


Start Predicting...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 39.8 s, sys: 332 ms, total: 40.2 s
Wall time: 30.2 s


#### Level 1: Network 2 (Label=1, 2)

In [13]:
%%time
# Network 2: same four tree ensembles, trained on the (1, 2, 999) target.
parallel_seeded = {'n_jobs': -1, 'random_state': 0}
trees_l1_1_2 = [
    ensemble.ExtraTreesClassifier(n_estimators=100, **parallel_seeded),
    ensemble.RandomForestClassifier(n_estimators=200, **parallel_seeded),
    ensemble.GradientBoostingClassifier(n_estimators=100, random_state=0),
    ensemble.AdaBoostClassifier(n_estimators=50, random_state=0),
]

clfs_l1_1_2 = TreesAggreator(trees_l1_1_2, num_classes=3)

# Fit, then predict on the same training rows to obtain the level-2 inputs
# (4 classifiers x 3 class probabilities = 12 columns).
clfs_l1_1_2.fit(X_train, y_train_1_2)
X_train_l2_1_2 = clfs_l1_1_2.predict(X_train)

Start fitting...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


Start Predicting...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 22.8 s, sys: 348 ms, total: 23.1 s
Wall time: 15 s


#### Level 1: Network 3 (Label=3, 4, 6)

In [14]:
%%time
# Network 3: same four tree ensembles, trained on the (3, 4, 6, 999) target.
parallel_seeded = {'n_jobs': -1, 'random_state': 0}
trees_l1_3_4_6 = [
    ensemble.ExtraTreesClassifier(n_estimators=100, **parallel_seeded),
    ensemble.RandomForestClassifier(n_estimators=200, **parallel_seeded),
    ensemble.GradientBoostingClassifier(n_estimators=100, random_state=0),
    ensemble.AdaBoostClassifier(n_estimators=50, random_state=0),
]

clfs_l1_3_4_6 = TreesAggreator(trees_l1_3_4_6, num_classes=4)

# Fit, then predict on the same training rows to obtain the level-2 inputs
# (4 classifiers x 4 class probabilities = 16 columns).
clfs_l1_3_4_6.fit(X_train, y_train_3_4_6)
X_train_l2_3_4_6 = clfs_l1_3_4_6.predict(X_train)

Start fitting...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


Start Predicting...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 24.7 s, sys: 308 ms, total: 25 s
Wall time: 18.1 s


#### Inspect the meta-features from the 3 networks (they are concatenated later, at level 3)

In [15]:
# Sanity check: container type of the meta-features, then the shape per network.
print(type(X_train_l2_all))
for meta_features in (X_train_l2_all, X_train_l2_1_2, X_train_l2_3_4_6):
    print(meta_features.shape)

<class 'numpy.ndarray'>
(15120, 28)
(15120, 12)
(15120, 16)


# Training - Level 2
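Fit one linear model per network on that network's level-1 meta-features.
Target labels are (1, 2, 3, 4, 5, 6, 7), (1, 2, 999) and (3, 4, 6, 999), where 999 stands for "any other class".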

In [16]:
%%time
# One linear regression per network: meta-features -> one-hot target of that
# network. Each model's predictions on the training meta-features become the
# level-3 inputs.

# Network 1: all seven classes.
clf_l2 = LinearRegression().fit(X_train_l2_all, y_train_onehot)
X_train_l3_all = clf_l2.predict(X_train_l2_all)

# Network 2: classes 1 and 2 versus the rest.
clf_l2_1_2 = LinearRegression().fit(X_train_l2_1_2, y_train_1_2_onehot)
X_train_l3_1_2 = clf_l2_1_2.predict(X_train_l2_1_2)

# Network 3: classes 3, 4 and 6 versus the rest.
clf_l2_3_4_6 = LinearRegression().fit(X_train_l2_3_4_6, y_train_3_4_6_onehot)
X_train_l3_3_4_6 = clf_l2_3_4_6.predict(X_train_l2_3_4_6)

CPU times: user 88 ms, sys: 40 ms, total: 128 ms
Wall time: 107 ms


# Training - Level 3

In [17]:
# Place the three level-2 outputs side by side (column-wise) to form the
# level-3 training matrix.
level_2_outputs = (X_train_l3_all, X_train_l3_1_2, X_train_l3_3_4_6)
X_train_l3 = np.hstack(level_2_outputs)

print(X_train_l3.shape)

(15120, 14)


In [18]:
%%time
# Level-3 meta-learner: a single linear regression from the concatenated
# level-2 outputs (X_train_l3) to the one-hot encoding of the full target.
clf_l3 = LinearRegression()
# Kept as the cell's last expression so the fitted estimator's repr is displayed.
clf_l3.fit(X_train_l3, y_train_onehot)

CPU times: user 136 ms, sys: 228 ms, total: 364 ms
Wall time: 94 ms


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Prediction

In [19]:
%%time
# Level 1: class probabilities from each of the three fitted networks, in order.
X_test_l2_all, X_test_l2_1_2, X_test_l2_3_4_6 = (
    aggregator.predict(X_test)
    for aggregator in (clfs_l1, clfs_l1_1_2, clfs_l1_3_4_6)
)

# Level 2: each network's linear model turns its meta-features into scores.
X_test_l3_all = clf_l2.predict(X_test_l2_all)
X_test_l3_1_2 = clf_l2_1_2.predict(X_test_l2_1_2)
X_test_l3_3_4_6 = clf_l2_3_4_6.predict(X_test_l2_3_4_6)

# Level 3: stack the level-2 scores column-wise and apply the final model.
X_test_l3 = np.hstack((X_test_l3_all, X_test_l3_1_2, X_test_l3_3_4_6))

prediction_l3 = clf_l3.predict(X_test_l3)

Start Predicting...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


Start Predicting...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


Start Predicting...


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 2min 39s, sys: 14.2 s, total: 2min 53s
Wall time: 1min 38s


In [20]:
# Scores of the first test row, and the label they map to.
# The highest-scoring column index + 1 is used as the label (columns follow labels 1..7).
first_row_scores = prediction_l3[0]
print(first_row_scores)
print(np.argmax(first_row_scores) + 1)

# Predicted label for every test row (displayed as the cell output).
np.argmax(prediction_l3, axis=1) + 1

[ 0.43545329  0.47831349 -0.01729872  0.00826042  0.10508398 -0.01084166
  0.00235377]
2


array([2, 1, 1, ..., 3, 3, 3])

In [21]:
# Highest-scoring column index + 1 is the predicted Cover_Type for each test row.
predicted_cover_type = prediction_l3.argmax(axis=1) + 1

# Two-column submission frame (Id, Cover_Type), written without the index.
submission = pd.DataFrame({'Id': test_ids, 'Cover_Type': predicted_cover_type})
submission.to_csv(loc_submission, index=False)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [22]:
print(submission.head())

print()

!head submission.csv

      Id  Cover_Type
0  15121           2
1  15122           1
2  15123           1
3  15124           1
4  15125           1

Id,Cover_Type
15121,2
15122,1
15123,1
15124,1
15125,1
15126,1
15127,1
15128,1
15129,1
