In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# Any results you write to the current directory are saved as output.

/kaggle/input/forest-cover-type-prediction/test.csv
/kaggle/input/forest-cover-type-prediction/train.csv
/kaggle/input/forest-cover-type-prediction/sampleSubmission.csv


In [2]:
import math

from sklearn import ensemble
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.preprocessing import OneHotEncoder

In [3]:
%%time
# Read the competition's train and test CSVs into DataFrames.
df_train  = pd.read_csv('../input/forest-cover-type-prediction/train.csv')
df_test = pd.read_csv('../input/forest-cover-type-prediction/test.csv')
# File name the final predictions are written to (relative to the working directory).
loc_submission = 'submission.csv'

CPU times: user 1.57 s, sys: 364 ms, total: 1.93 s
Wall time: 2 s


# Normalization

In [4]:
%%time 
cols_to_normalize = ['Aspect','Slope','Horizontal_Distance_To_Hydrology',
                     'Vertical_Distance_To_Hydrology','Hillshade_9am',
                     'Hillshade_Noon','Hillshade_3pm','Horizontal_Distance_To_Fire_Points']

df_train[cols_to_normalize] = normalize(df_train[cols_to_normalize])
df_test[cols_to_normalize] = normalize(df_test[cols_to_normalize])

CPU times: user 892 ms, sys: 1.13 s, total: 2.02 s
Wall time: 2.02 s


# Feature Engineering

In [5]:
# Columns created by the feature-engineering cells that follow.
engineered_cols = [
    'binned_elevation',
    'Horizontal_Distance_To_Roadways_Log',
    'Soil_Type12_32',
    'Soil_Type23_22_32_33',
]

# The engineered names are excluded from the scan of df_train.columns so that
# re-running this cell after those columns exist does not list them twice.
# On a fresh kernel the resulting list is the same as before: every raw column
# except the target and the row id, followed by the engineered columns.
excluded_cols = {'Cover_Type', 'Id', *engineered_cols}
feature_cols = [col for col in df_train.columns if col not in excluded_cols] + engineered_cols

In [6]:
# Bucket elevation into 50-unit-wide integer bins.
# Vectorised floor-divide applied to both frames instead of a per-element
# Python loop duplicated for train and test.
for frame in (df_train, df_test):
    frame['binned_elevation'] = np.floor(frame['Elevation'] / 50.0).astype(int)

In [7]:
# log(1 + distance) of the roadway distance, computed with numpy's vectorised
# log1p on both frames instead of a per-element math.log loop.
for frame in (df_train, df_test):
    frame['Horizontal_Distance_To_Roadways_Log'] = np.log1p(frame['Horizontal_Distance_To_Roadways'])

In [8]:
# Combined soil-type indicators: each new column is the sum of the listed
# Soil_Type columns, built the same way for the train and test frames.
for frame in (df_train, df_test):
    frame['Soil_Type12_32'] = frame['Soil_Type32'] + frame['Soil_Type12']
    frame['Soil_Type23_22_32_33'] = (
        frame['Soil_Type23']
        + frame['Soil_Type22']
        + frame['Soil_Type32']
        + frame['Soil_Type33']
    )

# Train / Test Sets

In [9]:
# Feature matrices share the same column list for train and test.
X_train, X_test = df_train[feature_cols], df_test[feature_cols]

# Target labels and the test-row ids needed for the submission file.
y_train = df_train['Cover_Type']
test_ids = df_test['Id']

# One-hot encode the target; the encoder wants a 2-D column, hence the reshape,
# and returns a sparse matrix, hence the toarray().
ohe = OneHotEncoder()
y_train_column = y_train.values.reshape(-1, 1)
y_train_onehot = ohe.fit_transform(y_train_column).toarray()

Note: `OneHotEncoder` can encode the class labels directly, so a preceding `LabelEncoder` step to convert the categories to integers is not needed.


In [10]:
# Quick look at the encoded target: its shape and the label values the encoder learned.
print(y_train_onehot.shape)
print(ohe.categories_)

# Encode the labels 1..7 once each and display the resulting one-hot rows.
test_data = np.arange(1, 8).reshape(-1, 1)
ohe.transform(test_data).toarray()

(15120, 7)
[array([1., 2., 3., 4., 5., 6., 7.])]


array([[1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1.]])

In [11]:
# X_train, X_valid, y_train, y_valid = train_test_split(X_train_org, y_org, 
#                                                       test_size=0.2, random_state=0) 

# Training - Level 1

In [12]:
%%time
clf_1 = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=0)
clf_1.fit(X_train, y_train)

clf_2 = ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=0)
clf_2.fit(X_train, y_train)

clf_3 = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=0)
clf_3.fit(X_train, y_train)

clf_4 = ensemble.AdaBoostClassifier(n_estimators=50, random_state=0)
clf_4.fit(X_train, y_train)

CPU times: user 37 s, sys: 372 ms, total: 37.4 s
Wall time: 28.6 s


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=0)

# Training - Level 2
Train meta-features

In [13]:
def predict_level_1(X_valid, classifiers_level_1, num_classes=None):
    """Stack the class-probability outputs of several fitted classifiers.

    Each classifier's ``predict_proba(X_valid)`` result is written into its own
    contiguous group of columns, in the order the classifiers are given.

    Parameters
    ----------
    X_valid : array-like with a ``shape`` attribute
        Samples to score; ``X_valid.shape[0]`` is taken as the number of rows.
    classifiers_level_1 : sequence
        Fitted estimators exposing ``predict_proba``.
    num_classes : int, optional
        Number of probability columns expected from every classifier. When
        omitted it is inferred from the first classifier's output instead of
        being fixed at 7.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_samples, num_classes * len(classifiers_level_1))``.
        With an empty classifier list the result has zero columns.

    Raises
    ------
    ValueError
        If a classifier's output is not ``(n_samples, num_classes)``.
    """
    n_samples = X_valid.shape[0]
    n_classifiers = len(classifiers_level_1)
    meta_features = None

    for index, clf in enumerate(classifiers_level_1):
        proba = np.asarray(clf.predict_proba(X_valid))

        if num_classes is None and proba.ndim == 2:
            num_classes = proba.shape[1]

        # Fail with a clear message rather than an opaque broadcasting error.
        if proba.shape != (n_samples, num_classes):
            raise ValueError(
                "classifier %d returned probabilities of shape %s, expected %s"
                % (index, proba.shape, (n_samples, num_classes))
            )

        # Allocate once the column count is known, so only one full-size
        # array is ever held in addition to the current classifier's output.
        if meta_features is None:
            meta_features = np.zeros((n_samples, num_classes * n_classifiers))

        meta_features[:, index * num_classes:(index + 1) * num_classes] = proba

    if meta_features is None:
        # No classifiers supplied: keep the original "zero columns" result.
        return np.zeros((n_samples, 0))

    return meta_features

In [14]:
%%time
classifiers_level_1 = [clf_1, clf_2, clf_3, clf_4]

X_train_level_2 = predict_level_1(X_train, classifiers_level_1)
    
    
# clf_l2 = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=0)
clf_l2 = LinearRegression()
clf_l2.fit(X_train_level_2, y_train_onehot)

CPU times: user 1.84 s, sys: 40 ms, total: 1.88 s
Wall time: 1.01 s


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Prediction

In [15]:
%%time
# Level-1 class probabilities for the test rows become the level-2 input features.
X_test_level_2 = predict_level_1(X_test, classifiers_level_1)
# clf_l2 was fit on one-hot targets, so this yields one score per label column;
# the per-row argmax is taken when the submission is built.
prediction_l2 = clf_l2.predict(X_test_level_2)

CPU times: user 1min 7s, sys: 6.3 s, total: 1min 14s
Wall time: 41.7 s


In [19]:
# print(prediction_l2[0])
# print(prediction_l2[0].argmax() + 1)

# prediction_l2.argmax(axis=1) + 1

In [17]:
# Column j of the level-2 output corresponds to ohe.categories_[0][j], so the
# winning column is mapped back to its label through the fitted encoder rather
# than assuming the labels are exactly 1..7 (the former `argmax + 1`).
predicted_labels = ohe.categories_[0][prediction_l2.argmax(axis=1)].astype(int)

submission = pd.DataFrame({'Id': test_ids,
                           'Cover_Type': predicted_labels})

submission.to_csv(loc_submission, index=False)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [18]:
print(submission.head())

print()

!head submission.csv

      Id  Cover_Type
0  15121           2
1  15122           1
2  15123           1
3  15124           1
4  15125           1

Id,Cover_Type
15121,2
15122,1
15123,1
15124,1
15125,1
15126,1
15127,1
15128,1
15129,1
