In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/forest-cover-type-prediction/train.csv
/kaggle/input/forest-cover-type-prediction/sampleSubmission.csv
/kaggle/input/forest-cover-type-prediction/test.csv


In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import seaborn as sns
from IPython.display import display
%matplotlib inline

import h2o
from h2o.automl import H2OAutoML

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,13 mins 06 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.2
H2O cluster version age:,1 month and 22 days
H2O cluster name:,H2O_from_python_unknownUser_eqh2xt
H2O cluster total nodes:,1
H2O cluster free memory:,3.386 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


# Read Dataset

In [22]:
from sklearn.model_selection import train_test_split

data_train = pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv', index_col='Id')
data_test = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv', index_col='Id')

print(data_train.shape)
print(data_test.shape)

(15120, 55)
(565892, 54)


# Add Nearest Neighbors Features

In [23]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import NearestNeighbors
from multiprocessing import Pool


class NearestNeighborsFeats(BaseEstimator, ClassifierMixin):
    """
    This class implement KNN features extraction
    """
    def __init__(self, n_jobs, k_list, metric, n_classes=None, n_neighbors=None, eps=1e-6):
        self.n_jobs = n_jobs
        self.k_list = k_list
        self.metric = metric
        
        if n_neighbors is None:
            self.n_neighbors = max(k_list)
        else:
            self.n_neighbors = n_neighbors
            
        self.eps = eps
        self.n_classes_ = n_classes
        
    
    def fit(self, X, y):
        """
        Set up the train set and self.NN object (Nearest Neighbors object)
        """
        
        # Create a NearestNeighbors (NN) object. We will use it in `predict`` function
        self.NN = NearestNeighbors(n_neighbors=max(self.k_list),
                                  metric=self.metric,
                                  n_jobs=1,
                                  algorithm='brute' if self.metric=='cosine' else 'auto')
        
        self.NN.fit(X)
        
        # Store train labels
        self.y_train = y
        
        # Save how many classes we have
        self.n_classes = np.unique(y).shape[0] if self.n_classes_ is None else self.n_classes_
        
        
    def predict(self, X):
        """
        Produces KNN features for every object of a dataset X  (validation/test dataset)
        """
        if self.n_jobs == 1:
            test_feats = []
            for i in range(X.shape[0]):
                test_feats.append(self.get_features_for_one(X[i:i+1]))
        else:
            """
            Multiprocessing: number of threads should be `self.n_jobs`
            """
            
            test_feats = Pool(self.n_jobs).map(self.get_features_for_one,
                                              (X[i:i+1] for i in range(X.shape[0])))
            
        return np.vstack(test_feats)
    
    
    def get_features_for_one(self, x):
        """
        Compute KNN features for a single object `x` (from the validation/test dataset)
        That is, find nearest neighbors in train dataset.
        """
        
        NN_output = self.NN.kneighbors(x)
        
        # indices of `x`'s nearest neighbors
        neighs = NN_output[1][0]
        
        # distance between `x` and its neighbors
        neighs_dist = NN_output[0][0]
        
        # labels in the train dataset
        neighs_y = self.y_train[neighs]
        
        # Append the computed features, and then use np.hstack() to concatenate thoese features.
        return_list = []
        
        """
        1. Average appearance of classes in K nearest neighbors.
        Note: self.k_list would look like: [3, 8, 32]
        """
        for k in self.k_list:
            feats = np.bincount(neighs_y[:k], minlength=self.n_classes) / k    
            assert len(feats) == self.n_classes
            return_list += [feats]
            
        
        """
        2. Same label streak: the larget number N,
        such that N nearest neighbors have the same label
        """
        feats = 1 + \
                np.where(np.append(neighs_y[:-1] != neighs_y[1:], True))[0].min(keepdims=True)
        
        assert len(feats) == 1
        return_list += [feats]
        
        
        """
        3. minimum distance to objects of each class
        Find the first instance of a class and take its distance as features.
        
        If there are no neighboring objects of some classes,
        Then set distance to that class to be 999.
        """
        feats = []
        for c in range(self.n_classes):
            feats.append(np.append(neighs_dist[neighs_y == c], 999).min())
            
        assert len(feats) == self.n_classes
        return_list += [feats]
        
        """
        4. minimum *normalized* distance to objects of each class.
        Similar to 3., but we normalize the distances
        by the distance to the closest neighbor.
        
        If there are no neighboring objects of some classes,
        Then set distance to that class to be 999.
        
        Add self.eps to the denominator to avoid dividing by zero error.
        """
        feats = []
        for c in range(self.n_classes):
            feat = neighs_dist[neighs_y == c] / (neighs_dist[0] + self.eps)
            feats.append(feat.min() if feat.size else 999)
            
        assert len(feats) == self.n_classes
        return_list += [feats]
        
        """
        5.
            5.1 Distance to Kth neighbor: quantiles of a distribution
            5.2 Distance to Kth neighbor normalized by
                distance to the first neighbor
        """
        for k in self.k_list:
            feat_51 = neighs_dist[k-1]
            feat_52 = neighs_dist[k-1] / (neighs_dist[0] + self.eps)
            
            return_list += [[feat_51, feat_52]]
            
        """
        6. Mean distance to neighbors of each class for each K from `k_list`
            for each class select the neighbors of that class among K nearesst neighbors
            
            If there are no objects of a ceertain class among K enighbors, set mean distance to 999.
        """
        for k in self.k_list:
            bincount = np.bincount(neighs_y[:k], minlength=self.n_classes)
            feats = np.where(
                bincount,
                np.bincount(neighs_y[:k], weights=neighs_dist[:k], minlength=self.n_classes) / (bincount+self.eps),
                    999
            )
            
            assert len(feats) == self.n_classes
            return_list += [feats]
            
        # merge
        knn_feats = np.hstack(return_list)
        
        """
        Number of features: 
            1. len(k_list) * n_classes
            2. 1
            3. n_classes
            4. n_classes
            5. len(k_list) * 2
            6. len(k_list) * n_classes
            
        Total number of features
        = len(k_list) * ( 2*n_classes + 2) + 1 + 2*n_classes
        = 3 * 18 + 1 + 16
        """
        
        return knn_feats
        
        
        

In [24]:
%%time
from multiprocessing import cpu_count
print('Number of CPU: {}'.format(cpu_count()))

k_list = [3, 8, 32]

NNF = NearestNeighborsFeats(n_jobs=cpu_count(), k_list=k_list, metric='minkowski', n_classes=7+1)

NNF.fit(X=data_train.drop(columns=['Cover_Type']).values,
       y=data_train['Cover_Type'].values)



Number of CPU: 4
CPU times: user 32 ms, sys: 4 ms, total: 36 ms
Wall time: 35.8 ms


In [25]:
%%time
data_train_knn = NNF.predict(data_train.drop(columns=['Cover_Type']).values)

col_names_knn = ['knn_{}'.format(i) for i in range(data_train_knn.shape[1])]
print(col_names_knn)

data_train = pd.concat([
                            data_train, 
                           pd.DataFrame(data_train_knn, index=data_train.index, 
                                        columns=col_names_knn)
                       ], axis=1)

['knn_0', 'knn_1', 'knn_2', 'knn_3', 'knn_4', 'knn_5', 'knn_6', 'knn_7', 'knn_8', 'knn_9', 'knn_10', 'knn_11', 'knn_12', 'knn_13', 'knn_14', 'knn_15', 'knn_16', 'knn_17', 'knn_18', 'knn_19', 'knn_20', 'knn_21', 'knn_22', 'knn_23', 'knn_24', 'knn_25', 'knn_26', 'knn_27', 'knn_28', 'knn_29', 'knn_30', 'knn_31', 'knn_32', 'knn_33', 'knn_34', 'knn_35', 'knn_36', 'knn_37', 'knn_38', 'knn_39', 'knn_40', 'knn_41', 'knn_42', 'knn_43', 'knn_44', 'knn_45', 'knn_46', 'knn_47', 'knn_48', 'knn_49', 'knn_50', 'knn_51', 'knn_52', 'knn_53', 'knn_54', 'knn_55', 'knn_56', 'knn_57', 'knn_58', 'knn_59', 'knn_60', 'knn_61', 'knn_62', 'knn_63', 'knn_64', 'knn_65', 'knn_66', 'knn_67', 'knn_68', 'knn_69', 'knn_70']
CPU times: user 392 ms, sys: 588 ms, total: 980 ms
Wall time: 5.07 s


In [36]:
pd.DataFrame(data_train_knn).to_csv('knn_features_train.csv')

In [26]:
from IPython.display import display
display(data_train.head())

print(data_train_knn.shape)
print(data_train.shape)

Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,knn_61,knn_62,knn_63,knn_64,knn_65,knn_66,knn_67,knn_68,knn_69,knn_70
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2596,51,3,258,0,510,221,232,148,6279,...,999.0,999.0,999.0,806.911208,675.650854,999.0,999.0,491.324013,999.0,999.0
2,2590,56,2,212,-6,390,220,235,151,6225,...,999.0,999.0,999.0,787.433019,596.444826,999.0,999.0,505.522026,999.0,999.0
3,2804,139,9,268,65,3180,234,238,135,6121,...,999.0,999.0,999.0,340.861429,312.684697,999.0,999.0,279.909865,999.0,999.0
4,2785,155,18,242,118,3090,238,238,122,6211,...,999.0,999.0,999.0,350.851608,296.830203,999.0,999.0,345.978349,999.0,999.0
5,2595,45,2,153,-1,391,220,234,150,6172,...,999.0,999.0,999.0,999.0,470.827077,999.0,999.0,499.421487,999.0,999.0


(15120, 71)
(15120, 126)


In [30]:
%%time
data_test_knn = NNF.predict(data_test.values)
data_test = pd.concat([
                        data_test, 
                        pd.DataFrame(data_test_knn, index=data_test.index,
                                   columns=col_names_knn)
                    ], axis=1)

CPU times: user 10.2 s, sys: 4.2 s, total: 14.4 s
Wall time: 2min 51s


In [37]:
pd.DataFrame(data_test_knn).to_csv('knn_features_test.csv')

# Train / Valid Splitting

In [31]:
df_train, df_valid = train_test_split(data_train, test_size=0.1, random_state=42)

htrain_frame = h2o.H2OFrame(df_train)
hvalid_frame = h2o.H2OFrame(df_valid)
htest_frame = h2o.H2OFrame(data_test)

# htrain_frame = h2o.H2OFrame(data_train.drop(columns=col_keywords))
# hvalid_frame = h2o.H2OFrame(data_valid.drop(columns=col_keywords))
# htest_frame = h2o.H2OFrame(data_test.drop(columns=col_keywords))

y = 'Cover_Type'

htrain_frame[y] = htrain_frame[y].asfactor()
hvalid_frame[y] = hvalid_frame[y].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


# Start Training

In [35]:
%%time
aml = h2o.estimators.random_forest.H2ORandomForestEstimator(
                            max_runtime_secs=60, 
#                            max_models=1,
                            balance_classes=True,
                           seed=42)

aml.train(y=y, training_frame=htrain_frame)

drf Model Build progress: |███████████████████████████████████████████████| 100%
CPU times: user 248 ms, sys: 40 ms, total: 288 ms
Wall time: 5.54 s


In [29]:
aml.model_performance()


ModelMetricsMultinomial: drf
** Reported on train data. **

MSE: 0.0007908731863906722
RMSE: 0.02812246764405059
LogLoss: 0.007625359306196723
Mean Per-Class Error: 0.0
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8
1.0,2.0,3.0,4.0,5.0,6.0,7.0,Error,Rate
1961.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"0 / 1,961"
0.0,1956.0,0.0,0.0,0.0,0.0,0.0,0.0,"0 / 1,956"
0.0,0.0,1949.0,0.0,0.0,0.0,0.0,0.0,"0 / 1,949"
0.0,0.0,0.0,1961.0,0.0,0.0,0.0,0.0,"0 / 1,961"
0.0,0.0,0.0,0.0,1957.0,0.0,0.0,0.0,"0 / 1,957"
0.0,0.0,0.0,0.0,0.0,1957.0,0.0,0.0,"0 / 1,957"
0.0,0.0,0.0,0.0,0.0,0.0,1952.0,0.0,"0 / 1,952"
1961.0,1956.0,1949.0,1961.0,1957.0,1957.0,1952.0,0.0,"0 / 13,693"


Top-7 Hit Ratios: 


0,1
k,hit_ratio
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0




In [None]:
# predictions = aml.predict(hvalid_frame.drop(y))

# accuracy = accuracy_score(data_valid[y],
#                          predictions['predict'].as_data_frame())

# print('Accuracy: {}'.format(accuracy))

# Prediction

In [32]:
%%time
prediction_test_hframe = aml.predict(htest_frame)

submission = pd.DataFrame.from_dict({'ID': data_test.index.tolist(),
                                    'Cover_Type': prediction_test_hframe['predict'].as_data_frame().iloc[:,0].tolist(),
                                    })

submission.to_csv('./submission.csv', index=False)

drf prediction progress: |████████████████████████████████████████████████| 100%


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


CPU times: user 3.3 s, sys: 204 ms, total: 3.51 s
Wall time: 25.4 s


In [33]:
print(submission.head())

print()
!head submission.csv

      ID  Cover_Type
0  15121           2
1  15122           2
2  15123           1
3  15124           1
4  15125           1

ID,Cover_Type
15121,2
15122,2
15123,1
15124,1
15125,1
15126,1
15127,1
15128,1
15129,1
