In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/forest-cover-type-prediction/test.csv
/kaggle/input/forest-cover-type-prediction/train.csv
/kaggle/input/forest-cover-type-prediction/sampleSubmission.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import seaborn as sns
from IPython.display import display
%matplotlib inline

import h2o
from h2o.automl import H2OAutoML

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_222"; OpenJDK Runtime Environment (build 1.8.0_222-8u222-b10-1~deb9u1-b10); OpenJDK 64-Bit Server VM (build 25.222-b10, mixed mode)
  Starting server from /opt/conda/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp_q5uvbc4
  JVM stdout: /tmp/tmp_q5uvbc4/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp_q5uvbc4/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.2
H2O cluster version age:,1 month and 21 days
H2O cluster name:,H2O_from_python_unknownUser_rau9w4
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


# Read Dataset

In [3]:
from sklearn.model_selection import train_test_split

data_train = pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv', index_col='Id')
data_test = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv', index_col='Id')

# Add Nearest Neighbors Features

In [4]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import NearestNeighbors
from multiprocessing import Pool


class NearsetNeighborFeats(BaseEstimator, ClassifierMixin):
    """
    This class implement KNN features extraction
    """
    def __init__(self, n_jobs, k_list, metric, n_classes=None, n_neighbors=None, eps=1e-6):
        self.n_jobs = n_jobs
        self.k_list = k_list
        self.metric = metric
        
        if n_neighbors is None:
            self.n_neighbors = max(k_list)
        else:
            self.n_neighbors = n_neighbors
            
        self.eps = eps
        self.n_classes_ = n_classes
        
    
    def fit(self, X, y):
        """
        Set up the train set and self.NN object (Nearest Neighbors object)
        """
        
        # Create a NearestNeighbors (NN) object. We will use it in `predict`` function
        self.NN = NearestNeighbors(n_neighbors=max(self.k_list),
                                  metic=self.metric,
                                  n_jobs=1,
                                  algorithm='brute' if self.metric=='cosine' else 'auto')
        
        self.NN.fit(X)
        
        # Store train labels
        self.y_train = y
        
        # Save how many classes we have
        self.n_classes = np.unique(y).shape[0] if self.n_classes_ is None else self.n_classes_
        
        
    def predict(self, X):
        """
        Produces KNN features for every object of a dataset X  (validation/test dataset)
        """
        if self.n_jobs == 1:
            test_feats = []
            for i in range(X.shape[0]):
                test_feats.append(self.get_features_for_one(X[i:i+1]))
        else:
            """
            Multiprocessing: number of threads should be `self.n_jobs`
            """
            
            test_feats = Pool(self.n_jobs).map(self.get_features_for_one,
                                              (X[i:i+1] for i in range(X.shape[0])))
            
        return np.vstack(test_feats)
    
    
    def get_features_for_one(self, X):
        """
        Compute KNN features for a single object `x` (from the validation/test dataset)
        That is, find nearest neighbors in train dataset.
        """
        
        NN_output = self.NN.kneigbhors(x)
        
        # indices of `x`'s nearest neighbors
        neighs = NN_output[1][0]
        
        # distance between `x` and its neighbors
        neighs_dist = NN_output[0][0]
        
        # labels in the train dataset
        neighs_y = self.y_train[neighs]
        
        # Append the computed features, and then use np.hstack() to concatenate thoese features.
        return_list = []
        
        """
        1. Average appearance of classes in K nearest neighbors.
        Note: self.k_list would look like: [3, 8, 32]
        """
        for k in self.k_list:
            feats = np.bincount(neighs_y[:k], minlength=self.n_classes) / k    
            assert len(feats) == self.n_classes
            return_list += [feats]
            
        
        """
        2. same label streak: the larget number N,
        such that N nearest neighbors have the same label
        """
        feats = 1 + \
                np.where(np.append(neighs_y[:-1] != neighs_y[1:], true))[0].min(keepdims=True)
        
        assert len(feats) == 1
        return_list += [feats]
        
        
        """
        3. minimum distance to objects of each class
        Find the first instance of a class and take its distance as features.
        
        If there are no neighboring objects of some classes,
        Then set distance to that class to be 999.
        """
        feats = []
        for c in range(self.n_classes):
            feats.append(np.append(neighs_dist[neighs_y == c], 999).min())
            
        assert len(feats) == self.n_classes
        return_list += [feats]
        
        """
        4. minimum *normalized* distance to objects of each class.
        Similar to 3., but we normalize the distances
        by the distance to the closest neighbor.
        
        If there are no neighboring objects of some classes,
        Then set distance to that class to be 999.
        
        Add self.eps to the denominator to avoid dividing by zero error.
        """
        
        
        

# Train / Valid Splitting

In [5]:
df_train, df_valid = train_test_split(data_train, test_size=0.1, random_state=42)

htrain_frame = h2o.H2OFrame(df_train)
hvalid_frame = h2o.H2OFrame(df_valid)
htest_frame = h2o.H2OFrame(data_test)
# htrain_frame = h2o.H2OFrame(data_train.drop(columns=col_keywords))
# hvalid_frame = h2o.H2OFrame(data_valid.drop(columns=col_keywords))
# htest_frame = h2o.H2OFrame(data_test.drop(columns=col_keywords))

y = 'Cover_Type'

htrain_frame[y] = htrain_frame[y].asfactor()
hvalid_frame[y] = hvalid_frame[y].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


# Start Training

In [6]:
%%time
aml = h2o.estimators.random_forest.H2ORandomForestEstimator(
                            max_runtime_secs=60, 
#                            max_models=1,
                            balance_classes=True,
                           seed=42)

aml.train(y=y, training_frame=htrain_frame)

drf Model Build progress: |███████████████████████████████████████████████| 100%
CPU times: user 328 ms, sys: 40 ms, total: 368 ms
Wall time: 12.2 s


In [7]:
aml.model_performance()


ModelMetricsMultinomial: drf
** Reported on train data. **

MSE: 0.13415853825544902
RMSE: 0.3662765870970311
LogLoss: 0.44879027734619426
Mean Per-Class Error: 0.14066734205101566
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8
1.0,2.0,3.0,4.0,5.0,6.0,7.0,Error,Rate
1503.0,290.0,2.0,0.0,38.0,7.0,121.0,0.2335543,"458 / 1,961"
345.0,1316.0,46.0,0.0,173.0,59.0,17.0,0.3271984,"640 / 1,956"
0.0,4.0,1568.0,99.0,26.0,252.0,0.0,0.1954849,"381 / 1,949"
0.0,0.0,31.0,1909.0,0.0,21.0,0.0,0.0265171,"52 / 1,961"
4.0,41.0,25.0,0.0,1857.0,30.0,0.0,0.0510986,"100 / 1,957"
1.0,14.0,156.0,51.0,14.0,1721.0,0.0,0.1205927,"236 / 1,957"
55.0,2.0,0.0,0.0,2.0,0.0,1893.0,0.0302254,"59 / 1,952"
1908.0,1667.0,1828.0,2059.0,2110.0,2090.0,2031.0,0.1406558,"1,926 / 13,693"


Top-7 Hit Ratios: 


0,1
k,hit_ratio
1,0.8593442
2,0.9728328
3,0.9932082
4,0.9986854
5,0.9993427
6,0.9993427
7,0.9999999




In [8]:
# predictions = aml.predict(hvalid_frame.drop(y))

# accuracy = accuracy_score(data_valid[y],
#                          predictions['predict'].as_data_frame())

# print('Accuracy: {}'.format(accuracy))

# Prediction

In [9]:
%%time
prediction_test_hframe = aml.predict(htest_frame)

submission = pd.DataFrame.from_dict({'ID': data_test.index.tolist(),
                                    'Cover_Type': prediction_test_hframe['predict'].as_data_frame().iloc[:,0].tolist(),
                                    })

submission.to_csv('./submission.csv', index=False)

drf prediction progress: |████████████████████████████████████████████████| 100%


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


CPU times: user 3.34 s, sys: 440 ms, total: 3.78 s
Wall time: 42.1 s


In [10]:
print(submission.head())

print()
!head submission.csv

      ID  Cover_Type
0  15121           2
1  15122           1
2  15123           1
3  15124           1
4  15125           1

ID,Cover_Type
15121,2
15122,1
15123,1
15124,1
15125,1
15126,1
15127,1
15128,1
15129,1
