In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Show every CSV/HDF file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        if name.endswith(('csv', 'hdf')):
            print(os.path.join(root, name))


print()

# Show the JSON files as well, ignoring any file whose name contains '__'.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        if '__' in name or not name.endswith('json'):
            continue
        print(os.path.join(root, name))
# Any results you write to the current directory are saved as output.

/kaggle/input/forest-cover-type-prediction/train.csv
/kaggle/input/forest-cover-type-prediction/sampleSubmission.csv
/kaggle/input/forest-cover-type-prediction/test.csv
/kaggle/input/forest-cover-type-standard-normalized/train_noramlized.csv
/kaggle/input/forest-cover-type-standard-normalized/test_noramlized.csv
/kaggle/input/forest-cover-type-standard-normalized/train_noramlized.hdf
/kaggle/input/forest-cover-type-standard-normalized/test_noramlized.hdf
/kaggle/input/covertype-target/target.csv

/kaggle/input/featuretools-feature-selection-comb-normalized/features.json
/kaggle/input/featuretools-feature-selection-combinations/features.json


In [2]:
from collections import OrderedDict
from IPython.display import display

import featuretools.variable_types as vtypes

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold



## Read Dataset / Test Labels

In [3]:
%%time
# Raw competition tables (train carries the 'Cover_Type' target, test does not).
train_df_org, test_df_org = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/forest-cover-type-prediction/train.csv',
                     '/kaggle/input/forest-cover-type-prediction/test.csv')
)

# Pre-normalized copies of the same two tables, stored as HDF.
train_df_normalized, test_df_normalized = (
    pd.read_hdf(hdf_path)
    for hdf_path in ('/kaggle/input/forest-cover-type-standard-normalized/train_noramlized.hdf',
                     '/kaggle/input/forest-cover-type-standard-normalized/test_noramlized.hdf')
)

# Target table indexed by 'Id'; pick the rows that belong to the test set,
# in the same order as the test table.
target_all = pd.read_csv('/kaggle/input/covertype-target/target.csv', index_col='Id')
test_ids = test_df_org['Id']
y_test = target_all.loc[test_ids]

CPU times: user 2.52 s, sys: 3.07 s, total: 5.59 s
Wall time: 5.28 s


In [4]:
# Report the shape of each loaded table (same order and layout as before).
print('shape:')
for template, frame in (('train_df_org:        {}', train_df_org),
                        ('test_df_normalized:  {}', test_df_normalized)):
    print(template.format(frame.shape))
print()
for template, frame in (('test_df_org:         {}', test_df_org),
                        ('train_df_normalized: {}', train_df_normalized)):
    print(template.format(frame.shape))



shape:
train_df_org:        (15120, 56)
test_df_normalized:  (565892, 55)

test_df_org:         (565892, 55)
train_df_normalized: (15120, 56)


In [5]:
# Peek at the first two rows of the raw and the normalized training tables.
display(train_df_org.head(2), train_df_normalized.head(2))

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5


Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,-0.367095,-0.95998,-1.597132,0.146639,-0.834074,-0.908681,0.271454,0.571653,0.281259,...,0,0,0,0,0,0,0,0,0,5
1,2,-0.381461,-0.914559,-1.715424,-0.072337,-0.932054,-0.999246,0.238732,0.703225,0.346627,...,0,0,0,0,0,0,0,0,0,5


## Combine Train/Test Dataset

In [6]:
# Stack the train and test rows into one feature table per dataset variant.
#
# The target column is removed *before* concatenating instead of first adding
# a NaN 'Cover_Type' column to the test frames and dropping it afterwards.
# The resulting tables are the same, but the loaded test frames are no longer
# modified as a side effect. errors='ignore' keeps the cell working when a
# frame has no 'Cover_Type' column (test frames) or already has one left over
# from an earlier run.
data_df_org = pd.concat([
    train_df_org.drop(columns=['Cover_Type'], errors='ignore'),
    test_df_org.drop(columns=['Cover_Type'], errors='ignore'),
])

data_df_normalized = pd.concat([
    train_df_normalized.drop(columns=['Cover_Type'], errors='ignore'),
    test_df_normalized.drop(columns=['Cover_Type'], errors='ignore'),
])

In [7]:
print('shape')
print('data_df_org:        {}'.format(data_df_org.shape))
print('data_df_normalized: {}'.format(data_df_normalized.shape))

shape
data_df_org:        (581012, 55)
data_df_normalized: (581012, 55)


## For Original Dataset
Convert One-Hot Encoding to Label Encoding

In [8]:
def _collapse_one_hot(frame, prefix):
    """Collapse the one-hot columns ``<prefix><number>`` into one label column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the one-hot columns. It is not modified.
    prefix : str
        Common column prefix; also the name of the label column that is added.

    Returns
    -------
    (pandas.DataFrame, list)
        The table with the one-hot columns replaced by a single ``prefix``
        column (appended last), and the list of one-hot column names that
        were collapsed. When no such columns exist the input table is
        returned unchanged together with an empty list.
    """
    # Only names of the form '<prefix><digits>' count as one-hot columns, so
    # the derived label column (named exactly `prefix`) is never picked up
    # when this cell is executed a second time.
    one_hot_cols = [col for col in frame.columns
                    if col.startswith(prefix) and col[len(prefix):].isdigit()]
    if not one_hot_cols:
        return frame, one_hot_cols

    # The label of a row is the numeric suffix of its arg-max column.
    # NOTE(review): a row with no active column gets the suffix of the first
    # one-hot column (argmax of all zeros is 0) — confirm no such rows exist.
    suffixes = np.array([int(col[len(prefix):]) for col in one_hot_cols])
    labels = suffixes[frame[one_hot_cols].values.argmax(axis=1)]

    collapsed = frame.drop(columns=one_hot_cols)
    collapsed[prefix] = labels
    return collapsed, one_hot_cols


data_df_org, cat_soil = _collapse_one_hot(data_df_org, 'Soil_Type')

data_df_org, cat_wilderness = _collapse_one_hot(data_df_org, 'Wilderness_Area')

In [9]:
# Shape and first two rows of the label-encoded original table.
shape_line = 'data_df_org: {}'.format(data_df_org.shape)
print(shape_line)
display(data_df_org.iloc[:2])

data_df_org: (581012, 13)


Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Soil_Type,Wilderness_Area
0,1,2596,51,3,258,0,510,221,232,148,6279,29,1
1,2,2590,56,2,212,-6,390,220,235,151,6225,29,1


# Declare Transform Primitives

In [10]:
from featuretools.primitives import make_trans_primitive
from featuretools.variable_types import Numeric

def squared_root(col1, col2):
    """Return the element-wise square root of ``col1**2 + col2**2``."""
    sum_of_squares = col1 ** 2 + col2 ** 2
    return sum_of_squares ** 0.5

# Featuretools transform primitive wrapping `squared_root`.
Squared_Root = make_trans_primitive(
    function=squared_root,
    input_types=[Numeric, Numeric],
    return_type=Numeric,
    commutative=True,  # True: combination, False: permutation
)


def abs_diff(col1, col2):
    """Return the element-wise absolute difference ``|col1 - col2|``."""
    difference = col1 - col2
    return abs(difference)

# Featuretools transform primitive wrapping `abs_diff`.
Abs_Diff = make_trans_primitive(
    function=abs_diff,
    input_types=[Numeric, Numeric],
    return_type=Numeric,
    commutative=True,  # True: combination, False: permutation
)


def mean_2_cols(col1, col2):
    """Return the element-wise mean of two columns (uses ``col1.add``)."""
    total = col1.add(col2)
    return total / 2


# Featuretools transform primitive wrapping `mean_2_cols`.
Mean_2_Cols = make_trans_primitive(
    function=mean_2_cols,
    input_types=[Numeric, Numeric],
    return_type=Numeric,
    commutative=True,  # True: combination, False: permutation
)


def mean_3_cols(col1, col2, col3):
    """Return the element-wise mean of three columns (uses ``.add``)."""
    # Same grouping as before: col1 + (col2 + col3).
    tail_sum = col2.add(col3)
    total = col1.add(tail_sum)
    return total / 3

# Featuretools transform primitive wrapping `mean_3_cols`.
Mean_3_Cols = make_trans_primitive(
    function=mean_3_cols,
    input_types=[Numeric, Numeric, Numeric],
    return_type=Numeric,
    commutative=True,  # True: combination, False: permutation
)


def div_2_cols(col1, col2):
    """Return the element-wise ratio of two columns after shifting both by ``eps``.

    ``eps`` is ``10e-2`` (i.e. 0.1) and is added to the numerator and to the
    denominator before dividing, so a zero in ``col2`` does not divide by zero.
    """
    eps = 10e-2
    numerator = col1 + eps
    denominator = col2 + eps
    return numerator / denominator


# Featuretools transform primitive wrapping `div_2_cols`.
# NOTE(review): a ratio depends on argument order, yet the primitive is
# declared commutative like the symmetric ones above — confirm this is intended.
Div_2_Cols = make_trans_primitive(
    function=div_2_cols,
    input_types=[Numeric, Numeric],
    return_type=Numeric,
    commutative=True,  # True: combination, False: permutation
)

## Read Features

In [11]:
import featuretools as ft

# Saved feature definitions, one JSON file per dataset variant.
filenames = OrderedDict([
    ('original', '/kaggle/input/featuretools-feature-selection-combinations/features.json'),
    ('normalized', '/kaggle/input/featuretools-feature-selection-comb-normalized/features.json'),
])

# Load each file with featuretools, keeping the same keys and order.
features_dict = OrderedDict(
    (variant, ft.load_features(json_path))
    for variant, json_path in filenames.items()
)

## Generate Features of Org/Normalized Datasets

In [12]:
%%time

def generate_features(dataframe, transform_features):
    """Build the feature matrix for one dataset variant.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with an 'Id' column, which becomes the entity index. Columns
        whose name contains 'Soil' or 'Wilderness' are typed as categorical.
    transform_features : list
        Extra feature definitions appended to the ones `ft.dfs` produces for
        the table (here: the lists returned by `ft.load_features`).

    Returns
    -------
    The feature matrix returned by `ft.calculate_feature_matrix` for the
    combined feature list.
    """
    categorical_types = {
        col: vtypes.Categorical
        for col in dataframe.columns
        if 'Soil' in col or 'Wilderness' in col
    }

    es = ft.EntitySet('Dataset')
    es.entity_from_dataframe(entity_id='forest_table',
                             index='Id',
                             dataframe=dataframe,
                             variable_types=categorical_types)

    # Only the feature *definitions* are needed from dfs; the matrix is
    # computed once below for the combined list. features_only=True stops dfs
    # from also calculating a matrix that would be thrown away immediately.
    base_features = ft.dfs(entityset=es,
                           target_entity='forest_table',
                           features_only=True)

    features_combined = base_features + transform_features
    feature_matrix = ft.calculate_feature_matrix(features_combined,
                                                 entityset=es)

    return feature_matrix
    


# Original-scale table combined with the features selected for it.
feature_matrix_org = generate_features(
    dataframe=data_df_org,
    transform_features=features_dict['original'],
)

# Normalized table combined with the features selected for it.
feature_matrix_normalized = generate_features(
    dataframe=data_df_normalized,
    transform_features=features_dict['normalized'],
)

CPU times: user 26.8 s, sys: 9.24 s, total: 36.1 s
Wall time: 31.8 s


In [13]:
# Feature counts and table shapes for the original variant...
for template, value in (
        ('transform features of org:        {}', len(features_dict['original'])),
        ('data_df_org:                      {}', data_df_org.shape),
        ('feature_matrix_org:               {}', feature_matrix_org.shape)):
    print(template.format(value))
print()
# ...and for the normalized variant.
for template, value in (
        ('transform features of normalized: {}', len(features_dict['normalized'])),
        ('data_df_normalized:               {}', data_df_normalized.shape),
        ('feature_matrix_normalized:        {}', feature_matrix_normalized.shape)):
    print(template.format(value))

# First two rows of each feature matrix.
display(feature_matrix_org.head(2), feature_matrix_normalized.head(2))

transform features of org:        16
data_df_org:                      (581012, 13)
feature_matrix_org:               (581012, 28)

transform features of normalized: 28
data_df_normalized:               (581012, 55)
feature_matrix_normalized:        (581012, 82)


Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Elevation - Horizontal_Distance_To_Roadways,Horizontal_Distance_To_Fire_Points - Horizontal_Distance_To_Hydrology,"ABS_DIFF(Horizontal_Distance_To_Hydrology, Horizontal_Distance_To_Roadways)","ABS_DIFF(Horizontal_Distance_To_Fire_Points, Horizontal_Distance_To_Roadways)","DIV_2_COLS(Horizontal_Distance_To_Fire_Points, Horizontal_Distance_To_Hydrology)","DIV_2_COLS(Elevation, Horizontal_Distance_To_Hydrology)","MEAN_3_COLS(Elevation, Horizontal_Distance_To_Fire_Points, Horizontal_Distance_To_Roadways)","SQUARED_ROOT(Horizontal_Distance_To_Fire_Points, Horizontal_Distance_To_Roadways)","SQUARED_ROOT(Elevation, Horizontal_Distance_To_Fire_Points)","SQUARED_ROOT(Elevation, Horizontal_Distance_To_Roadways)"
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2596,51,3,258,0,510,221,232,148,6279,...,2086,6021,252,5769,24.328167,10.058504,3128.333333,6299.677849,6794.487251,2645.622044
2,2590,56,2,212,-6,390,220,235,151,6225,...,2200,6013,178,5835,29.349835,12.211693,3068.333333,6237.204903,6742.308581,2619.198351


Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Elevation - Slope,Elevation - Hillshade_3pm,Hillshade_9am - Horizontal_Distance_To_Roadways,Hillshade_9am - Horizontal_Distance_To_Hydrology,Aspect - Horizontal_Distance_To_Fire_Points,Horizontal_Distance_To_Hydrology * Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Fire_Points * Horizontal_Distance_To_Roadways,Elevation * Hillshade_9am,Elevation * Horizontal_Distance_To_Hydrology,Elevation * Vertical_Distance_To_Hydrology
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.367095,-0.95998,-1.597132,0.146639,-0.834074,-0.908681,0.271454,0.571653,0.281259,4.334805,...,1.230037,-0.648354,1.180135,0.124814,-5.294785,-0.122308,-3.938956,-0.099649,-0.053831,0.306185
2,-0.381461,-0.914559,-1.715424,-0.072337,-0.932054,-0.999246,0.238732,0.703225,0.346627,4.28571,...,1.333963,-0.728088,1.237977,0.311068,-5.200269,0.067422,-4.282478,-0.091067,0.027594,0.355542


## Rename columns in the normalized dataset

In [14]:
# Wrap every column name of the normalized matrix as 'NORM(<name>)' so it can
# be told apart from the original-scale columns after the two are combined.
# Names that already start with 'NORM(' are skipped, so executing this cell
# again does not produce 'NORM(NORM(...))'.
col_new = {col: 'NORM({})'.format(col)
           for col in feature_matrix_normalized.columns
           if not col.startswith('NORM(')}

feature_matrix_normalized = feature_matrix_normalized.rename(columns=col_new)

## Combine org/normalized feature_matrix

In [15]:
# Place the original and the normalized feature columns side by side.
matrices_to_join = [feature_matrix_org, feature_matrix_normalized]
feature_matrix_all = pd.concat(matrices_to_join, axis=1)

In [16]:
# Shapes of the two inputs and of the combined matrix.
for template, matrix in (
        ('feature_matrix_org:        {}', feature_matrix_org),
        ('feature_matrix_normalized: {}', feature_matrix_normalized),
        ('feature_matrix_all:        {}', feature_matrix_all)):
    print(template.format(matrix.shape))

feature_matrix_org:        (581012, 28)
feature_matrix_normalized: (581012, 82)
feature_matrix_all:        (581012, 110)


## Split Train/Test

In [17]:
# Training part: target taken from the raw training table, features looked up
# in the combined matrix by the training 'Id' values.
y_train = train_df_org['Cover_Type'].astype('category')
indice = train_df_org['Id']
X_train = feature_matrix_all.loc[indice, :]

# Test part: features looked up by the test 'Id' values.
indice = test_df_org['Id']
X_test = feature_matrix_all.loc[indice, :]

# Stacking using EnsembleVoteClassifier and ColumnSelector

In [18]:
# Positional indices of the original-scale and of the 'NORM(' columns in the
# combined matrix; these are what ColumnSelector receives later on.
is_norm_column = ['NORM(' in col for col in feature_matrix_all.columns]
col_num_org = tuple(pos for pos, flag in enumerate(is_norm_column) if not flag)
col_num_normalized = tuple(pos for pos, flag in enumerate(is_norm_column) if flag)

print('size of org:                {}'.format(len(col_num_org)))
print('size of normalized columns: {}'.format(len(col_num_normalized)))

size of org:                28
size of normalized columns: 82


In [19]:
# One weight per class, keyed by the 0-based position in `type_ratio`.
# NOTE(review): keys run 0..6 while the displayed 'Cover_Type' values include 5 —
# presumably the labels are re-encoded before the classifiers see them; confirm.
type_ratio = np.array([
    0.37062, 0.49657, 0.05947, 0.00106, 0.01287, 0.02698, 0.03238,
])
class_weight = dict(enumerate(type_ratio))

In [20]:
# Echo the mapping so the per-class weights appear in the cell output.
class_weight

{0: 0.37062,
 1: 0.49657,
 2: 0.05947,
 3: 0.00106,
 4: 0.01287,
 5: 0.02698,
 6: 0.03238}

In [21]:
from sklearn.pipeline import make_pipeline
from mlxtend.feature_selection import ColumnSelector
from mlxtend.classifier import StackingClassifier, EnsembleVoteClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import lightgbm as lgb

seed = 42

# Base models trained on the original-scale columns only. Each pipeline first
# selects those columns by position and then fits one classifier.
pipelines_org = [
    # Random forest; no class_weight is passed to this one.
    make_pipeline(
        ColumnSelector(col_num_org),
        RandomForestClassifier(n_jobs=-1,
                               n_estimators=300,
                               random_state=seed),
    ),
    # Extra-trees with the class weights.
    make_pipeline(
        ColumnSelector(col_num_org),
        ExtraTreesClassifier(n_estimators=100, class_weight=class_weight, random_state=seed),
    ),
    # LightGBM.
    # NOTE(review): num_class=8 while class_weight has 7 entries — confirm.
    make_pipeline(
        ColumnSelector(col_num_org),
        lgb.LGBMClassifier(n_estimators=300,
                           num_class=8,
                           num_leaves=32,
                           class_weight=class_weight,
                           random_state=seed),
    ),
    # Multinomial logistic regression with a very large C.
    make_pipeline(
        ColumnSelector(col_num_org),
        LogisticRegression(n_jobs=-1, multi_class="multinomial", C=10**9, solver="saga", class_weight=class_weight),
    ),
]

In [22]:
# Base models trained on the 'NORM(' columns only. Each pipeline first selects
# those columns by position and then fits one classifier.
pipeline_normalized = [
    # Random forest with the class weights.
    make_pipeline(
        ColumnSelector(col_num_normalized),
        RandomForestClassifier(n_jobs=-1,
                               n_estimators=300,
                               class_weight=class_weight,
                               random_state=seed),
    ),
    # Extra-trees with the class weights.
    make_pipeline(
        ColumnSelector(col_num_normalized),
        ExtraTreesClassifier(n_estimators=100, class_weight=class_weight, random_state=seed),
    ),
    # LightGBM.
    # NOTE(review): num_class=8 while class_weight has 7 entries — confirm.
    make_pipeline(
        ColumnSelector(col_num_normalized),
        lgb.LGBMClassifier(n_estimators=300,
                           num_class=8,
                           num_leaves=32,
                           class_weight=class_weight,
                           random_state=seed),
    ),
    # Multinomial logistic regression with a very large C.
    make_pipeline(
        ColumnSelector(col_num_normalized),
        LogisticRegression(n_jobs=-1, multi_class="multinomial", C=10**9, solver="saga", class_weight=class_weight),
    ),
    # SVC with probability estimates enabled.
    make_pipeline(
        ColumnSelector(col_num_normalized),
        SVC(C=10, gamma=0.1, probability=True, class_weight=class_weight),
    ),
]



In [23]:
print('num of models in org:        {}'.format(len(pipelines_org)))
print('num of models in normalized: {}'.format(len(pipeline_normalized)))

num of models in org:        4
num of models in normalized: 5


## Start training

In [24]:
# %%time
# sclf = StackingClassifier(classifiers=pipelines_org+pipeline_normalized,
#                          meta_classifier=LogisticRegression(),
#                          verbose=1)

# sclf.fit(X_train, y_train)



## Print Scores

In [25]:
# %%time

# prediction = sclf.predict(X_test)

# score = accuracy_score(prediction, y_test)

# print('score: {}'.format(score))

In [26]:
def fit_predict(model, X_train=X_train, y_train=y_train,
                        X_test=X_test, y_test=y_test):
    """Fit `model` on the training data and return its accuracy on the test data.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train, X_test, y_test
        Data to fit and score on. The defaults are the notebook-level objects
        of the same names as they exist when this cell is executed; re-run the
        cell after changing them, or pass them explicitly.

    Returns
    -------
    The value returned by ``accuracy_score`` for the test predictions.
    """
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    # accuracy_score expects (y_true, y_pred); the arguments were previously
    # passed the other way round.
    score = accuracy_score(y_test, predictions)

    return score

In [27]:
%%time

# Soft-voting ensemble over all base pipelines (original + normalized).
ensemble_members = pipelines_org + pipeline_normalized
eclf = EnsembleVoteClassifier(clfs=ensemble_members, voting='soft')

# Fit on the training split and report accuracy on the test split.
score = fit_predict(eclf)
print('score={}'.format(score))



score=0.8153057473864271
CPU times: user 34min 16s, sys: 18.6 s, total: 34min 34s
Wall time: 27min 14s


In [28]:
# %%time

# eclf = EnsembleVoteClassifier(clfs=pipelines_org+pipeline_normalized,
#                              voting='soft')

# scores = cross_val_score(eclf, X_train, y_train,
#                        cv=5,
#                        scoring='accuracy')

# print(scores)

In [29]:
# %%time

# for train_index, test_index in KFold(n_splits=5, random_state=seed).split(X_train):
#     eclf.fit(X_train.iloc[train_index], y_train.iloc[train_index])

In [30]:
# %%time

# Predict the test split again with the fitted ensemble and score it.
predictions = eclf.predict(X_test)
# accuracy_score expects (y_true, y_pred); the arguments were previously
# passed the other way round.
score = accuracy_score(y_test, predictions)

print(score)

0.8153057473864271


In [31]:
# %%time

# eclf = EnsembleVoteClassifier(clfs=pipelines_org+pipeline_normalized,
#                              voting='hard')


# score = fit_predict(eclf)

# print('score={}'.format(score))