In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# Any results you write to the current directory are saved as output.

In [None]:
%%time
# Load the competition's training set, test set and sample submission file.
train_df = pd.read_csv('/kaggle/input/forest-cover-type-kernels-only/train.csv')
test_df = pd.read_csv('/kaggle/input/forest-cover-type-kernels-only/test.csv')
submission_df = pd.read_csv('/kaggle/input/forest-cover-type-kernels-only/sample_submission.csv')

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Stack the train and test features into one frame so that feature engineering
# is applied to both in a single pass. The target column is removed from each
# part before concatenating, which leaves `test_df` untouched: no placeholder
# NaN `Cover_Type` column has to be added to it just to be dropped again.
# `errors='ignore'` makes the drop a no-op for a frame without the column.
data_df = pd.concat([
    train_df.drop(columns=['Cover_Type'], errors='ignore'),
    test_df.drop(columns=['Cover_Type'], errors='ignore'),
])

In [None]:
# Report (rows, columns) of the train, test and combined frames, in that order.
for frame in (train_df, test_df, data_df):
    print(frame.shape)

# Read Target Dataset

In [None]:
# External target file, indexed by `Id`.
# NOTE(review): presumably holds the true cover type for every Id - confirm against the dataset.
target_all = pd.read_csv(
    '/kaggle/input/covertype-target/target.csv',
    index_col='Id',
)

# Select the target rows whose Id appears in the test set, in test-set order.
y_test = target_all.loc[test_df['Id']]

# Convert One-Hot encoding to label

In [None]:

def _onehot_columns(frame, keyword, label_col):
    """Return the columns of ``frame`` whose name contains ``keyword``.

    ``label_col`` itself is excluded so that re-running this cell does not
    mistake the already-derived label column for a one-hot column.
    """
    return [col for col in frame.columns if keyword in col and col != label_col]


def _collapse_onehot(frame, onehot_cols, label_col):
    """Replace ``onehot_cols`` by one 1-based label column named ``label_col``.

    The label is the position (starting at 1) of the largest value among
    ``onehot_cols`` in each row. When ``onehot_cols`` is empty (the cell was
    already executed) ``frame`` is returned unchanged.
    """
    if not onehot_cols:
        return frame
    labels = frame[onehot_cols].values.argmax(axis=1) + 1
    # Drop first, then append, so the new label ends up as the last column.
    return frame.drop(columns=onehot_cols).assign(**{label_col: labels})


cat_soil = _onehot_columns(data_df, 'Soil', 'Soil_Type')
data_df = _collapse_onehot(data_df, cat_soil, 'Soil_Type')

cat_wilderness = _onehot_columns(data_df, 'Wilderness', 'Wilderness_Area')
data_df = _collapse_onehot(data_df, cat_wilderness, 'Wilderness_Area')

In [None]:
# Shape of the combined frame after the one-hot columns were collapsed.
data_df.shape

In [None]:
# Columns that remain in the combined frame.
data_df.columns

In [None]:
# Columns of the raw training frame, for comparison with `data_df.columns`.
train_df.columns

In [None]:
import featuretools as ft
import featuretools.variable_types as vtypes
from multiprocessing import cpu_count

In [None]:
from IPython.display import display

# Fetch the primitive catalogue once; calling ft.list_primitives() in the
# range() bound and again on every iteration rebuilt the same table each time.
primitives_df = ft.list_primitives()

# Show the catalogue ten rows at a time so no rows are hidden by truncation.
for start in range(0, len(primitives_df), 10):
    display(primitives_df[start:start + 10])

# Feature Engineering

Build the entity set. The `normalize_entity` calls are kept commented out in the code below for reference.

In [None]:
# Map every column whose name mentions soil or wilderness to the Categorical
# variable type; this mapping is handed to featuretools when the entity is created.
categorical_keywords = ('Soil', 'Wilderness')
variable_type = {}
for col in data_df.columns:
    if any(keyword in col for keyword in categorical_keywords):
        variable_type[col] = vtypes.Categorical

# Left disabled by the author:
# variable_type['Cover_Type'] = vtypes.Categorical

print(variable_type)

In [None]:
# Create an entity set named 'Dataset' and register the combined frame as its
# only entity, indexed by `Id`, with the categorical overrides defined above.
es = ft.EntitySet('Dataset')

es.entity_from_dataframe(
    entity_id='forest_table',
    index='Id',
    dataframe=data_df,
    variable_types=variable_type,
)

# es.normalize_entity(base_entity_id='forest_table',
#                    new_entity_id='soil_type_table',
#                    index='Soil_Type',
# #                    additional_variables=['Cover_Type'],
#                    )

# es.normalize_entity(base_entity_id='forest_table',
#                    new_entity_id='wilderness_area_table',
#                    index='Wilderness_Area',
#                    additional_variables=['Elevation', 'Slope'])

In [None]:
# Inspect the variables registered for the entity.
es['forest_table'].variables

In [None]:
# Inspect the DataFrame stored for the entity.
es['forest_table'].df

In [None]:
from featuretools.primitives import make_trans_primitive
from featuretools.variable_types import Numeric

def squared_root(col1, col2):
    """Return the square root of ``col1**2 + col2**2`` (element-wise for array-likes)."""
    sum_of_squares = col1 ** 2 + col2 ** 2
    return sum_of_squares ** 0.5

# Wrap `squared_root` as a featuretools transform primitive taking two numeric inputs.
Squared_Root = make_trans_primitive(
    function=squared_root,
    input_types=[Numeric, Numeric],
    return_type=Numeric,
    commutative=True,  # True: combination, False: permutation
)


def abs_diff(col1, col2):
    """Return the absolute value of ``col1 - col2`` (element-wise for array-likes)."""
    difference = col1 - col2
    return abs(difference)

# Wrap `abs_diff` as a featuretools transform primitive taking two numeric inputs.
Abs_Diff = make_trans_primitive(
    function=abs_diff,
    input_types=[Numeric, Numeric],
    return_type=Numeric,
    commutative=True,  # True: combination, False: permutation
)


def mean_3_cols(col1, col2, col3):
    """Return the element-wise mean of three columns.

    The inputs must provide an ``add`` method (e.g. pandas Series). The sum is
    accumulated as ``col1 + (col2 + col3)`` and then divided by 3.
    """
    tail_sum = col2.add(col3)
    total = col1.add(tail_sum)
    return total / 3

# Wrap `mean_3_cols` as a featuretools transform primitive taking three numeric inputs.
Mean_3_Cols = make_trans_primitive(
    function=mean_3_cols,
    input_types=[Numeric, Numeric, Numeric],
    return_type=Numeric,
    commutative=True,  # True: combination, False: permutation
)

In [None]:
%%time
# Run deep feature synthesis on the single entity, one level deep, using one
# worker per CPU core. Only the custom three-column mean primitive is enabled;
# the other candidates are kept as comments.
feature_matrix, features = ft.dfs(
    entityset=es,
    target_entity='forest_table',
    trans_primitives=[
        # Squared_Root,
        # Abs_Diff,
        Mean_3_Cols,
        # 'add_numeric',
        # 'subtract_numeric',
        # 'multiply_numeric',
        # 'divide_numeric',  # divide by 0.
    ],
    max_depth=1,
    n_jobs=cpu_count(),
)

In [None]:
# Move the index back into an ordinary column (a later cell re-indexes by `Id`,
# so the index is expected to be `Id` here). The guard keeps the cell safe to
# re-run: a second unconditional reset_index() would insert a spurious `index`
# column into the feature matrix.
if 'Id' not in feature_matrix.columns:
    feature_matrix = feature_matrix.reset_index()

In [None]:
# Shape of the generated feature matrix.
feature_matrix.shape

In [None]:
# Shape of the input frame, for comparison with the feature matrix.
data_df.shape

In [None]:
# List the feature definitions returned by ft.dfs.
features

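### Check commutativity of the average function
Because the mean does not depend on argument order, each set of three columns should appear only once (as a combination), not once per ordering (as a permutation).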

In [None]:
# Print every generated feature whose name mentions all three columns below;
# with a commutative primitive only one such feature is expected.
required_terms = ('Aspect', 'Elevation', 'Hillshade_3pm')
for feature in features:
    feature_name = feature.get_name()
    if all(term in feature_name for term in required_terms):
        print(feature_name)

# Prepare Train/Test Dataset

In [None]:
# Index the feature matrix by `Id` so rows can be selected by Id below.
# The guard keeps the cell safe to re-run: once `Id` is the index it is no
# longer a column, and a second set_index('Id') would raise a KeyError.
if feature_matrix.index.name != 'Id':
    feature_matrix = feature_matrix.set_index('Id')

In [None]:
# Split the engineered features back into train and test rows by looking up
# each set's `Id` values in the Id-indexed feature matrix.
indice = train_df['Id']
# Disabled variants; presumably from when `Cover_Type` was carried through a
# normalized `soil_type_table` entity - TODO confirm before reviving:
# X_train = feature_matrix.loc[indice].drop(columns=['soil_type_table.Cover_Type'])
# y_train = feature_matrix.loc[indice, 'soil_type_table.Cover_Type'].astype('category')
# y_train = train_df['Cover_Type']
X_train = feature_matrix.loc[indice]
# Labels are taken directly from the raw training frame, in the same row order as `indice`.
y_train = train_df['Cover_Type'].astype('category')

# `indice` is reused for the test Ids from here on.
indice = test_df['Id']
# X_test = feature_matrix.loc[indice].drop(columns=['soil_type_table.Cover_Type'])
X_test = feature_matrix.loc[indice]

### Free the memory held by `feature_matrix`

In [None]:
import gc

# X_train / X_test have been extracted, so drop the names bound to the full
# feature matrix and the feature definitions, then run a garbage collection.
del feature_matrix, features

gc.collect()

In [None]:
from IPython.display import display

# Show the last rows of the engineered and the raw training data one after the other.
display(X_train.tail(), train_df.tail())

In [None]:
# Show the last rows of the engineered and the raw test data one after the other.
display(X_test.tail(), test_df.tail())

# Training

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.metrics import accuracy_score

from tqdm import tqdm_notebook

from collections import OrderedDict

def fit_predict(model, X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test):
    """Fit ``model`` on the training data and return its accuracy on the test data.

    Parameters
    ----------
    model
        Estimator providing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and labels. The defaults are the notebook globals of
        the same name, bound when this function is defined.
    X_test, y_test
        Test features and true labels, with defaults bound the same way.

    Returns
    -------
    The value returned by ``accuracy_score`` for the test predictions.
    """
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    # accuracy_score is documented as (y_true, y_pred); pass the true labels first.
    score = accuracy_score(y_test, predictions)

    return score


def fit_pipelines(pipelines):
    """Fit every pipeline and return its test accuracy keyed by pipeline name.

    Parameters
    ----------
    pipelines
        Mapping of display name -> estimator/pipeline. Each one is fitted on the
        notebook globals ``X_train`` / ``y_train`` and scored on ``X_test`` /
        ``y_test``.

    Returns
    -------
    dict
        ``{name: accuracy}`` in the iteration order of ``pipelines``.
    """
    scores = {}
    for name, pipeline in tqdm_notebook(pipelines.items()):
        print('Start fitting using {}'.format(name))
        # Reuse fit_predict instead of repeating its fit/predict/score steps.
        # The globals are passed explicitly so the data bound at call time is
        # used rather than fit_predict's definition-time defaults.
        score = fit_predict(pipeline, X_train, y_train, X_test, y_test)
        scores[name] = score

        print('{:20s}: score={}'.format(name, score))

    return scores

In [None]:
%%time


# Shared random seed for the estimators that accept one.
seed = 47

# One single-step Pipeline per model, keyed by the display name used when
# reporting scores.
# Disabled Random Forest options kept for reference: max_features=7, max_leaf_nodes=220.
# NOTE(review): num_class=8 is passed to LGBMClassifier unchanged - confirm it
# matches the number of cover types.
pipelines = {
    'Random Forest': Pipeline(steps=[
        ('RF', RandomForestClassifier(n_jobs=-1,
                                      n_estimators=300,
                                      random_state=seed)),
    ]),
    'Extra Trees': Pipeline(steps=[
        ('random forest extra', ExtraTreesClassifier(n_estimators=100,
                                                     random_state=seed)),
    ]),
    'KNN': Pipeline(steps=[
        ('KNN', KNeighborsClassifier(n_neighbors=2)),
    ]),
    'LGBM': Pipeline(steps=[
        ('LightGBM', lgb.LGBMClassifier(n_estimators=300,
                                        num_class=8,
                                        num_leaves=32,
                                        random_state=seed)),
    ]),
}

scores = fit_pipelines(pipelines)

In [None]:
# Collect the pipelines, in insertion order, as the base models for the ensembles.
classifiers = list(pipelines.values())

print(len(classifiers))

In [None]:
# pipelines = OrderedDict()

# pipelines['SVM Classifier'] = Pipeline(steps=[('SVM', 
#                                     SVC(C=1,
# #                                         gamma=0.1,
#                                         gamma='auto',
#                                         probability=True,
#                                         )
#                                     ),])

# %prun fit_pipelines(pipelines)

# Ensembling: Soft and Hard Voting

In [None]:
%%time
from mlxtend.classifier import EnsembleVoteClassifier

# Combine the base pipelines in a voting ensemble configured with voting='soft',
# then fit it and score it on the test set.
eclf = EnsembleVoteClassifier(
    clfs=classifiers,
    voting='soft',
)

score = fit_predict(eclf)

print('score={}'.format(score))


In [None]:
%%time
from mlxtend.classifier import EnsembleVoteClassifier

# Combine the base pipelines in a voting ensemble configured with voting='hard',
# then fit it and score it on the test set.
eclf = EnsembleVoteClassifier(
    clfs=classifiers,
    voting='hard',
)

score = fit_predict(eclf)

print('score={}'.format(score))
