In [2]:
import csv
import json
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
from xgboost import XGBClassifier

from typing import List, Dict

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import ExtraTreeClassifier

from sklearn.model_selection import GridSearchCV

import lightgbm as lgb


In [3]:
def soil_type_2_elu(soil_type: int) -> int:
    assert 0 < soil_type < 41, \
        "Soil type out of boundary 1~40."
    code_dict = [
        2702, 2703, 2704, 2705, 2706,
        2717, 3501, 3502, 4201, 4703,
        4704, 4744, 4758, 5101, 5151,
        6101, 6102, 6731, 7101, 7102,
        7103, 7201, 7202, 7700, 7701,
        7702, 7709, 7710, 7745, 7746,
        7755, 7756, 7757, 7790, 8703,
        8707, 8708, 8771, 8772, 8776,
        ]
    return code_dict[soil_type - 1]


def get_climatic_zone(elu: int) -> int:
    res = elu // 1000
    assert 0 < res <= 8, "Climatic zone code out of boundary 1~8."
    return res


def get_geologic_zone(elu: int) -> int:
    res = elu % 1000 // 100
    assert 0 < res <= 8, "Geologic zone code out of boundary 1~8."
    return res


def get_third_digit(elu: int) -> int:
    return elu % 100 // 10


def get_fourth_digit(elu: int) -> int:
    return elu % 10


def preprocess(df: pd.DataFrame, mode: str = "train") -> List[np.ndarray]:
    """
    Preprocess the dataframe and return [X, y] without reshuffling nor rescaling.
    X is of shape (n,d+2) and y is of shape (n).
    The first column of X contains the ID of each record,
    whilst the second column contains the area code of each record.
    :param mode whether "train" or "test"
    """

    # soil types
    df.insert(loc=0, column="Soil_Type", value=0)
    for i in range(1, 41):
        column_name = "Soil_Type" + str(i)
        df.loc[df[column_name] == 1, "Soil_Type"] = i
        df.drop(column_name, axis=1, inplace=True)

    df["elu"] = [soil_type_2_elu(i) for i in df["Soil_Type"]]
    df.drop("Soil_Type", axis=1, inplace=True)

    df["climatic_zone"] = [get_climatic_zone(i) for i in df["elu"]]
    df["geologic_zone"] = [get_geologic_zone(i) for i in df["elu"]]
    df["third_digit"] = [get_third_digit(i) for i in df["elu"]]
    df["fourth_digit"] = [get_fourth_digit(i) for i in df["elu"]]
    df.drop("elu", axis=1, inplace=True)

    # wilderness area
    df.insert(loc=0, column="Wilderness_Area", value=0)
    for i in range(1, 5):
        column_name = "Wilderness_Area" + str(i)
        df.loc[df[column_name] == 1, "Wilderness_Area"] = i
        df.drop(column_name, axis=1, inplace=True)

    # reformat
    ids = df["Id"].to_numpy()
    df.drop("Id", axis=1, inplace=True)

    if mode == "train":
        y = df["Cover_Type"].to_numpy()
        df.drop("Cover_Type", axis=1, inplace=True)
    elif mode == "test":
        y = None
    else:
        raise AssertionError("Unexpected mode, try \"train\" or \"test\" instead. ")

    areas = df["Wilderness_Area"].to_numpy()
    df.drop("Wilderness_Area", axis=1, inplace=True)


    X = df.to_numpy()

    # one hot encode
    enc = OneHotEncoder(categories=[np.arange(1, 9), np.arange(1, 9), np.arange(0, 10), np.arange(0, 10)])
    enc.fit(X[:, -4:])

    X = np.concatenate((
        ids.reshape(-1, 1),
        areas.reshape(-1, 1),
        X[:, :-4],
        enc.transform(X[:, -4:]).toarray()
    ), axis=1)

    return [X, y]

In [5]:
df_train = pd.read_csv("./data/train.csv")
X, y = preprocess(df_train)

In [6]:
# LGBMClassifier
pipeline = Pipeline([('scaler', StandardScaler()),
                     ('classifier', lgb.LGBMClassifier())
                     ])

param_grid = {
}

model = GridSearchCV(pipeline, param_grid, scoring='accuracy', verbose=3)
model.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..................................., score=0.883 total time=   2.0s
[CV 2/5] END ..................................., score=0.896 total time=   2.0s
[CV 3/5] END ..................................., score=0.895 total time=   2.3s
[CV 4/5] END ..................................., score=0.892 total time=   1.9s
[CV 5/5] END ..................................., score=0.885 total time=   1.8s


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('classifier', LGBMClassifier())]),
             param_grid={}, scoring='accuracy', verbose=3)

In [7]:
df_test = pd.read_csv("./data/test-full.csv")
X_test, _ = preprocess(df_test, mode="test")
ids = [int(id) for id in X_test[:, 0]]

In [9]:
best_model = model.best_estimator_
y_test = best_model.predict(X_test)
df_result = pd.DataFrame(list(zip(ids, y_test)), columns=['Id', 'Cover_Type'])
df_result.to_csv("./data/lgbm_pred.csv", index=False)