## LightGBM GPU Installation

In [None]:
# Remove the LightGBM package currently present in site-packages, then clone
# the LightGBM sources (with submodules) so they can be rebuilt below.
!rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
# Install the Boost development libraries (presumably needed by the GPU build - confirm).
!apt-get install -y -qq libboost-all-dev

## Build and re-install LightGBM with GPU support

In [None]:
%%bash
# Stop at the first failing step so a broken configure/compile is not hidden
# by the commands that follow it.
set -e
cd LightGBM
# -f: a fresh clone has no build directory yet, which must not be an error.
rm -rf build
mkdir -p build
cd build
# Configure a GPU build against the OpenCL library and headers under /usr/local/cuda.
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j$(nproc)

In [None]:
!cd LightGBM/python-package/;python3 setup.py install --precompile

In [None]:
# Create the OpenCL vendors directory and write an ICD file that names libnvidia-opencl.so.1.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# Remove the cloned source tree.
!rm -r LightGBM

## Imports

In [None]:
import numpy as np
import os
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score 
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, roc_auc_score, auc, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

## Loading the data

In [None]:
# Path of the "data" folder located in the parent of the current working directory
PATH = os.path.join(os.path.dirname(os.getcwd()), "data")

In [None]:
def load_train_test_data(path=PATH):
    """Read ``train.csv`` and ``test.csv`` from the directory *path*.

    Returns a ``(train, test)`` tuple of DataFrames, in that order.
    """
    train_df, test_df = (
        pd.read_csv(os.path.join(path, file_name))
        for file_name in ("train.csv", "test.csv")
    )
    return train_df, test_df

In [None]:
# Load both CSV files from PATH into DataFrames
train, test = load_train_test_data(PATH)

## Preprocessing

In [None]:
# print is a function in Python 3; the statement form is a SyntaxError there.
print("Train shape", train.shape)
print("Test shape", test.shape)

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Preview the first rows of the official test data
test.head()

In [None]:
# Print a summary of the training DataFrame
train.info()

In [None]:
# Separate the target column from the predictors
var_resp = train["Cover_Type"].copy()
variables = train.drop(columns="Cover_Type")

In [None]:
# Percentage of rows belonging to each class of the response variable
class_counts = var_resp.value_counts()
class_counts.div(var_resp.count()).mul(100)

In [None]:
# Split into training and hold-out sets before any treatment of the data, so that
# imputation and other fitted steps are not biased by the hold-out rows.
# Stratifying is useful for small datasets, unbalanced datasets or multiclass classification.
# For small datasets, k-fold or leave-one-out is better than a single hold-out split.
x_train, x_test, y_train, y_test = train_test_split(variables, var_resp, test_size=0.2, random_state=2, stratify=var_resp)

## EDA

In [None]:
# Count nulls per column and show only the columns that contain any
null_counts = x_train.isnull().sum()
null_counts[null_counts > 0]

In [None]:
# Check constant features to delete them.
# nunique() has to count distinct values per column (the default axis=0);
# with axis=1 it counts per row and the resulting index holds row labels
# instead of feature names.
feats_counts = x_train.nunique(dropna=False)

constant_features = feats_counts.loc[feats_counts == 1].index.tolist()
print(constant_features)

In [None]:
# Check duplicated features to delete them.
# duplicated must be called; after the transpose each row is one feature, so
# the result flags features whose values repeat an earlier feature.
x_train.T.duplicated()

In [None]:
# matshow() needs the matrix to draw; pass it the correlation matrix directly.
plt.matshow(x_train.corr())

In [None]:
# Plot the per-column means in ascending order, one dot per column
x_train.mean().sort_values().plot(style='.')

In [None]:
# Correlation between the base variables, leaving the 'Id' column out
non_id_columns = [col for col in x_train.columns if col != 'Id']
corr = x_train[non_id_columns].corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Absolute correlation of every variable with the response, strongest first
corr_with_y = (
    x_train.corrwith(y_train)
    .abs()
    .to_frame()
    .reset_index()
)
corr_with_y.columns = ["Feature", "Correlation with Target"]
corr_with_y = corr_with_y.sort_values(by="Correlation with Target", ascending=False)
corr_with_y.head(10)

In [None]:
# Feature engineering:
#
def add_features(data):
    """Add the engineered columns to *data* and return it.

    The frame is modified in place: column "Teste" is created as a copy of
    column "Teste_1". The same object is returned so the call can be used
    in an assignment.

    Raises:
        KeyError: if *data* has no "Teste_1" column.
    """
    data["Teste"] = data["Teste_1"]
    # The return must be indented into the function body; at column 0 it is
    # a "'return' outside function" SyntaxError.
    return data

In [None]:
# Apply the same feature engineering to the training and the hold-out split
x_train = add_features(x_train)
x_test = add_features(x_test)

In [None]:
# Drop identity and other columns.
# constant_features is itself a list, so it is concatenated to obtain one flat
# list of labels; nesting it inside another list is not a valid column label.
columns = ["Id"] + constant_features

x_train.drop(columns, axis = 1, inplace = True)
x_test.drop(columns, axis = 1, inplace = True)

In [None]:
# Preprocessing for numeric and categorical variables.
# Tree-based models don't depend on scaling.
# Non-tree-based models depend heavily on scaling.
# We use preprocessing (StandardScaler or MinMaxScaler) to bring all features to one scale, so that their initial impact on the model will be roughly similar.
# Log or SQRT functions help linear models and neural networks by making large values approach the mean and small values more distinguishable.
num_pipeline = Pipeline([
        ('selector', VarianceThreshold()),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])

# One-hot encoding can be useful for linear methods, kNN and neural networks.
# But for tree methods, it can increase the complexity of the model.
# If the number of classes in each one-hot column is large, we can store the final base in a sparse array, so we don't keep the zeros in memory.
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('oneHot', OneHotEncoder())
    ])

In [None]:
# Take the column lists from x_train, the frame that went through feature
# engineering and column dropping. `variables` still contains the dropped
# columns (e.g. "Id") and lacks the engineered ones, so lists built from it
# would not match the training frame.
num_attribs = x_train.select_dtypes(include=np.number).columns.tolist()
cat_attribs = x_train.select_dtypes(include='object').columns.tolist()

In [None]:
# Route the numeric columns through num_pipeline and the object columns through cat_pipeline
preprocessor = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

## Modeling with hyperparameter tuning

In [None]:
# Maximum number of levels in each tree.
# Decrease it to prevent overfitting.
base_depths = [int(x) for x in np.linspace(2, 50, num = 4)]
# "No depth limit" is spelled None in scikit-learn and -1 in LightGBM, so each
# library gets its own list. (list.append() also accepts a single item only,
# so appending both values in one call raises TypeError.)
max_depth = base_depths + [None]
max_depth_lgbm = base_depths + [-1]

# Grid for the scikit-learn forest classifiers. Only parameters those
# estimators accept are listed; boosting-only settings (learning_rate, gamma,
# colsample_bytree) would make GridSearchCV fail with an invalid-parameter error.
param_grid_extratrees = {
    'reduce_dim__n_components': [0.8, 0.9],
    # Number of trees in the estimator. Increase to prevent overfitting
    'classify__n_estimators': [int(x) for x in np.linspace(start = 100, stop = 2000, num = 8)],
    'classify__max_depth': max_depth,
    'classify__min_samples_split': [2, 3, 5, 7, 9],
    # Never use 1: the tree could end up with one leaf per sample, which will overfit.
    'classify__min_samples_leaf': [2, 4, 6, 8],
    # The number of features to consider when looking for the best split.
    'classify__max_features': ['sqrt', 'log2', None],
    'classify__class_weight': ['balanced', 'balanced_subsample'],
}

# NOTE(review): the 'l1' penalty needs a solver that supports it (e.g.
# 'liblinear' or 'saga'); the default solver depends on the scikit-learn
# version - confirm before using this grid.
param_grid_logreg = [
    {
        'reduce_dim__n_components': [0.8, 0.9],
        'classify__class_weight': [None, 'balanced'],
        'classify__C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
        'classify__max_iter': [int(x) for x in np.linspace(start = 100, stop = 2000, num = 8)],
        'classify__penalty': ['l1', 'l2']
    }
]

# https://github.com/Microsoft/LightGBM/issues/695#issuecomment-315591634
param_grid_lgbm = [
    {
        'reduce_dim__n_components': [0.8, 0.9],
        'classify__extra_trees': [False, True],
        'classify__learning_rate': [0.0001, 0.001, 0.01, 0.1],
        'classify__n_estimators': [int(x) for x in np.linspace(start = 100, stop = 2000, num = 8)],
        # Pipeline parameters must carry the step prefix, like every other key here.
        'classify__max_depth': max_depth_lgbm
    }
]

In [None]:
# Full modelling pipeline: preprocessing, PCA, then the forest classifier
pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('reduce_dim', PCA(svd_solver='full')),
        ('classify', RandomForestClassifier(n_jobs=-1))
])

# GridSearchCV scored on accuracy with 5-fold cross-validation.
# The forest grid is used because the pipeline's classifier is a
# RandomForestClassifier; no plain `param_grid` is defined in this notebook.
grid_search = GridSearchCV(
    pipe, param_grid_extratrees, cv=5, scoring='accuracy', verbose=1, return_train_score=True, n_jobs=-1)

# Fit on the training split: x_train has the same rows as y_train, whereas
# `variables` holds every row of the dataset and would not line up with it.
grid_search = grid_search.fit(x_train, y_train)

In [None]:
# https://stackoverflow.com/a/33504368
# Run the hold-out split through each fitted transformer step, in pipeline order
test_prepared = x_test
for step_name in ('preprocessor', 'reduce_dim'):
    test_prepared = grid_search.best_estimator_.named_steps[step_name].transform(test_prepared)

In [None]:
# Hyperparameter combination selected by the grid search
grid_search.best_params_

In [None]:
# Score the fitted classifier step on the already-transformed hold-out split
grid_search.best_estimator_.named_steps['classify'].score(test_prepared, y_test)

In [None]:
# Keep the best pipeline found by the grid search
model = grid_search.best_estimator_

In [None]:
# The classifier is trained on the output of the PCA step, so every importance
# belongs to a principal component, not to an original column of x_train.
# Label them accordingly instead of zipping them with the column names.
importances = model.named_steps['classify'].feature_importances_
feature_imp = pd.DataFrame({
    'Value': importances,
    'Feature': ['PC{}'.format(i + 1) for i in range(len(importances))],
})

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False)[:20])
plt.title('RandomForest Feature Importance - Top 20')
plt.tight_layout()
# Save before show(): once the figure has been shown it may already be closed,
# and savefig would then write an empty image.
plt.savefig('RandomForest_importances.png')
plt.show()

In [None]:
# Predicted classes for the transformed hold-out split
y_test_estimation = model.named_steps['classify'].predict(test_prepared)
# NOTE(review): [:, 1] keeps only the second column of predict_proba; with more
# than two classes this is the probability of a single class - confirm intent.
y_test_score = model.named_steps['classify'].predict_proba(test_prepared)[:,1]
print("Test score: ",y_test_score)

In [None]:
# Classification report on the hold-out split, with 4 decimal digits
test_report = classification_report(y_test, y_test_estimation, digits=4)
print("Test:\n",test_report)

In [None]:
# Binary target: roc_curve needs a 1-D score, so keep the probability of the
# second class (column 1) and compute the curve here.
# More than two classes: keep the full probability matrix (one column per
# class) so that a ROC curve can be computed for each class.
y_pred_proba = model.named_steps['classify'].predict_proba(test_prepared)
if len(y_test.unique()) == 2:
    y_pred_proba = y_pred_proba[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

In [None]:
# Compute ROC curve and ROC area for binary class
def plot_roc_curve_binary(fpr, tpr):
    """Plot a single ROC curve with its area in the legend.

    Args:
        fpr: false positive rates, as returned by ``roc_curve``.
        tpr: true positive rates, as returned by ``roc_curve``.
    """
    plt.figure()
    plt.plot(fpr, tpr, linewidth=2, label='ROC curve (area = %0.3f)' % auc(fpr, tpr))
    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.legend(loc="lower right")
    plt.show()

# fpr and tpr only exist when the target is binary, so guard the call to
# avoid a NameError on a multiclass target.
if len(y_test.unique()) == 2:
    plot_roc_curve_binary(fpr, tpr)

In [None]:
# Compute ROC curve and ROC area for each class
def plot_roc_curve_multiclass(y_test, y_pred_proba):
    """Plot one ROC curve per class (one-vs-rest).

    Args:
        y_test: Series with the true class labels.
        y_pred_proba: 2-D array of predicted probabilities, indexed as
            ``[:, i]`` for the i-th class; the column order is assumed to
            match the column order of ``pd.get_dummies(y_test)``.
    """
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    # One indicator column per class
    y_test_dummies = pd.get_dummies(y_test, drop_first=False).values
    n_classes = len(y_test.unique())
    for i in range(n_classes):
        # Use the function's own argument; `y_predicted` is not defined anywhere.
        fpr[i], tpr[i], _ = roc_curve(y_test_dummies[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Plot of a ROC curve for a specific class
    for i in range(n_classes):
        plt.figure()
        plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic example')
        plt.legend(loc="lower right")
        plt.show()

## We apply the same changes we made in training to the official test

In [None]:
# Apply the feature engineering to the official test data
test = add_features(test)

In [None]:
# Keep a copy of the Id column for the output file
test_ID = test['Id'].copy()

In [None]:
# Drop the identity column from the official test data
columns = ["Id"]

test.drop(columns, axis = 1, inplace = True)

In [None]:
# Run the official test data through each fitted transformer step, in pipeline order
final_test = test
for step_name in ('preprocessor', 'reduce_dim'):
    final_test = grid_search.best_estimator_.named_steps[step_name].transform(final_test)

In [None]:
# Shape of the transformed official test data
final_test.shape

In [None]:
# Predict the class of every row of the transformed official test data
y_predicted_test = grid_search.best_estimator_.named_steps['classify'].predict(final_test)

In [None]:
# Output table: the saved Ids next to the predicted Cover_Type
dataset = pd.DataFrame({'Id': test_ID, 'Cover_Type': y_predicted_test})

In [None]:
# Preview the first rows of the output table
dataset.head()

In [None]:
# Write the predictions to CSV without the DataFrame index
dataset.to_csv('RandomForest_output.csv', index=False)