## Imports

In [None]:
import numpy as np
import os
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score 
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, roc_auc_score, auc, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

## Loading the data

In [None]:
# Resolve the "data" folder located one level above the current working directory
_parent_dir = os.path.dirname(os.getcwd())
PATH = os.path.join(_parent_dir, "data")

In [None]:
def load_train_test_data(path=PATH):
    """Read ``train.csv`` and ``test.csv`` from *path*.

    Returns a ``(train, test)`` tuple of DataFrames, in that order.
    """
    train_df, test_df = (
        pd.read_csv(os.path.join(path, filename))
        for filename in ("train.csv", "test.csv")
    )
    return train_df, test_df

In [None]:
# Load the train and test CSVs from the PATH folder
train, test = load_train_test_data(PATH)

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# (rows, columns) of the training data
train.shape

In [None]:
# Number of missing values per column
train.isnull().sum()

In [None]:
# Column dtypes and non-null counts
train.info()

## Preprocessing

In [None]:
# Separate the target column from the predictor columns
target_col = "Cover_Type"
var_resp = train[target_col].copy()
variables = train.drop(columns=[target_col])

In [None]:
# Percentage of rows belonging to each class of the target variable
class_counts = var_resp.value_counts()
class_counts / var_resp.count() * 100

In [None]:
# Split into train and hold-out sets before applying any treatment to the data;
# the split is stratified on the target and seeded so it is reproducible.
split_options = {"test_size": 0.2, "random_state": 2, "stratify": var_resp}
x_train, x_test, y_train, y_test = train_test_split(variables, var_resp, **split_options)

In [None]:
# Feature engineering: derive extra columns from the raw features.
def add_features(data):
    """Return a copy of *data* with the engineered feature columns added.

    Currently adds one column, ``"Teste"``, copied from ``"Teste_1"``.
    The input frame is not modified; callers must use the returned frame.

    Args:
        data: DataFrame containing a ``"Teste_1"`` column.

    Returns:
        A new DataFrame with the additional ``"Teste"`` column.

    Raises:
        KeyError: if *data* has no ``"Teste_1"`` column.
    """
    # NOTE(review): "Teste" / "Teste_1" look like template placeholders -
    # confirm the real source column before running on the actual dataset.
    # Work on a copy so slices produced by train_test_split are not mutated
    # (avoids pandas chained-assignment warnings and hidden side effects).
    data = data.copy()
    data["Teste"] = data["Teste_1"]
    return data

In [None]:
# Apply the same feature engineering to both splits
x_train = add_features(x_train)
x_test = add_features(x_test)

In [None]:
# Pairwise correlation between the features (the "Id" column is left out)
non_id_columns = x_train.columns != 'Id'
corr = x_train.loc[:, non_id_columns].corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Absolute correlation of every feature with the target, weakest first
abs_corr = x_train.corrwith(y_train).abs()
corr_with_y = pd.DataFrame(abs_corr).reset_index()
corr_with_y.columns = ["Feature", "Correlation with Cover_Type"]
corr_with_y = corr_with_y.sort_values("Correlation with Cover_Type", ascending=True)
corr_with_y

In [None]:
# Drop the identifier column (and any other non-feature columns listed here)
columns = ["Id"]

for frame in (x_train, x_test):
    frame.drop(columns, axis=1, inplace=True)

In [None]:
# Preprocessing for numeric and categorical variables
# Numeric columns: fill missing values with the median, then standardise.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" keeps transform() from raising when
# the hold-out or official test data contains a category not seen during fit.
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('oneHot', OneHotEncoder(handle_unknown="ignore")),
    ])

In [None]:
num_attribs = variables.select_dtypes(include=np.number).columns.tolist()
cat_attribs = variables.select_dtypes(include='object').columns.tolist()

In [None]:
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [None]:
train_prepared = full_pipeline.fit_transform(variables)
test_prepared = full_pipeline.transform(x_test)

## Modeling with hyperparameter tuning

In [None]:
# Maximum number of levels in each tree (None is included as an extra candidate)
max_depth = [int(x) for x in np.linspace(10, 200, num=4)]
max_depth.append(None)

# Catalogue of candidate hyperparameter values. The keys cover several
# estimator families, so pick the relevant subset for the model being tuned.
# NOTE(review): learning_rate / gamma / colsample_bytree are presumably meant
# for a gradient-boosting model - confirm before reusing.
param_grid = {
    'learning_rate': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=2000, num=8)],
    'max_depth': max_depth,
    'min_samples_split': [2, 3, 5, 7, 9],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    # The number of features to consider when looking for the best split
    'max_features': ['sqrt', 'log2', None],

    'gamma': [i / 10.0 for i in range(3)],
    'colsample_bytree': [i / 10.0 for i in range(1, 3)],
    # Only for logistic regression
    'class_weight': [None, 'balanced'],
    'C': [0.001, 0.009, 0.01, 0.09, 0.1, 0.5, 1, 3, 5, 10, 50],
    'max_iter': [500, 700, 1000],
}

In [None]:
# Candidate values for the random-forest step. They are defined here because
# the grid below refers to them by name and nothing else binds these names.
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=8)]
max_features = ['sqrt', 'log2', None]

# Search space for the PCA -> random forest pipeline; keys follow the
# "<step name>__<parameter>" convention of sklearn pipelines.
# NOTE: every combination is evaluated with 5-fold CV, so with large
# n_estimators values this search is expensive - trim the lists if needed.
param_grid = [
    {
        'reduce_dim__n_components': [0.5, 0.7, 0.9],
        'classify__max_depth': max_depth,
        'classify__n_estimators': n_estimators,
        'classify__max_features': max_features
    }
]

grid_search = GridSearchCV(
    Pipeline([
        ('reduce_dim', PCA(svd_solver='full')),
        ('classify', RandomForestClassifier(n_jobs=-1))
    ]), param_grid, cv=5, scoring='accuracy', verbose=1, return_train_score=True, n_jobs=-1)

grid_search = grid_search.fit(train_prepared, y_train)

In [None]:
# Hyperparameter combination selected by the grid search
grid_search.best_params_

In [None]:
# Score the best pipeline found by the search on the hold-out split
grid_search.best_estimator_.score(test_prepared, y_test)

In [None]:
# Keep a handle on the best pipeline for the evaluation cells below
model = grid_search.best_estimator_

In [None]:
# The forest is the last step of a PCA -> classifier pipeline, so its
# importances refer to principal components, not to the x_train columns.
# Label them as PC1, PC2, ... instead of pairing them with unrelated names.
importances = model.named_steps['classify'].feature_importances_
feature_imp = pd.DataFrame({
    'Value': importances,
    'Feature': ['PC%d' % i for i in range(1, len(importances) + 1)],
}).sort_values(by='Value').reset_index(drop=True)

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False)[:20])
plt.title('Random Forest Importance of PCA Components - Top 20')
plt.tight_layout()
# Save before show(): once the figure has been shown it may be closed, and a
# later savefig would write an empty image.
# NOTE(review): the file name still says "lgbm"; kept so existing consumers of
# the output path are not broken - rename if nothing depends on it.
plt.savefig('lgbm_importances.png')
plt.show()

In [None]:
# Hard class predictions for both splits
y_train_estimation = model.predict(train_prepared)
y_test_estimation = model.predict(test_prepared)

# Predicted probability taken from column 1 of predict_proba for both splits
# NOTE(review): with more than two classes column 1 is just one of the
# classes - confirm this is the class of interest.
train_proba = model.predict_proba(train_prepared)
test_proba = model.predict_proba(test_prepared)
y_train_score = train_proba[:, 1]
y_test_score = test_proba[:, 1]

In [None]:
# Per-class precision / recall / F1 for the training and hold-out splits
train_report = classification_report(
    y_train,
    y_train_estimation,
    digits=4,
)
print("Train:\n", train_report)

test_report = classification_report(
    y_test,
    y_test_estimation,
    digits=4,
)
print("Test:\n", test_report)

In [None]:
# Class-probability matrix for the hold-out split (used by the per-class ROC cell)
y_predicted = grid_search.predict_proba(test_prepared)

In [None]:
# One-vs-rest ROC for a single class. The model was trained on the
# preprocessed matrix, so score test_prepared (not the raw x_test), and
# binarise the target because roc_curve does not accept multiclass labels.
positive_class = model.classes_[1]  # class whose probability sits in column 1
y_pred_proba = model.predict_proba(test_prepared)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test == positive_class, y_pred_proba)

In [None]:
def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve with its AUC in the legend, next to the diagonal.

    fpr, tpr: false- and true-positive rates, passed to ``plt.plot`` and ``auc``.
    """
    plt.figure()

    roc_area = auc(fpr, tpr)
    curve_label = 'ROC curve (area = %0.3f)' % roc_area
    plt.plot(fpr, tpr, linewidth=2, label=curve_label)

    # Dashed diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])

    plt.xlabel('Taxa de falsos positivos', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.legend(loc="lower right")
    plt.show()

plot_roc_curve(fpr, tpr)

In [None]:
# Compute the ROC curve and its area for each class (one-vs-rest).
# Note: this rebinds fpr / tpr from arrays to dicts keyed by class index.
fpr, tpr, roc_auc = {}, {}, {}
y_test_dummies = pd.get_dummies(y_test, drop_first=False).values
class_indices = range(len(y_test.unique()))
for i in class_indices:
    true_column = y_test_dummies[:, i]
    score_column = y_predicted[:, i]
    fpr[i], tpr[i], thresholds = roc_curve(true_column, score_column)
    roc_auc[i] = auc(fpr[i], tpr[i])

# One figure per class
for i in class_indices:
    curve_label = 'ROC curve (area = %0.2f)' % roc_auc[i]
    plt.figure()
    plt.plot(fpr[i], tpr[i], label=curve_label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

## Aplicamos as mesmas mudanças que fizemos no treino, para o teste oficial

In [None]:
# Same feature engineering as applied to the training data
test = add_features(test)

In [None]:
# Keep the ids aside before the column is dropped; they are needed for the output file
test_ID = test['Id'].copy()

In [None]:
# Dropar colunas identidade e outras
columns = ["Id"]

test.drop(columns, axis = 1, inplace = True)

In [None]:
# Reuse the already-fitted preprocessing (transform only, no refit)
final_test = full_pipeline.transform(test)

In [None]:
# Shape of the transformed official test data
final_test.shape

In [None]:
# Predict the class for each row of the official test data
y_predicted_test = grid_search.predict(final_test)

In [None]:
# Output table: one row per test id with its predicted Cover_Type
dataset = pd.DataFrame({'Id': test_ID, 'Cover_Type': y_predicted_test})

In [None]:
# Preview the first rows of the output table
dataset.head()

In [None]:
# Write the predictions to CSV without the DataFrame index
dataset.to_csv('RandomForest_output.csv', index=False)