In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import std
from pandas import DataFrame, concat, read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [2]:
# Load the cleaned feature matrices, the training labels and the test-set ids.
# The "id" column is kept out of the model inputs: it is dropped from the training
# features and moved into the index of the test features.
X_train = read_csv("../train_features_cleaned.csv").drop(columns=["id"])  # cleaned training features
X_test = read_csv("../test_features_cleaned.csv").set_index("id")  # cleaned test features, indexed by id
# NOTE(review): the labels are paired with X_train purely by row position -- confirm that
# both files share the same row order.
y_train = read_csv("../data/training_set_labels.csv")["status_group"]  # training set labels
# Ids of the *test* rows (despite the `y_train_` prefix), as a one-column frame used to
# label the predictions. Only the "id" column is parsed, rather than loading the whole
# test file a second time and discarding every other column.
y_train_id = read_csv("../test_features_cleaned.csv", usecols=["id"])  # test set ids

In [3]:
# Use a grid search over min_samples_split and n_estimators to pick hyperparameters.
# NOTE: as this is a small grid (3 x 3 candidates, 2-fold CV), it is unlikely that these
# are the optimal hyperparameters.
rf = RandomForestClassifier(
    criterion="gini",
    n_estimators=500,  # placeholder only: overridden by every candidate in param_grid
    # "sqrt" is what "auto" meant for classifiers; the "auto" alias was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it raises an error.
    max_features="sqrt",
    oob_score=True,
    random_state=1,  # fixed seed so repeated runs build the same forests
    n_jobs=-1,
)

param_grid = {"min_samples_split": [4, 6, 8], "n_estimators": [500, 700, 1000]}

gs = GridSearchCV(
    estimator=rf, param_grid=param_grid, scoring="accuracy", cv=2, n_jobs=-1
)

# fit() returns the fitted search object itself, so there is no need to rebind `gs`.
gs.fit(X_train, y_train.values.ravel())

# Mean cross-validated accuracy of the best candidate, followed by its parameters.
print(gs.best_score_)
print(gs.best_params_)

0.8005892255892255
{'min_samples_split': 6, 'n_estimators': 700}


In [5]:
# Fit a random forest on the full training set with the hyperparameters selected by the
# grid search (min_samples_split=6, n_estimators=700).
rf = RandomForestClassifier(
    criterion="gini",
    min_samples_split=6,
    n_estimators=700,
    # "sqrt" is what "auto" meant for classifiers; the "auto" alias was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it raises an error.
    max_features="sqrt",
    oob_score=True,  # needed for the out-of-bag accuracy printed below
    random_state=1,  # fixed seed so repeated runs build the same forest
    n_jobs=-1,
)

rf.fit(X_train, y_train.values.ravel())
# Out-of-bag accuracy estimate of the fitted forest, to four decimals.
print("%.4f" % rf.oob_score_)

0.8126


In [4]:
# NOTE(review): disabled scratch code for a manual stratified k-fold evaluation of `rf`;
# nothing in this cell executes. Before re-enabling it, note that the inner
# (double-commented) loop:
#   - enumerates `kfold` directly instead of `kfold.split(X_train, y_train)`,
#   - references `np`, while the import cell only brings in `std` from numpy,
#   - indexes `y_train['status_group']` although `y_train` is already that single column.
# kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)
# scores = []
# for train_index, test_index in kfold.split(X_train, y_train):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
#     y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
# # for k, (train, test) in enumerate(kfold):
# #     rf.fit(X_train.values[train], y_train.values.ravel()[train])
# #     score = rf.score(X_train.values[test], y_train.values.ravel()[test])
# #     scores.append(score)
# #     print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.array(y_train['status_group'][train].value_counts()), score))

In [107]:
# Use the trained model to make predictions for the test set.
# The ids come from X_test's own index (the "id" column of the test file), so every
# prediction is paired with the row it was computed from, rather than relying on a
# positional index merge with a separately loaded id frame.
yhat = DataFrame({"id": X_test.index, "status_group": rf.predict(X_test)})

In [108]:
# Preview the first five (id, status_group) rows of the prediction frame
yhat.head(n=5)

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
...,...,...
14845,39307,non functional
14846,18990,functional
14847,28749,functional
14848,33492,functional


In [109]:
# Save the predictions as the submission CSV; the row index is left out so the file
# contains only the columns of yhat.
yhat.to_csv(path_or_buf="submission_1_andho.csv", index=False)