In [None]:
# load libraries
import yaml
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import geopandas
import numpy as np
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

### Load and prepare data

In [None]:
# Load the model configuration from the YAML file
with open('../conf/model_config.yaml', 'r') as config_file:
    model_config = yaml.safe_load(config_file)

# Build the CSV location from the configured folder and file name, then read it
model_section = model_config['model']
dataset_path = '..' + model_section['loc'] + model_section['file']
dataset = pd.read_csv(dataset_path)

# Feature and label column selections, as listed in the config
meta_section = model_config['meta']
predictor = meta_section['predictors']
target = meta_section['target']

In [None]:
# Show the predictor selection read from model_config['meta']['predictors']
predictor

In [None]:
# Show the target selection read from model_config['meta']['target']
target

In [None]:
# Show the column whose name is the literal string 'target' (not the `target` variable).
# NOTE(review): raises KeyError if no column has that exact name — confirm that
# dataset[target] was not the intended expression.
dataset['target']

In [None]:
# Display the loaded DataFrame
dataset

In [None]:
# List the column labels of the loaded DataFrame
dataset.columns

In [None]:
# Select the feature columns and the label column(s) from the loaded data
X = dataset[predictor]
y = dataset[target]
print('X Shape:', X.shape)
print('y Shape:', y.shape)

# Hold out a test set: the fraction comes from the config, the split is
# stratified on y, and the seed is fixed so the split is repeatable
holdout_fraction = model_config['parameter']['test_size']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=holdout_fraction,
    random_state=42,
    stratify=y,
)

# Report the resulting split sizes
split_shapes = (X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print('X_train, X_test, y_train, y_test shapes:', *split_shapes)
print("size of training dataset = ", len(X_train))
print("size of test dataset = ", len(X_test))

In [None]:
# Count how often each label value occurs in the training labels.
# NOTE(review): indexing y_train by `target` only works if `target` is a list of
# column labels (making y_train a DataFrame) — confirm against the config.
y_train[target].value_counts()

In [None]:
# Count how often each label value occurs in the test labels.
# NOTE(review): indexing y_test by `target` only works if `target` is a list of
# column labels (making y_test a DataFrame) — confirm against the config.
y_test[target].value_counts()

### Prepare model tuning

In [None]:
# Create a random forest classifier with 100 trees. The seed is fixed so the
# fitted model, and every metric, plot and pickle derived from it, is
# reproducible under Restart & Run All.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training split
clf.fit(X_train, y_train)

In [None]:
# Evaluation scheme: stratified 10-fold cross-validation, repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Score the classifier by ROC AUC on the training split
scores = cross_val_score(
    clf,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)

# Report the average score over all folds and repeats
mean_auc = scores.mean()
print('Mean ROC AUC: %.3f' % mean_auc)

### Model Predictions

In [None]:
# Predict labels for the held-out test features with the fitted classifier
y_pred=clf.predict(X_test)

In [None]:
# scikit-learn's metrics module provides the accuracy computation
from sklearn import metrics

# Share of test samples whose predicted label equals the true label
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

In [None]:
# Evaluation helpers; confusion_matrix and classification_report are also used
# by later cells, so this cell must run before them.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# View confusion matrix (raw counts) for test data and predictions
confusion_matrix(y_test, y_pred)

In [None]:
import seaborn as sns

# Confusion matrix as row-wise fractions: every row is divided by its own sum,
# so each row of the plotted matrix adds up to 1
matrix = confusion_matrix(y_test, y_pred)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# Build the plot
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# Add labels to the plot
# NOTE(review): assumes the class labels sort in the order 'low', 'nominal' —
# confirm against clf.classes_
class_names = ['low','nominal']
# Heatmap cell i spans [i, i + 1] on both axes, so a label belongs at i + 0.5.
# Both axes use the same centres; previously the x labels sat on the cell edges.
tick_centers = np.arange(len(class_names)) + 0.5
plt.xticks(tick_centers, class_names, rotation=25)
plt.yticks(tick_centers, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()

In [None]:
# Print the classification report for the test labels and predictions
# (classification_report is imported in an earlier cell)
print(classification_report(y_test, y_pred))

## Feature Importances

In [None]:
# Pair every feature column with its importance from the fitted forest,
# then order the rows from least to most important
feature_table = pd.DataFrame(
    {"Feature": X.columns, "Importance": clf.feature_importances_}
)
importance = feature_table.sort_values(by="Importance")

In [None]:
# Display the feature-importance table, sorted by ascending importance
importance

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, figsize=(14,6))

# add a title
ax.set_title('Feature Importances', fontdict={'fontsize': '13', 'fontweight' : '3'})

# Plot the 12 MOST important features (the previous nsmallest(12) selected the
# 12 least important ones). Sorting ascending keeps the largest bar at the top
# of the horizontal chart, so the figure is unchanged when there are 12 or
# fewer features.
top_features = pd.Series(clf.feature_importances_, index=X.columns).nlargest(12)
top_features.sort_values().plot(kind='barh', ax=ax);

## Pickling the model for use later on with predictions

In [None]:
# Save the fitted classifier into the models folder
path = '../model/'
filename = 'RF_Classifier_Model.sav'

# Use a context manager so the file is flushed and closed even if pickling
# fails; the previous bare open(...) never closed the handle.
with open(path + filename, 'wb') as model_file:
    pickle.dump(clf, model_file)