In [1]:
# importing required libraries
import os
from pprint import pprint

import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import scale

In [None]:
# --- run configuration ---
# Name of the target column the model is trained to predict.
variable = ''
# True: apply the square transform in the normalisation step; False: use the data as loaded.
normalize_data = True
# True: standardise features and target with scale() before splitting; False: leave them unscaled.
scale_ = True

In [None]:
# Locations of the input CSV and of the plot output folder, all rooted at base_path.
base_path = ''
path = os.path.join(base_path, 'result')  # results directory
file_path, plot = (os.path.join(path, leaf) for leaf in ('final_df.csv', 'plot'))

In [None]:
# Load the dataset and give the 'Unnamed: 0' column of the CSV the name 'datacamp'.
data = pd.read_csv(file_path).rename(columns={'Unnamed: 0': 'datacamp'})

In [None]:
# normalize data
# NOTE: when normalize_data is set this cell rebinds `data`, so re-running it
# without re-running the load cell squares the already squared values again.
if variable not in data.columns:
    # Fail early with a readable message instead of a bare KeyError further down.
    raise KeyError(f"target column {variable!r} not found in data; set `variable` in the configuration cell")

if normalize_data:
    data_final = data.drop(columns=['datacamp'])
    # Square Transform, applied to every remaining column (features and target alike).
    square_transform = FunctionTransformer(lambda x: x ** 2)
    ct = ColumnTransformer(
        transformers=[('square_transform', square_transform, list(range(len(data_final.columns))))],
        remainder='passthrough',
    )
    square_X = pd.DataFrame(ct.fit_transform(data_final), columns=list(data_final.columns))

    # Put the identifier column back in front of the transformed columns.
    square_X.insert(0, 'datacamp', data['datacamp'])
    data = square_X

# separate the independent and target variable (identical for both settings,
# because `data` now refers to whichever frame is in use)
x = data.drop(columns=[variable, 'datacamp'])
y = data[variable]

In [None]:
# scaling data
# NOTE(review): scale() is applied to the full dataset before the split below, so
# statistics of the test rows influence the training data; consider fitting a
# scaler on the training split only.
if scale_:
    x_scale = scale(x)
    y_scale = scale(y)
    x_final = pd.DataFrame(x_scale, columns=list(x.columns))
    y_final = pd.Series(y_scale)
    # Rebuild `data` from the scaled values so that later cells which look rows up
    # in `data` see the same numbers the model is trained on.
    data = pd.DataFrame({'datacamp': data['datacamp']})
    data = data.join(x_final)
    data[variable] = y_final
else:
    x_final = x
    y_final = y

# Split dataset into training set and test set (70% training and 30% test).
# random_state is fixed so the split, and every metric derived from it, is
# identical on each run of the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(x_final, y_final, test_size=0.3, random_state=42)

In [None]:
# Baseline random forest regression model: 10 trees, fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(random_state=42, n_estimators=10).fit(X_train, Y_train)

# Predictions of the baseline model for the test features.
y_pred = model.predict(X_test)

In [None]:
# score() of the fitted regressor on the test split (reported as R_square by evaluate() in this notebook)
score = model.score(X_test, Y_test)
print(score)

In [None]:
# Mean squared error of the baseline predictions on the test split.
# (`metrics` is imported together with the other libraries in the first cell.)
print(f'mean_squared_error: {metrics.mean_squared_error(Y_test, y_pred)}')

In [None]:
# plotting predicted value and true value of y
# x axis: the 'datacamp' label of every test row, looked up by row label.
# Y_test keeps the row labels of `data`, so this is a single vectorised lookup and
# stays correct when two rows share the same target value (matching rows by
# target value always returned the first of them).
x_axis = data.loc[Y_test.index, 'datacamp'].tolist()

# make sure the output folder exists before saving into it
os.makedirs(plot, exist_ok=True)

# plot scatter plot
plt.rcParams["figure.figsize"] = [10, 10] # set the figure size to solve the overlapping ticks problem
plt.rcParams["figure.autolayout"] = True
matplotlib.rcParams['legend.fontsize'] = 15
plt.scatter(x_axis, Y_test, color='red', label=f'{variable} Measured')
plt.scatter(x_axis, y_pred, color='green', label=f'{variable} predicted')
plt.title(f'Scatter plot of {variable} predicted and {variable} measured\n', fontsize=20)
plt.xlabel('\ndata camp', fontsize=15)
plt.xticks(fontsize=15)
# Let matplotlib choose the tick positions: a fixed 1800-3000 range does not fit
# the target once it has been squared and/or standardised by the cells above.
plt.yticks(fontsize=12)
plt.ylabel(f'{variable}\n', fontsize=15)
plt.margins(0.5)
plt.legend(loc='best', fancybox=True, shadow=True)
plt.grid()
# os.path.join instead of a hand-written '\' separator ('\p' is an invalid escape
# sequence and only worked as a path separator on Windows)
plt.savefig(os.path.join(plot, 'pred_vs_true.jpeg'))

In [None]:
# Candidate hyper-parameter values for the randomized search.
# Number of trees in the forest
n_estimators = [int(n) for n in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 1.0 means "all features": it replaces 'auto', which had that meaning for
# regressors and is no longer accepted by scikit-learn >= 1.3.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = grow until leaves are pure/minimal)
max_depth = [int(d) for d in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune. It is seeded as well: the search's own
# random_state only fixes which parameter combinations are drawn, not the
# randomness inside each forest, so without it the results change between runs.
rf = RandomForestRegressor(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, Y_train)

In [None]:
def evaluate(model, test_features, test_labels):
    """Print error metrics for a fitted regressor and return its "accuracy".

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``score``.
    test_features : feature matrix handed to ``model.predict`` and ``model.score``.
    test_labels : 1-D array-like with the true target values.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is the mean absolute percentage error (in
        percent) over the samples whose true value is non-zero; ``nan`` when
        there is no such sample.
    """
    predictions = np.asarray(model.predict(test_features), dtype=float)
    labels = np.asarray(test_labels, dtype=float)
    errors = np.abs(labels - predictions)
    # Divide by |label|: dividing by the signed label lets negative targets
    # (e.g. standardised data) cancel positive ones and yields a meaningless MAPE.
    # Zero labels are skipped because their percentage error is undefined.
    nonzero = labels != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / np.abs(labels[nonzero]))
    else:
        mape = float('nan')
    accuracy = 100 - mape
    print('Model Performance')
    # The error is in the units of the target; no fixed unit is printed.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print(f'R_square: {model.score(test_features, test_labels)}')
    # Computed with numpy so the function does not depend on an import made in another cell.
    print(f'mean_squared_error: {np.mean((labels - predictions) ** 2)}')
    return accuracy

In [None]:
# Baseline forest (10 trees, fixed seed), trained and then scored on the test split.
base_model = RandomForestRegressor(random_state=42, n_estimators=10).fit(X_train, Y_train)
base_accuracy = evaluate(base_model, X_test, Y_test)

# Best estimator found by the randomized search, scored the same way.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, Y_test)

# Change of the tuned accuracy relative to the baseline, in percent.
relative_gain = 100 * (random_accuracy - base_accuracy) / base_accuracy
print('Improvement of {:0.2f}%.'.format(relative_gain))

In [None]:
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 30, 40, 50, 60, 70],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [4, 5, 7],
    'n_estimators': [100, 200, 300, 400, 500, 1000]
}
# Create a base model; seeded so that the cross-validation scores, and therefore
# the selected parameters, do not change from run to run.
rf = RandomForestRegressor(random_state=42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, Y_train)
grid_search.best_params_

In [None]:
# Score the best grid-search estimator on the test split: base_accuracy was
# measured on the test split too, and scoring on the rows the model was fitted
# on would overstate the improvement.
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, Y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

In [None]:
# Final model training
# NOTE(review): the hyper-parameters are hard-coded; presumably they were copied
# from grid_search.best_params_ of an earlier run - confirm they are still current.
# random_state is fixed, like for the other forests in this notebook, so the
# exported model and its feature importances are reproducible.
model = RandomForestRegressor(n_estimators=100,
                              min_samples_split=4,
                              min_samples_leaf=1,
                              max_features='sqrt',
                              max_depth=20,
                              bootstrap=True,
                              random_state=42)

RF_model = model.fit(X_train, Y_train)


# Feature importance analysis: one row per training column, most important first
fi = pd.DataFrame({'feature': list(X_train.columns),
                   'importance': RF_model.feature_importances_}
                  ).sort_values('importance', ascending=False)

fi.head()

In [None]:
# score() of the final model on the test split (reported as R_square by evaluate() in this notebook)
score = RF_model.score(X_test, Y_test)
print(score)

In [None]:
# export model
model_filename = 'final_RF.sav'
# NOTE: this rebinds `path` (previously the results directory); with the empty
# string the model file is written to the current working directory.
path = ""
# `model_ path` (with a space) was a syntax error; the name used by dump() is model_path
model_path = os.path.join(path, model_filename)
joblib.dump(RF_model, model_path)