In [None]:
## Load data ##
%matplotlib inline
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
%matplotlib inline

# Read the feature table (CSV) and the target table (JSON).
df1 = pd.read_csv('feature.csv', sep=',')
df2 = pd.read_json('target.json')

# Substitute zero for every NaN and +/-infinity entry, then renumber rows from 0.
# NOTE(review): 'kL' is log10-transformed further down; a zero-filled target
# has no finite log10.
non_finite = (np.inf, -np.inf, np.nan)
dfX = df1.replace(non_finite, 0)
dfX = dfX.reset_index(drop=True)
dfy = df2.replace(non_finite, 0)
dfy = dfy.reset_index(drop=True)

## GridSearchCV to train random forest model ##
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# The model is trained on log10(kL). log10 of a zero, negative or non-finite
# target is -inf/NaN, which scikit-learn rejects at fit time, so such rows are
# removed from both the features and the target before the transform.
X = dfX
y1 = dfy['kL'].values
usable = np.isfinite(y1) & (y1 > 0)
if not usable.all():
    print("Dropping {} rows with non-positive or non-finite kL".format((~usable).sum()))
    X = dfX[usable]
    y1 = y1[usable]
y = np.log10(y1)

# Hold out 10% of the rows as a test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

# Regularization parameters to search over.
# max_features=1.0 (use every feature) is what the legacy 'auto' option meant
# for regressors; 'auto' itself is no longer accepted by recent scikit-learn.
param_grid = {
    'max_depth': range(5, 16),
    'n_estimators': [100],
    'max_features': [1.0, 'sqrt'],
    'min_samples_split': range(2, 15),
    'random_state': [1],
}

# Exhaustive search with 10-fold cross-validation on the training split,
# using all available cores.
forestr = RandomForestRegressor()
grid_search = GridSearchCV(forestr, param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Forest best parameters: {}".format(grid_search.best_params_))
print("Forest best cross-validation score: {:.3f}".format(grid_search.best_score_))

#Plot the feature importance for Random Forest Regressor(best cross validation score)
plt.figure(dpi=100)
name=X.columns.values

# Refit a forest with fixed hyperparameters on the training split.
# max_features=1.0 (use every feature) replaces the legacy 'auto' option, which
# meant the same thing for regressors but is no longer accepted by recent
# scikit-learn releases.
forestr = RandomForestRegressor(max_depth=13,n_estimators=100,max_features=1.0,
                                min_samples_split=3,random_state=1).fit(X_train, y_train)

# Label each importance with its feature column and draw the (up to) 29
# largest as a horizontal bar chart.
feat_importance=pd.Series(forestr.feature_importances_,index=name)
feat_importance.nlargest(29).plot(kind='barh')

## Random Forest Regressor(best cross validation score) to predict target
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr

plt.figure(figsize=(6.5,6.5))

# Same test_size and random_state as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,random_state=1)

##Predict##
# max_features=1.0 (use every feature) replaces the legacy 'auto' option, which
# meant the same thing for regressors but is no longer accepted by recent
# scikit-learn releases.
# NOTE(review): min_samples_split is left at its default here, whereas the
# feature-importance model above passes min_samples_split=3 -- confirm which
# one is intended for the saved model.
RF = RandomForestRegressor(max_depth=13,n_estimators=100,max_features=1.0,
                           random_state=1).fit(X_train, y_train)
#save model using joblib
import joblib
joblib.dump(RF,'FinalKLModel.joblib')

y_RF_predict1 = RF.predict(X_train)
y_RF_predict2 = RF.predict(X_test)

##Pearson##
# Correlation between predicted and actual targets on the TRAINING split;
# reuses the predictions computed above instead of predicting again.
Pearson = pearsonr(y_RF_predict1, y_train)

##Plot predicted value versus the actual target value##
# x axis: actual log10 target, y axis: model prediction.
Xa= np.array(y_train)
ya= y_RF_predict1
plt.plot(Xa,ya,'o',markersize=10, color='dodgerblue',mec='k',alpha=0.6,label='Training set')
Xb= np.array(y_test)
yb= y_RF_predict2
plt.plot(Xb,yb,'o',markersize=10, color='gold',mec='k',alpha=0.5,label='Test set')

##Plot a line with slope=1##
# Points on this line are predicted exactly.
xx=np.linspace(-0.5,4,1000)
yy=xx
plt.plot(xx,yy,'--r',linewidth=1.5,label= 'slope=1')

plt.legend(loc='upper left',fontsize=13)
# Labels follow the plotted data: actual values on x, predictions on y.
plt.xlabel('log(Actual values)',fontsize=13)
plt.ylabel('log(Predicted values)',fontsize=13)
plt.xlim(-0.5, 4)
plt.ylim(-0.5, 4)
# Thicken all four axes spines and the tick marks.
ax=plt.gca()
for side in ('bottom', 'left', 'right', 'top'):
    ax.spines[side].set_linewidth(1.5)
plt.tick_params(axis='both',width=1.5,length=7,labelsize=13)
plt.tight_layout()
plt.show()

# RF.score is the regressor's default score on each split.
print("Training set score: {:.2f}".format(RF.score(X_train, y_train)))
print("Test set score: {:.2f}".format(RF.score(X_test, y_test)))
print("Pearson correlation coef, p-value:",Pearson)