In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the combined dataset from its repository-relative CSV location and
# preview the first five rows to confirm the columns parsed as expected.
df = pd.read_csv(filepath_or_buffer='Data/Real-Data/Real_Combine.csv')
df.head(n=5)

In [None]:
# Draw the boolean missing-value mask as a heatmap so that any gaps in the
# table show up as contrasting cells; row labels and the colour bar are hidden.
sns.heatmap(
    df.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Separate predictors from the target: every column except the last one is an
# independent feature (X); the final column is the dependent variable (y).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Sanity check after dropna(): report remaining nulls per feature column and
# the total for the target.
print(f'Null Value in X is \n{X.isnull().sum()}')
print(f'Null Value in y is \n{y.isnull().sum()}')

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for every column of the cleaned frame.
sns.pairplot(data=df)

In [None]:
# Pairwise Pearson correlation between all columns of the cleaned frame.
corr_mat = df.corr(method='pearson')

In [None]:
# Large canvas so the annotated coefficients stay legible; the diverging
# palette distinguishes positive from negative correlations.
plt.figure(figsize=(20, 20))
sns.heatmap(
    corr_mat,
    annot=True,
    cmap='coolwarm',
)


### Feature Importance
Using Extra Trees Regressor

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Fit an extra-trees ensemble on the full feature matrix; it is used below
# only to rank features by importance.
# random_state pins the randomised tree construction so that the importance
# ranking is identical on every Restart & Run All.
model = ExtraTreesRegressor(random_state=0)
model.fit(X, y)

In [None]:
# Label each importance score with its feature name, then show the five
# highest-scoring features as a horizontal bar chart.
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)
feat_importances.nlargest(n=5).plot.barh()
plt.show()

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

In [None]:
# Distribution of the target variable.
# NOTE(review): the earlier note called this right-skewed yet said most values
# lie on the right. A right-skewed distribution has its bulk at the low (left)
# end with a long tail toward high values -- confirm against the plot.
# NOTE(review): sns.distplot is reported as deprecated in newer seaborn
# releases -- confirm for the installed version before replacing it.
sns.distplot(y)

#### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the partition
# stable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# (rows, columns) of the held-out test feature matrix.
X_test.shape

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest with default hyper-parameters, trained on the
# training partition only.
# random_state fixes the bootstrap sampling and feature sub-sampling so the
# scores reported below are reproducible across kernel restarts.
regressor = RandomForestRegressor(random_state=0)
regressor.fit(X_train, y_train)

In [None]:
# Coefficient of determination on the data the model was fitted on.
print(f'R^2 score in Training Set {regressor.score(X_train, y_train)}')

In [None]:
# Coefficient of determination on the held-out test partition.
print(f'R^2 score is Test Set {regressor.score(X_test, y_test)}')

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the baseline estimator over the complete data
# set (X, y); one score is returned per fold.
score = cross_val_score(estimator=regressor, X=X, y=y, cv=5)

In [None]:
# Average of the per-fold cross-validation scores.
np.mean(score)

#### Model Evaluation

In [None]:
# Predictions of the baseline forest on the held-out features.
prediction = regressor.predict(X=X_test)

In [None]:
# Distribution of the residuals (actual minus predicted) for the baseline
# model on the test partition.
# NOTE(review): sns.distplot is reported as deprecated in newer seaborn
# releases -- confirm for the installed version before replacing it.
sns.distplot(y_test - prediction)

In [None]:
# Actual test targets (x-axis) against baseline predictions (y-axis).
plt.scatter(x=y_test, y=prediction)

#### Hyperparameter Tuning Random Forest Regressor

In [None]:
# Candidate values for the randomised hyper-parameter search (RandomizedSearchCV).

# Number of trees in the random forest: 100, 200, ..., 1200.
n_estimators = list(range(100, 1201, 100))
# Number of features to consider at every split.
# 1.0 means "all features", which is what the removed 'auto' option meant for
# forest regressors; newer scikit-learn releases reject 'auto' outright.
max_features = [1.0, 'sqrt']
# Maximum number of levels in each tree: 5, 10, ..., 30.
max_depth = list(range(5, 31, 5))
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Bundle the candidate lists into the parameter-distribution mapping handed
# to the randomised search, then echo it for the record.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Base estimator cloned by the randomised search for every candidate.
# Seeding the forest itself (not only the search's parameter sampler) makes
# the cross-validated scores, and therefore the selected parameters,
# reproducible.
rf = RandomForestRegressor(random_state=45)

In [None]:
# Randomised search: 100 sampled parameter combinations, each scored with
# 5-fold cross-validation on negative mean squared error, using all cores.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=45,
)

In [None]:
random_search.fit(X_train, y_train)
%time

In [None]:
# Parameter combination that achieved the best cross-validated score.
random_search.best_params_

In [None]:
# Predictions on the held-out features from the fitted search object.
predictions = random_search.predict(X=X_test)

In [None]:
# Distribution of the residuals (actual minus predicted) for the tuned model
# on the test partition.
# NOTE(review): sns.distplot is reported as deprecated in newer seaborn
# releases -- confirm for the installed version before replacing it.
sns.distplot(y_test - predictions)


#### Regression Evaluation Metrics
Here are three common evaluation metrics for regression problems:

Mean Absolute Error (MAE) is the mean of the absolute value of the errors:

$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$
Mean Squared Error (MSE) is the mean of the squared errors:

$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$
Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:

$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$
Comparing these metrics:

MAE is the easiest to understand, because it's the average error.
MSE is more popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world.
RMSE is even more popular than MSE, because RMSE is interpretable in the "y" units.
All of these are loss functions, because we want to minimize them.

In [None]:
from sklearn import metrics

In [None]:
# Error metrics of the tuned model on the held-out test partition.
# MSE is computed once and reused for RMSE instead of being recomputed.
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
import pickle
import os

In [None]:
# Make sure the output directory exists and switch into it so the model file
# is written there.
# The guard keeps the cell idempotent: without it, re-running the cell from
# inside "Models" would create a nested Models/Models directory and move the
# working directory a second time.
# exist_ok=True replaces the separate existence check, which could race with
# another process creating the directory.
if os.path.basename(os.getcwd()) != "Models":
    os.makedirs("Models", exist_ok=True)
    os.chdir("Models")

In [None]:
# Serialise the fitted search object to disk (relative to the current working
# directory). The context manager guarantees the handle is flushed and closed
# even if pickling fails, so the file is never left open or truncated.
# NOTE(review): the file name says "decision" although the estimator is a
# random-forest search; kept unchanged in case other code loads it by this name.
with open('decision_regression_model.pkl', 'wb') as file:
    pickle.dump(random_search, file)