In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw dataset; the relative path is resolved against the working directory.
data = pd.read_excel(io="Na2SO4.xlsx")

In [None]:
# Feature matrix: every column except the regression target.
X = data.drop(columns=['Sodium Sulphate'])

In [None]:
# Impute missing feature values with each column's mean in one vectorised call.
# numeric_only=True restricts the means to numeric columns; the previous
# per-column loop called .mean() on every column and could fail on text data.
# NOTE(review): the means are computed over the full dataset, before the
# train/test split performed later, so test rows influence the values imputed
# into training rows. Consider imputing after the split with training means.
X = X.fillna(X.mean(numeric_only=True))

In [None]:
# Regression target: the single column that was excluded from the features.
Y = data.loc[:, 'Sodium Sulphate']

In [None]:
# Sanity check: display the dimensions of the target before splitting.
Y.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=4,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn the [0, 1] scaling from the training features only, then apply that
# same fitted mapping to both splits.
mm = MinMaxScaler(feature_range=(0, 1)).fit(x_train)
x_train, x_test = mm.transform(x_train), mm.transform(x_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import random

In [None]:
# Random-forest regressor: 200 trees, depth capped at 9, seeded for repeatability.
rfc1 = RandomForestRegressor(
    random_state=7,
    max_depth=9,
    n_estimators=200,
)

In [None]:
# Train the forest on the scaled training split.
rfc1.fit(X=x_train, y=y_train)

In [None]:
# Predictions for the held-out rows.
y_pred_test = rfc1.predict(X=x_test)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Predictions for the rows the model was trained on (used to gauge overfitting).
y_pred_train = rfc1.predict(X=x_train)

In [None]:
# Coefficient of determination on the training split (data the model has seen).
# The label names the split so this output cannot be confused with the test score.
train_R = r2_score(y_train, y_pred_train)
print("Train R^2 score:", train_R)

In [None]:
# Coefficient of determination on the held-out split (data the model has not seen).
# The label names the split so this output cannot be confused with the training score.
test_R = r2_score(y_test, y_pred_test)
print("Test R^2 score:", test_R)

In [None]:
import matplotlib.pyplot as plt
# Set the default figure size once for the notebook. A bare plt.figure() in its
# own cell only renders an empty canvas under the inline backend, and that
# figure is closed when the cell finishes, so later plotting cells never
# inherited the 10x6 size.
plt.rcParams["figure.figsize"] = (10, 6)

In [None]:
# Predicted vs. actual target on the training split.
# The figure is created inside this cell so the requested size applies to it.
plt.figure(figsize=(10, 6))
plt.scatter(y_train, y_pred_train, label='Training Data', s=50, c='blue', alpha=0.7)
# Perfect-prediction reference (y = x); the two extreme values are enough to draw it.
diag = [y_train.min(), y_train.max()]
plt.plot(diag, diag, label='Actual Y - Training', c='black', linestyle='--')
plt.grid(True, linestyle='--', alpha=0.7)
plt.xlabel('Actual Y')
plt.ylabel('Predicted Y')
plt.title('Predicted vs Actual - Training Data')
# The label= arguments above are only displayed once a legend is requested.
plt.legend()
plt.show()

In [None]:
# Predicted vs. actual target on the held-out split.
# The figure is created inside this cell so the requested size applies to it.
plt.figure(figsize=(10, 6))
# These points are test-set predictions, so they are labelled as such.
plt.scatter(y_test, y_pred_test, label='Test Data', s=50, c='blue', alpha=0.7)
# Perfect-prediction reference (y = x), spanning the range of the test targets
# rather than the training targets.
diag = [y_test.min(), y_test.max()]
plt.plot(diag, diag, label='Actual Y - Test', c='black', linestyle='--')
plt.grid(True, linestyle='--', alpha=0.7)
plt.xlabel('Actual Y')
plt.ylabel('Predicted Y')
plt.title('Predicted vs Actual - Test Data')
# The label= arguments above are only displayed once a legend is requested.
plt.legend()
plt.show()

In [None]:
# Importance score of each feature in the fitted forest, as a list for display.
random_forest_importance1 = [score for score in rfc1.feature_importances_]
random_forest_importance1

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
importances = rfc1.feature_importances_
sorted_idx = importances.argsort()

plt.figure(figsize=(10, 6))
plt.barh(y=X.columns[sorted_idx], width=importances[sorted_idx])
plt.title('Feature Importances for Random Forest Regression Model')
plt.ylabel('Feature')
plt.xlabel('Relative Importance')
plt.tight_layout()
plt.show()

In [None]:
# No extra install is needed for tree plotting: plot_tree comes from
# sklearn.tree, which is part of the scikit-learn package this notebook already
# uses. The PyPI package named "tree" is not scikit-learn's tree module, so the
# unpinned shell install that used to live in this cell has been removed.

In [None]:
# Column labels of the feature matrix, as a plain list for the tree plot.
feature_names = X.columns.to_list()

In [None]:
from sklearn.tree import plot_tree
# Position of the forest member to draw.
tree_to_visualize = 0

# Render that single decision tree with the original feature labels.
plt.figure(figsize=(20, 10))
plot_tree(
    decision_tree=rfc1.estimators_[tree_to_visualize],
    feature_names=feature_names,
    rounded=True,
    filled=True,
)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def calculate_rmse_mae(y_true, y_pred):
    """Return the RMSE and MAE between true and predicted values.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(rmse, mae)`` computed with scikit-learn's metric functions.
    """
    # Take the square root of the MSE explicitly instead of passing
    # ``squared=False``: that keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6, so the old call fails on current releases. For a
    # single-target ``y`` the value is the same as before.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, mae


# Report errors for both splits. Training-only numbers describe data the model
# has already seen, so the held-out figures are shown alongside them.
rmse, mae = calculate_rmse_mae(y_train, y_pred_train)
rmse_test, mae_test = calculate_rmse_mae(y_test, y_pred_test)

print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Test RMSE:", rmse_test)
print("Test MAE:", mae_test)

In [None]:
# Residuals on the held-out split: actual value minus model prediction.
y_pred = rfc1.predict(X=x_test)
error = y_test - y_pred

# Distribution of those residuals across 30 bins.
plt.hist(x=error, bins=30, edgecolor='black')
plt.title('Histogram of Prediction Error')
plt.ylabel('Frequency')
plt.xlabel('Prediction Error')
plt.grid(True)
plt.show()