In [40]:
import streamlit as st
import numpy as np
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [4]:
# !pip install streamlit

In [32]:
def error_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    variance_score = r2_score(y_true, y_pred)
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Root Mean Squared Error: {rmse:.2f}")
    print(f"Mean Absolute Percentage Error: {mape:.2f}%")
    print(f"Variance Score: {variance_score:.2f}")

In [11]:
# Load the data 
data = pd.read_csv('SDGE_W.csv')  

In [12]:
# Creating the lag variables
for i in range(24):
    data['lag'+str(i+1)] = data['SDGE'].shift(i+1)

In [13]:
# Dropping unnecessary columns and setting date as index
data = data.drop('Unnamed: 0', axis=1)
data.set_index("Date", inplace =True)
data.index = pd.to_datetime(data.index)

In [14]:
data = data.dropna()

In [15]:
# Split the data into features (X) and target variable (y) 
X = data.drop('SDGE', axis=1) 
y = data['SDGE'] 

In [35]:
# Split the data into training and testing sets 
split_index = int(len(data)*0.8)
X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

In [17]:
cat_cols = ['year', 'month', 'hour', 'weekday', 'season', 'holiday']
num_cols = ['HourlyDryBulbTemperature', 'HourlyRelativeHumidity', 'HourlyWindSpeed'] 

In [20]:
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    
])
# Since we can't impute with mean we're using the constant imputing strategy to bfill, and the fill_value ensuring missing values are
#imputed with the next available value
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(add_indicator=True, strategy='constant', fill_value=None)),
    ('scaler', StandardScaler())
])

In [23]:
processes = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, cat_cols),
        ('num', numerical_transformer, num_cols)
    ], remainder='passthrough')

In [26]:
#Create pipeline
RandomForest = Pipeline([
    ('processes', processes),
    ('rf', RandomForestRegressor(n_estimators=10, max_features='sqrt', max_depth=4))
])

In [27]:
RandomForest.fit(X_train, y_train)



In [28]:
# Make predictions on the test set 
y_pred = RandomForest.predict(X_test)

In [41]:
# Calculate the evaluation metrics 
error_metrics(y_test, y_pred)

Mean Absolute Error: 216.79
Mean Squared Error: 84971.60
Root Mean Squared Error: 291.50
Mean Absolute Percentage Error: 10.15%
Variance Score: 0.67


In [44]:
# Create the Streamlit app 
st.title('Random Forest Regressor Model') 
st.write('error metric',error_metrics(y_test, y_pred))

Mean Absolute Error: 216.79
Mean Squared Error: 84971.60
Root Mean Squared Error: 291.50
Mean Absolute Percentage Error: 10.15%
Variance Score: 0.67
