# Solar Power Generation analysis

## Team Members
### 1.Ashish Chauhan
### 2.Akash Kumar Gaud
### 3.Anish Chauhan
### 4.Ashish Gautam
### 5.Aryan Shrikant Jadhao

# Step 1. Data importing and preprocessing

In [37]:
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [11]:
#imported the csv file

df=pd.read_csv("data/powerPlantDataBerkeley.csv")
df.head(10)

Unnamed: 0,Day of Year,Year,Month,Day,First Hour of Period,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Power Generated
0,245,2008,9,1,1,False,0.859897,69,28,7.5,0,10.0,75,8.0,29.82,0
1,245,2008,9,1,4,False,0.628535,69,28,7.5,0,10.0,77,5.0,29.85,0
2,245,2008,9,1,7,True,0.397172,69,28,7.5,0,10.0,70,0.0,29.89,5418
3,245,2008,9,1,10,True,0.16581,69,28,7.5,0,10.0,33,0.0,29.91,25477
4,245,2008,9,1,13,True,0.065553,69,28,7.5,0,10.0,21,3.0,29.89,30069
5,245,2008,9,1,16,True,0.296915,69,28,7.5,0,10.0,20,23.0,29.85,16280
6,245,2008,9,1,19,True,0.528278,69,28,7.5,0,10.0,36,15.0,29.83,515
7,245,2008,9,1,22,False,0.75964,69,28,7.5,0,10.0,49,6.0,29.86,0
8,246,2008,9,2,1,False,0.862113,72,29,6.8,0,10.0,67,6.0,29.86,0
9,246,2008,9,2,4,False,0.630155,72,29,6.8,0,10.0,49,0.0,29.87,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2920 entries, 0 to 2919
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Day of Year                           2920 non-null   int64  
 1   Year                                  2920 non-null   int64  
 2   Month                                 2920 non-null   int64  
 3   Day                                   2920 non-null   int64  
 4   First Hour of Period                  2920 non-null   int64  
 5   Is Daylight                           2920 non-null   bool   
 6   Distance to Solar Noon                2920 non-null   float64
 7   Average Temperature (Day)             2920 non-null   int64  
 8   Average Wind Direction (Day)          2920 non-null   int64  
 9   Average Wind Speed (Day)              2920 non-null   float64
 10  Sky Cover                             2920 non-null   int64  
 11  Visibility       

* As clearly visible the dataset is a time series dataset but I didn't sort it because it was already sorted. 
* Secondly there are no columns with object type values hence there was no need of converting any column to pandas category and assigning the codes back to column to convert object type column to numeric type

In [12]:
X=df.drop("Power Generated",axis=1)
y=df["Power Generated"]


In [13]:
'''Splitting data into train and test split of 8:2'''
train_size=round(0.7*len(df))
val_size=round(train_size+0.15*len(df))

'''
Splitting with train_test_split function won't work here as it shuffles the data before splitting but we avoid that in 
time series dataset
'''
X_train,y_train=X[:train_size],y[:train_size]
X_val,y_val=X[train_size:val_size],y[train_size:val_size]
X_test,y_test=X[val_size:],y[val_size:]



In [14]:
'''just checking if splitting is done correctly'''

X_train.shape,y_train.shape,X_val.shape,y_val.shape,X_test.shape,y_test.shape

((2044, 15), (2044,), (438, 15), (438,), (438, 15), (438,))

## Filling missing data in train,test and validation set

### Train set

In [15]:

X_train.isna().sum()

Day of Year                             0
Year                                    0
Month                                   0
Day                                     0
First Hour of Period                    0
Is Daylight                             0
Distance to Solar Noon                  0
Average Temperature (Day)               0
Average Wind Direction (Day)            0
Average Wind Speed (Day)                0
Sky Cover                               0
Visibility                              0
Relative Humidity                       0
Average Wind Speed (Period)             1
Average Barometric Pressure (Period)    0
dtype: int64

In [17]:
'''Filling average wind column's missing value with mean of the column'''

X_train["Average Wind Speed (Period)"].fillna(X_train["Average Wind Speed (Period)"].mean(),inplace=True)
X_train.isna().sum()

Day of Year                             0
Year                                    0
Month                                   0
Day                                     0
First Hour of Period                    0
Is Daylight                             0
Distance to Solar Noon                  0
Average Temperature (Day)               0
Average Wind Direction (Day)            0
Average Wind Speed (Day)                0
Sky Cover                               0
Visibility                              0
Relative Humidity                       0
Average Wind Speed (Period)             0
Average Barometric Pressure (Period)    0
dtype: int64

In [18]:
y_train.isna().sum()

0

### Validation set

In [19]:
X_val.isna().sum()

Day of Year                             0
Year                                    0
Month                                   0
Day                                     0
First Hour of Period                    0
Is Daylight                             0
Distance to Solar Noon                  0
Average Temperature (Day)               0
Average Wind Direction (Day)            0
Average Wind Speed (Day)                0
Sky Cover                               0
Visibility                              0
Relative Humidity                       0
Average Wind Speed (Period)             0
Average Barometric Pressure (Period)    0
dtype: int64

In [20]:
y_val.isna().sum()

0

### Test Set

In [21]:
X_test.isna().sum()

Day of Year                             0
Year                                    0
Month                                   0
Day                                     0
First Hour of Period                    0
Is Daylight                             0
Distance to Solar Noon                  0
Average Temperature (Day)               0
Average Wind Direction (Day)            0
Average Wind Speed (Day)                0
Sky Cover                               0
Visibility                              0
Relative Humidity                       0
Average Wind Speed (Period)             0
Average Barometric Pressure (Period)    0
dtype: int64

In [22]:
y_test.isna().sum()

0

# Step 2. Modelling & Evaluation

In [33]:
'''Custom evaluation function'''

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def model_score(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)  # Mean Absolute Error
    mse = mean_squared_error(y_true, y_pred)  # Mean Squared Error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    r2 = r2_score(y_true, y_pred)  # R-squared (Coefficient of Determination)
    
    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R-squared (R2):", r2)

In [35]:
'''Applying Linear Regression model to dataset'''
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

y_pred_lr=model.predict(X_val)
model_score(y_val,y_pred_lr)

Mean Absolute Error (MAE): 5360.079473889949
Mean Squared Error (MSE): 47638662.990993224
Root Mean Squared Error (RMSE): 6902.076715814829
R-squared (R2): 0.6725339752370296


In [36]:
'''Applying RandomForestRegressor to dataset'''
from sklearn.ensemble import RandomForestRegressor

model2 = RandomForestRegressor(n_estimators=100, random_state=42,n_jobs=-1)
model2.fit(X_train, y_train)
y_pred_rf=model2.predict(X_val)
model_score(y_val,y_pred_rf)


Mean Absolute Error (MAE): 2378.5115753424657
Mean Squared Error (MSE): 17880027.144984704
Root Mean Squared Error (RMSE): 4228.4781121562755
R-squared (R2): 0.87709349834337


# Hyper Parameter Tuning of RandomForestRegressor

### We will use Randomized Search CV to find best values of hyperparameters

#### Adjusting the following hyper parameters
* max_depth
* min_samples_leaf
* min_samples_split
* n_estimators 

In [45]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

grid = {
    "n_estimators": [10, 100, 200, 500, 1000, 1200],
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 4]
}

rf = RandomForestRegressor(n_jobs=-1)
rs = RandomizedSearchCV(estimator=rf, param_distributions=grid, n_iter=20, cv=5)

rs.fit(X_train,y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=-1), n_iter=20,
                   param_distributions={'max_depth': [None, 5, 10, 20, 30],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 4, 6],
                                        'n_estimators': [10, 100, 200, 500,
                                                         1000, 1200]})

In [48]:
rs.best_params_


{'n_estimators': 1200,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_depth': 30}

In [49]:
rs_y_pred=rs.predict(X_val)

In [50]:
model_score(y_val,rs_y_pred)

Mean Absolute Error (MAE): 2323.804721939806
Mean Squared Error (MSE): 17260496.339423098
Root Mean Squared Error (RMSE): 4154.575350071665
R-squared (R2): 0.8813521252102453
