In this notebook, I have tried to solve the problem statement of the <a href='https://machinehack.com/hackathon/wipro_sustainability_machine_learning_challenge/overview'>Wipro Hiring Hackathon</a> using a few different methods. <br>
Models used: 
* Decision Tree Regressor
* Random Forest Regressor
* AdaBoost Regressor
* GradientBoost Regressor 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Train set Analysis and transformation
Importing the training set

In [2]:
df = pd.read_csv(r"C:\Users\ariha\Desktop\Hackathons & Projects\Wipro Hackathon\Data + Sample Submission\train.csv")
df.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag
0,2009,1,1,0,0,0,0,0,0,0.0,5.0,1010,75.34,106.15,0.499,346.1,3.1,0
1,2009,1,1,0,30,0,0,0,0,1.0,5.0,1010,80.81,112.28,0.49,346.1,3.1,0
2,2009,1,1,1,0,0,0,0,4,0.0,5.0,1010,78.27,118.5,0.482,347.9,3.2,0
3,2009,1,1,1,30,0,0,0,4,0.0,4.0,1010,78.27,124.78,0.478,347.9,3.1,0
4,2009,1,1,2,0,0,0,0,4,0.0,4.0,1010,76.45,131.12,0.475,350.0,3.0,0


In [3]:
df.shape

(175296, 18)

Creating a single Date-Time column instead of Year, Month, etc being represented individually 

In [4]:
df['Date-Time'] = pd.to_datetime(df[['Year', 'Month', 'Day', 'Hour', 'Minute']])
df.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag,Date-Time
0,2009,1,1,0,0,0,0,0,0,0.0,5.0,1010,75.34,106.15,0.499,346.1,3.1,0,2009-01-01 00:00:00
1,2009,1,1,0,30,0,0,0,0,1.0,5.0,1010,80.81,112.28,0.49,346.1,3.1,0,2009-01-01 00:30:00
2,2009,1,1,1,0,0,0,0,4,0.0,5.0,1010,78.27,118.5,0.482,347.9,3.2,0,2009-01-01 01:00:00
3,2009,1,1,1,30,0,0,0,4,0.0,4.0,1010,78.27,124.78,0.478,347.9,3.1,0,2009-01-01 01:30:00
4,2009,1,1,2,0,0,0,0,4,0.0,4.0,1010,76.45,131.12,0.475,350.0,3.0,0,2009-01-01 02:00:00


We do not require these columns now

In [5]:
df.drop(columns=['Year', 'Month', 'Day', 'Hour', 'Minute'], axis=1, inplace=True)

Now we will check the correlation between all the different variables with themselves and with the values that we have to predict as well - Clearsky DHI, Clearsky DNI, Clearsky GHI

In [6]:
corr_matrix = df.corr()
corr_matrix

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag
Clearsky DHI,1.0,0.860988,0.912991,0.022479,0.222931,0.503409,-0.093766,-0.559766,-0.873421,0.207504,-0.000195,0.315655,0.00074
Clearsky DNI,0.860988,1.0,0.927926,-0.073302,0.033601,0.384353,0.007043,-0.680471,-0.886323,-0.00218,0.006256,0.381162,0.006318
Clearsky GHI,0.912991,0.927926,1.0,-0.047297,0.14042,0.478273,-0.050556,-0.655765,-0.891033,0.108725,0.004944,0.34918,-0.030297
Cloud Type,0.022479,-0.073302,-0.047297,1.0,0.202096,0.066777,-0.137158,0.235374,0.026873,0.326904,-0.081565,0.072632,0.090161
Dew Point,0.222931,0.033601,0.14042,0.202096,1.0,0.848948,-0.419534,0.193455,-0.222336,0.881881,-0.076357,-0.140699,0.036851
Temperature,0.503409,0.384353,0.478273,0.066777,0.848948,1.0,-0.371395,-0.341914,-0.526548,0.742449,-0.066673,-0.046694,0.015734
Pressure,-0.093766,0.007043,-0.050556,-0.137158,-0.419534,-0.371395,1.0,-0.085004,0.0789,-0.365342,-0.182006,-0.122674,0.052521
Relative Humidity,-0.559766,-0.680471,-0.655765,0.235374,0.193455,-0.341914,-0.085004,1.0,0.607108,0.182658,-0.019108,-0.172248,0.023674
Solar Zenith Angle,-0.873421,-0.886323,-0.891033,0.026873,-0.222336,-0.526548,0.0789,0.607108,1.0,-0.182529,0.003802,-0.295927,-0.02915
Precipitable Water,0.207504,-0.00218,0.108725,0.326904,0.881881,0.742449,-0.365342,0.182658,-0.182529,1.0,-0.105735,-0.106892,0.044106


Converting it into an upper triangular matrix so that it is faster and easier to loop through

In [7]:
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag
Clearsky DHI,,0.860988,0.912991,0.022479,0.222931,0.503409,-0.093766,-0.559766,-0.873421,0.207504,-0.000195,0.315655,0.00074
Clearsky DNI,,,0.927926,-0.073302,0.033601,0.384353,0.007043,-0.680471,-0.886323,-0.00218,0.006256,0.381162,0.006318
Clearsky GHI,,,,-0.047297,0.14042,0.478273,-0.050556,-0.655765,-0.891033,0.108725,0.004944,0.34918,-0.030297
Cloud Type,,,,,0.202096,0.066777,-0.137158,0.235374,0.026873,0.326904,-0.081565,0.072632,0.090161
Dew Point,,,,,,0.848948,-0.419534,0.193455,-0.222336,0.881881,-0.076357,-0.140699,0.036851
Temperature,,,,,,,-0.371395,-0.341914,-0.526548,0.742449,-0.066673,-0.046694,0.015734
Pressure,,,,,,,,-0.085004,0.0789,-0.365342,-0.182006,-0.122674,0.052521
Relative Humidity,,,,,,,,,0.607108,0.182658,-0.019108,-0.172248,0.023674
Solar Zenith Angle,,,,,,,,,,-0.182529,0.003802,-0.295927,-0.02915
Precipitable Water,,,,,,,,,,,-0.105735,-0.106892,0.044106


Selecting all columns and creating a list where correlation factor > 0.75 which shows that columns are highly correlated and hence we dont require them

In [8]:
columns_to_drop = []
for cols in upper.columns:
    if any (upper[cols] > 0.75):
        columns_to_drop.append(cols)
columns_to_drop

['Clearsky DNI', 'Clearsky GHI', 'Temperature', 'Precipitable Water']

Removing 'Clearsky DNI' and 'Clearsky GHI' since those are columns we have to predict

In [9]:
columns_to_drop.pop(0)
columns_to_drop.pop(0)
columns_to_drop

['Temperature', 'Precipitable Water']

Dropping the columns we found that had high correlation

In [10]:
df.drop(columns_to_drop, axis=1, inplace=True)
df.head()

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Pressure,Relative Humidity,Solar Zenith Angle,Wind Direction,Wind Speed,Fill Flag,Date-Time
0,0,0,0,0,0.0,1010,75.34,106.15,346.1,3.1,0,2009-01-01 00:00:00
1,0,0,0,0,1.0,1010,80.81,112.28,346.1,3.1,0,2009-01-01 00:30:00
2,0,0,0,4,0.0,1010,78.27,118.5,347.9,3.2,0,2009-01-01 01:00:00
3,0,0,0,4,0.0,1010,78.27,124.78,347.9,3.1,0,2009-01-01 01:30:00
4,0,0,0,4,0.0,1010,76.45,131.12,350.0,3.0,0,2009-01-01 02:00:00


Updating the correlation matrix after dropping the columns

In [11]:
corr_matrix = df.corr()
corr_matrix

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Pressure,Relative Humidity,Solar Zenith Angle,Wind Direction,Wind Speed,Fill Flag
Clearsky DHI,1.0,0.860988,0.912991,0.022479,0.222931,-0.093766,-0.559766,-0.873421,-0.000195,0.315655,0.00074
Clearsky DNI,0.860988,1.0,0.927926,-0.073302,0.033601,0.007043,-0.680471,-0.886323,0.006256,0.381162,0.006318
Clearsky GHI,0.912991,0.927926,1.0,-0.047297,0.14042,-0.050556,-0.655765,-0.891033,0.004944,0.34918,-0.030297
Cloud Type,0.022479,-0.073302,-0.047297,1.0,0.202096,-0.137158,0.235374,0.026873,-0.081565,0.072632,0.090161
Dew Point,0.222931,0.033601,0.14042,0.202096,1.0,-0.419534,0.193455,-0.222336,-0.076357,-0.140699,0.036851
Pressure,-0.093766,0.007043,-0.050556,-0.137158,-0.419534,1.0,-0.085004,0.0789,-0.182006,-0.122674,0.052521
Relative Humidity,-0.559766,-0.680471,-0.655765,0.235374,0.193455,-0.085004,1.0,0.607108,-0.019108,-0.172248,0.023674
Solar Zenith Angle,-0.873421,-0.886323,-0.891033,0.026873,-0.222336,0.0789,0.607108,1.0,0.003802,-0.295927,-0.02915
Wind Direction,-0.000195,0.006256,0.004944,-0.081565,-0.076357,-0.182006,-0.019108,0.003802,1.0,0.035634,-0.017124
Wind Speed,0.315655,0.381162,0.34918,0.072632,-0.140699,-0.122674,-0.172248,-0.295927,0.035634,1.0,0.000447


In this matrix, we can see that 'Pressure', 'Wind Direction', 'Fill Flag' all 3 have almost 0 correlation to the values that we want to predict. <br>
Hence we can remove those columns as well as it will not affect our final predictions

In [12]:
df.drop(columns=['Pressure', 'Wind Direction', 'Fill Flag'], axis=1, inplace=True)

In [13]:
df.head()

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Relative Humidity,Solar Zenith Angle,Wind Speed,Date-Time
0,0,0,0,0,0.0,75.34,106.15,3.1,2009-01-01 00:00:00
1,0,0,0,0,1.0,80.81,112.28,3.1,2009-01-01 00:30:00
2,0,0,0,4,0.0,78.27,118.5,3.2,2009-01-01 01:00:00
3,0,0,0,4,0.0,78.27,124.78,3.1,2009-01-01 01:30:00
4,0,0,0,4,0.0,76.45,131.12,3.0,2009-01-01 02:00:00


Now, after looking at the information about the data via the <a href="https://machinehack.com/hackathon/wipro_sustainability_machine_learning_challenge/data">MachineHack</a> website, we get to know that Cloud Type is a categorical variable and not a conitnuous variable. <br> 
So, we apply one-hot encoding to convert it into continuous variables

In [14]:
df['Cloud Type'].unique()

array([ 0,  4,  1,  7,  8,  3,  2,  6,  9, 10], dtype=int64)

Pandas has a built-in function called pd.get_dummies that performs one-hot encoding. You can learn more about it through it's <a href='https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html'>documentation</a>.

In [15]:
df = pd.get_dummies(df, columns=['Cloud Type'])
df.head()

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI,Dew Point,Relative Humidity,Solar Zenith Angle,Wind Speed,Date-Time,Cloud Type_0,Cloud Type_1,Cloud Type_2,Cloud Type_3,Cloud Type_4,Cloud Type_6,Cloud Type_7,Cloud Type_8,Cloud Type_9,Cloud Type_10
0,0,0,0,0.0,75.34,106.15,3.1,2009-01-01 00:00:00,1,0,0,0,0,0,0,0,0,0
1,0,0,0,1.0,80.81,112.28,3.1,2009-01-01 00:30:00,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0.0,78.27,118.5,3.2,2009-01-01 01:00:00,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0.0,78.27,124.78,3.1,2009-01-01 01:30:00,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0.0,76.45,131.12,3.0,2009-01-01 02:00:00,0,0,0,0,1,0,0,0,0,0


Now, we will just move the date column to the start, so it is easily readable

In [16]:
columns_list = []
for cols in df.columns:
    columns_list.append(cols)
columns_list

['Clearsky DHI',
 'Clearsky DNI',
 'Clearsky GHI',
 'Dew Point',
 'Relative Humidity',
 'Solar Zenith Angle',
 'Wind Speed',
 'Date-Time',
 'Cloud Type_0',
 'Cloud Type_1',
 'Cloud Type_2',
 'Cloud Type_3',
 'Cloud Type_4',
 'Cloud Type_6',
 'Cloud Type_7',
 'Cloud Type_8',
 'Cloud Type_9',
 'Cloud Type_10']

In [17]:
columns_list = columns_list[7:8] + columns_list[:7] + columns_list[8:]
columns_list

['Date-Time',
 'Clearsky DHI',
 'Clearsky DNI',
 'Clearsky GHI',
 'Dew Point',
 'Relative Humidity',
 'Solar Zenith Angle',
 'Wind Speed',
 'Cloud Type_0',
 'Cloud Type_1',
 'Cloud Type_2',
 'Cloud Type_3',
 'Cloud Type_4',
 'Cloud Type_6',
 'Cloud Type_7',
 'Cloud Type_8',
 'Cloud Type_9',
 'Cloud Type_10']

In [18]:
df = df[columns_list]
df.head()

Unnamed: 0,Date-Time,Clearsky DHI,Clearsky DNI,Clearsky GHI,Dew Point,Relative Humidity,Solar Zenith Angle,Wind Speed,Cloud Type_0,Cloud Type_1,Cloud Type_2,Cloud Type_3,Cloud Type_4,Cloud Type_6,Cloud Type_7,Cloud Type_8,Cloud Type_9,Cloud Type_10
0,2009-01-01 00:00:00,0,0,0,0.0,75.34,106.15,3.1,1,0,0,0,0,0,0,0,0,0
1,2009-01-01 00:30:00,0,0,0,1.0,80.81,112.28,3.1,1,0,0,0,0,0,0,0,0,0
2,2009-01-01 01:00:00,0,0,0,0.0,78.27,118.5,3.2,0,0,0,0,1,0,0,0,0,0
3,2009-01-01 01:30:00,0,0,0,0.0,78.27,124.78,3.1,0,0,0,0,1,0,0,0,0,0
4,2009-01-01 02:00:00,0,0,0,0.0,76.45,131.12,3.0,0,0,0,0,1,0,0,0,0,0


We have completed the EDA and cleaning and this is what our final dataset looks like.

# Splitting the final pre-processed data into train and validation sets

Converting cleaned dataframe into x and y variables first and then into training and validations sets

In [19]:
x = df.iloc[:, 4:].copy()

In [20]:
x.head()

Unnamed: 0,Dew Point,Relative Humidity,Solar Zenith Angle,Wind Speed,Cloud Type_0,Cloud Type_1,Cloud Type_2,Cloud Type_3,Cloud Type_4,Cloud Type_6,Cloud Type_7,Cloud Type_8,Cloud Type_9,Cloud Type_10
0,0.0,75.34,106.15,3.1,1,0,0,0,0,0,0,0,0,0
1,1.0,80.81,112.28,3.1,1,0,0,0,0,0,0,0,0,0
2,0.0,78.27,118.5,3.2,0,0,0,0,1,0,0,0,0,0
3,0.0,78.27,124.78,3.1,0,0,0,0,1,0,0,0,0,0
4,0.0,76.45,131.12,3.0,0,0,0,0,1,0,0,0,0,0


In [21]:
y = df.iloc[:, 1:4].copy()

In [22]:
y.head()

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


Since this is a time-series data, we cannot randomly split the data into training and testing sets. <br>
And since this is a fairly large data set with 175296 rows, we will keep the first 90% data for training and the final 10% for validation

In [23]:
x_train = x.iloc[:int(len(x) * 0.90), :]
x_val = x.iloc[int(len(x) * 0.90):, :]

y_train = y.iloc[:int(len(y) * 0.90), :]
y_val = y.iloc[int(len(y) * 0.90):, :]

In [24]:
x_train.shape

(157766, 14)

In [25]:
x_val.shape

(17530, 14)

In [26]:
y_train.shape

(157766, 3)

In [27]:
y_val.shape

(17530, 3)

# Creating Regression Tree

In [28]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [29]:
regressionTree = DecisionTreeRegressor(max_depth=20, max_leaf_nodes=20000, max_features=10)
regressionTree = regressionTree.fit(x_train, y_train)

In [30]:
y_pred_val_decisionTree = regressionTree.predict(x_val)

In [31]:
y_pred_val_decisionTree = pd.DataFrame(y_pred_val_decisionTree, columns=y_val.columns)
y_pred_val_decisionTree.head()

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI
0,80.8,907.0,557.4
1,81.25,864.0,499.75
2,75.333333,848.666667,449.555556
3,89.75,689.5,346.75
4,71.0,649.0,258.0


In [32]:
y_pred_val_decisionTree = y_pred_val_decisionTree.astype('int')

In [33]:
for col in y_pred_val_decisionTree:
    for i in range(len(y_pred_val_decisionTree[col])):
        if y_pred_val_decisionTree.loc[i, col] < 0:
            y_pred_val_decisionTree.loc[i, col] = 0

In [34]:
y_pred_val_decisionTree.head()

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI
0,80,907,557
1,81,864,499
2,75,848,449
3,89,689,346
4,71,649,258


In [35]:
mse_decisionTree = mean_squared_error(y_val, y_pred_val_decisionTree)

# Regression using Random Forest

In [36]:
from sklearn.ensemble import RandomForestRegressor
randomForest = RandomForestRegressor()

In [37]:
randomForest = randomForest.fit(x_train, y_train)

In [38]:
y_pred_val_forest = randomForest.predict(x_val)

In [39]:
y_pred_val_forest = pd.DataFrame(y_pred_val_forest, columns=y_val.columns)
y_pred_val_forest.head()

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI
0,94.56,859.92,548.54
1,109.11,757.35,477.48
2,86.17,793.39,428.34
3,82.45,718.81,342.23
4,65.05,700.42,268.04


In [40]:
y_pred_val_forest = y_pred_val_forest.astype('int')

In [41]:
for col in y_pred_val_forest:
    for i in range(len(y_pred_val_forest[col])):
        if y_pred_val_forest.loc[i, col] < 0:
            y_pred_val_forest.loc[i, col] = 0

In [42]:
mse_randomForest = mean_squared_error(y_val, y_pred_val_forest)
mse_randomForest

1121.620498193573

# Regression using AdaBoost

In [43]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import AdaBoostRegressor
adaBoost = MultiOutputRegressor(AdaBoostRegressor())

In [44]:
adaBoost = adaBoost.fit(x_train, y_train)

In [45]:
y_pred_val_adaboost = adaBoost.predict(x_val)

In [46]:
y_pred_val_adaboost = pd.DataFrame(y_pred_val_adaboost, columns=y_val.columns)

In [47]:
y_pred_val_adaboost = y_pred_val_adaboost.astype('int')

In [48]:
for col in y_pred_val_adaboost:
    for i in range(len(y_pred_val_adaboost[col])):
        if y_pred_val_adaboost.loc[i, col] < 0:
            y_pred_val_adaboost.loc[i, col] = 0

In [49]:
y_pred_val_adaboost.head()

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI
0,134,754,533
1,134,691,484
2,118,690,400
3,103,609,324
4,87,606,247


In [50]:
mse_adaBoost = mean_squared_error(y_val, y_pred_val_adaboost)
mse_adaBoost

2753.6113709830765

# Regression using Gradient Boost

In [2]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
gradientBoost = MultiOutputRegressor(GradientBoostingRegressor())

In [52]:
gradientBoost = gradientBoost.fit(x_train, y_train)

In [53]:
y_pred_val_gradientboost = gradientBoost.predict(x_val)

In [54]:
y_pred_val_gradientboost = pd.DataFrame(y_pred_val_gradientboost, columns=y_val.columns)

In [55]:
y_pred_val_gradientboost = y_pred_val_gradientboost.astype('int')

In [56]:
for col in y_pred_val_gradientboost:
    for i in range(len(y_pred_val_gradientboost[col])):
        if y_pred_val_gradientboost.loc[i, col] < 0:
            y_pred_val_gradientboost.loc[i, col] = 0

In [57]:
mse_gradientBoost = mean_squared_error(y_val, y_pred_val_gradientboost)
mse_gradientBoost

997.3773340939342

# Comparing the MSE scores for all the different models tried

In [58]:
scores = []
scores.append(['Simple Decision Tree' ,mse_decisionTree])
scores.append(['Random Forest' ,mse_randomForest])
scores.append(['AdaBoost' ,mse_adaBoost])
scores.append(['GradientBoost' ,mse_gradientBoost])

In [59]:
scores = pd.DataFrame(scores, columns=['Model', 'MSE Score'])

In [60]:
scores.head()

Unnamed: 0,Model,MSE Score
0,Simple Decision Tree,1840.508842
1,Random Forest,1121.620498
2,AdaBoost,2753.611371
3,GradientBoost,997.377334
