In [1]:
#!/usr/bin/env python

# make sure to install these packages before running:
!pip install sodapy



In [2]:
#import the libraries 
import numpy as np
import pandas as pd
from pandas import DataFrame as df, Series as se
#import the library for the API
from sodapy import Socrata

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.melbourne.vic.gov.au", None)


#getting the pedestrian count data from city of melbourne website from 2009-05-01 to 
#2020-10-31
# results = client.get("b2ak-trbp", limit=3391522)

#update to 2020-12-31
results = client.get("b2ak-trbp", limit=3482938)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)




In [4]:
results_df

Unnamed: 0,id,date_time,year,month,mdate,day,time,sensor_id,sensor_name,hourly_counts
0,2887628,2019-11-01T17:00:00.000,2019,November,1,Friday,17,34,Flinders St-Spark La,300
1,2887629,2019-11-01T17:00:00.000,2019,November,1,Friday,17,39,Alfred Place,604
2,2887630,2019-11-01T17:00:00.000,2019,November,1,Friday,17,37,Lygon St (East),216
3,2887631,2019-11-01T17:00:00.000,2019,November,1,Friday,17,40,Lonsdale St-Spring St (West),627
4,2887632,2019-11-01T17:00:00.000,2019,November,1,Friday,17,36,Queen St (West),774
...,...,...,...,...,...,...,...,...,...,...
3482933,3482934,2020-12-31T23:00:00.000,2020,December,31,Thursday,23,68,Flinders Ln -Degraves St (North),0
3482934,3482935,2020-12-31T23:00:00.000,2020,December,31,Thursday,23,69,Flinders Ln -Degraves St (Crossing),0
3482935,3482936,2020-12-31T23:00:00.000,2020,December,31,Thursday,23,70,Errol Street (East),7
3482936,3482937,2020-12-31T23:00:00.000,2020,December,31,Thursday,23,71,Westwood Place,22


In [5]:
#Examining the data types of the features of the dataset  
results_df.dtypes

id               object
date_time        object
year             object
month            object
mdate            object
day              object
time             object
sensor_id        object
sensor_name      object
hourly_counts    object
dtype: object

In [6]:
results_df['date'] = results_df['mdate'] + '-' + results_df['month'] + '-' + results_df['year']
results_df.drop(columns=['id',	'date_time','year',	'month',	'mdate',	'day',	'time' ], inplace = True)

In [7]:
#converting 'sensor_id' and 'hourly_counts' features from object datatypes to int
results_df[['sensor_id','hourly_counts']] = results_df[['sensor_id','hourly_counts']].astype('int')
results_df

Unnamed: 0,sensor_id,sensor_name,hourly_counts,date
0,34,Flinders St-Spark La,300,1-November-2019
1,39,Alfred Place,604,1-November-2019
2,37,Lygon St (East),216,1-November-2019
3,40,Lonsdale St-Spring St (West),627,1-November-2019
4,36,Queen St (West),774,1-November-2019
...,...,...,...,...
3482933,68,Flinders Ln -Degraves St (North),0,31-December-2020
3482934,69,Flinders Ln -Degraves St (Crossing),0,31-December-2020
3482935,70,Errol Street (East),7,31-December-2020
3482936,71,Westwood Place,22,31-December-2020


In [8]:
#Examining the features after data type conversion
results_df.dtypes

sensor_id         int64
sensor_name      object
hourly_counts     int64
date             object
dtype: object

In [9]:
#converting date_time feature from object datatypes to date
results_df['date'] = results_df['date'].astype('datetime64[ns]')
results_df

Unnamed: 0,sensor_id,sensor_name,hourly_counts,date
0,34,Flinders St-Spark La,300,2019-11-01
1,39,Alfred Place,604,2019-11-01
2,37,Lygon St (East),216,2019-11-01
3,40,Lonsdale St-Spring St (West),627,2019-11-01
4,36,Queen St (West),774,2019-11-01
...,...,...,...,...
3482933,68,Flinders Ln -Degraves St (North),0,2020-12-31
3482934,69,Flinders Ln -Degraves St (Crossing),0,2020-12-31
3482935,70,Errol Street (East),7,2020-12-31
3482936,71,Westwood Place,22,2020-12-31


In [10]:
#Examining the features after data type conversion
results_df.dtypes

sensor_id                 int64
sensor_name              object
hourly_counts             int64
date             datetime64[ns]
dtype: object

In [11]:
#extracting data from 2015 till date
results_df = results_df[results_df['date'] > '2014-12-31']

In [12]:
results_df

Unnamed: 0,sensor_id,sensor_name,hourly_counts,date
0,34,Flinders St-Spark La,300,2019-11-01
1,39,Alfred Place,604,2019-11-01
2,37,Lygon St (East),216,2019-11-01
3,40,Lonsdale St-Spring St (West),627,2019-11-01
4,36,Queen St (West),774,2019-11-01
...,...,...,...,...
3482933,68,Flinders Ln -Degraves St (North),0,2020-12-31
3482934,69,Flinders Ln -Degraves St (Crossing),0,2020-12-31
3482935,70,Errol Street (East),7,2020-12-31
3482936,71,Westwood Place,22,2020-12-31


In [13]:
#Aggregating hourly pedestrian count for each by sensor-id
new_results_df = pd.DataFrame(results_df.groupby(['date', 'sensor_id'])['hourly_counts'].sum())
new_results_df = new_results_df.reset_index()
new_results_df

Unnamed: 0,date,sensor_id,hourly_counts
0,2015-01-01,2,21217
1,2015-01-01,3,32695
2,2015-01-01,4,36958
3,2015-01-01,5,31224
4,2015-01-01,6,20457
...,...,...,...
102471,2020-12-31,68,3498
102472,2020-12-31,69,2212
102473,2020-12-31,70,3091
102474,2020-12-31,71,696


In [14]:
#renaming the hourly_count column
new_results_df = new_results_df.rename(columns={"hourly_counts": "Total_Pedestrian_Count"})
new_results_df

Unnamed: 0,date,sensor_id,Total_Pedestrian_Count
0,2015-01-01,2,21217
1,2015-01-01,3,32695
2,2015-01-01,4,36958
3,2015-01-01,5,31224
4,2015-01-01,6,20457
...,...,...,...
102471,2020-12-31,68,3498
102472,2020-12-31,69,2212
102473,2020-12-31,70,3091
102474,2020-12-31,71,696


In [15]:
#Aggregating hourly pedestrian count for each day
new_results_df_per_day = pd.DataFrame(new_results_df.groupby(['date'])['Total_Pedestrian_Count'].sum())
new_results_df_per_day = new_results_df_per_day.reset_index()
new_results_df_per_day = new_results_df_per_day.rename(columns={"Total_Pedestrian_Count": "Total_Pedestrian_Count_per_day"})
new_results_df_per_day

Unnamed: 0,date,Total_Pedestrian_Count_per_day
0,2015-01-01,471563
1,2015-01-02,389048
2,2015-01-03,291761
3,2015-01-04,382364
4,2015-01-05,487220
...,...,...
2187,2020-12-27,385422
2188,2020-12-28,473046
2189,2020-12-29,497027
2190,2020-12-30,467634


In [16]:
new_results_df_per_day.to_csv('new_results_df_per_day.csv')

In [17]:
# Loading the weather dataset 
dataset = pd.read_csv('export_df.csv') #export_df_csv as created by Hung son on Trello card https://trello.com/c/7dcc09P9
#converting date_time feature from object datatypes to date
dataset['date'] = dataset['date'].astype('datetime64[ns]')

dataset.dtypes

date                                    datetime64[ns]
Rainfall amount (millimetres)                  float64
Minimum temperature (Degree C)                 float64
Maximum temperature (Degree C)                 float64
Daily global solar exposure (MJ/m*m)           float64
dtype: object

In [18]:
dataset.isna().sum()

date                                    0
Rainfall amount (millimetres)           8
Minimum temperature (Degree C)          0
Maximum temperature (Degree C)          0
Daily global solar exposure (MJ/m*m)    1
dtype: int64

In [19]:
dataset_wt_weather_features = dataset.merge(new_results_df_per_day, left_on='date', right_on='date')

dataset_wt_weather_features

Unnamed: 0,date,Rainfall amount (millimetres),Minimum temperature (Degree C),Maximum temperature (Degree C),Daily global solar exposure (MJ/m*m),Total_Pedestrian_Count_per_day
0,2015-01-01,0.0,13.3,26.9,23.6,471563
1,2015-01-02,0.0,15.4,38.8,26.8,389048
2,2015-01-03,0.0,20.0,38.2,26.5,291761
3,2015-01-04,4.6,16.3,21.4,25.2,382364
4,2015-01-05,0.0,15.0,22.0,30.7,487220
...,...,...,...,...,...,...
2126,2020-10-27,0.0,11.1,19.6,20.1,177267
2127,2020-10-28,0.0,9.5,20.9,26.7,311278
2128,2020-10-29,0.0,11.2,24.3,17.1,263334
2129,2020-10-30,0.0,12.9,18.6,21.0,313225


In [20]:
#writing the dataset into a csv file
import csv  

# field names  
fields = []  
for col in dataset_wt_weather_features.columns: 
    fields.append(col) 

# data rows of csv file  
rows = dataset_wt_weather_features.values  
    
# name of the csv file  
filename = "dataset_wt_weather_features.csv"
    
# writing to the csv file  
with open(filename, 'w') as csvfile:

    # creating a csv writer object  
    csvwriter = csv.writer(csvfile)  
        
    # writing the fields  
    csvwriter.writerow(fields)  
        
    # writing the data rows  
    csvwriter.writerows(rows) 

In [21]:
#Incorporating the weekly index. Representing monday as 1 and sunday as 7.
dataset_wt_weather_features['day_of_week'] = dataset_wt_weather_features.date.dt.dayofweek
dataset_wt_weather_features['day_of_week'] = dataset_wt_weather_features['day_of_week'] + 1

In [22]:
#Incorporating the monthly index. Representing January as 1 and december as 12.
dataset_wt_weather_features['monthly_index'] = dataset_wt_weather_features.date.dt.month

In [23]:
#Incorporating the day of the year index. 
dataset_wt_weather_features['day_of_year'] = dataset_wt_weather_features.date.dt.dayofyear

In [24]:
#Public Holiday dataset
public = pd.read_csv('Public_Holidays.csv')
holidays = public[['Holiday']]

In [25]:
#View the dataset
holidays

Unnamed: 0,Holiday
0,2015-01-01
1,2015-01-26
2,2015-03-14
3,2015-03-25
4,2015-03-26
...,...
74,2020-04-12
75,2020-04-13
76,2020-04-25
77,2020-06-08


In [26]:
#converting Holiday feature from object datatypes to date
holidays['Holiday'] = holidays['Holiday'].astype('datetime64[ns]')
holidays['Public_Holiday'] = 'Public Holiday'

In [27]:
#Validating the changes in datatype
holidays.dtypes

Holiday           datetime64[ns]
Public_Holiday            object
dtype: object

In [28]:
#Incorporating the public holiday information in the dataset
dataset_wt_weather_features = dataset_wt_weather_features.set_index("date").join(holidays.set_index("Holiday"), how="outer")

In [29]:
#Resetting the index
dataset_wt_weather_features.reset_index(inplace = True)

In [30]:
#Renaming the columns
dataset_wt_weather_features.rename(columns = {'index':'Date'}, inplace = True)

In [31]:
#Replacing the na values as 'not a holiday'
dataset_wt_weather_features['Public_Holiday'].fillna('Not a public holiday', inplace=True)

In [32]:
#Rearranging the columns 
dataset_wt_weather_features = dataset_wt_weather_features[['Date', 'day_of_year', 'day_of_week', 'monthly_index','Public_Holiday','Rainfall amount (millimetres)','Minimum temperature (Degree C)','Maximum temperature (Degree C)','Daily global solar exposure (MJ/m*m)', 'Total_Pedestrian_Count_per_day']]

In [33]:
#displaying the dataset
dataset_wt_weather_features

Unnamed: 0,Date,day_of_year,day_of_week,monthly_index,Public_Holiday,Rainfall amount (millimetres),Minimum temperature (Degree C),Maximum temperature (Degree C),Daily global solar exposure (MJ/m*m),Total_Pedestrian_Count_per_day
0,2015-01-01,1,4,1,Public Holiday,0.0,13.3,26.9,23.6,471563
1,2015-01-02,2,5,1,Not a public holiday,0.0,15.4,38.8,26.8,389048
2,2015-01-03,3,6,1,Not a public holiday,0.0,20.0,38.2,26.5,291761
3,2015-01-04,4,7,1,Not a public holiday,4.6,16.3,21.4,25.2,382364
4,2015-01-05,5,1,1,Not a public holiday,0.0,15.0,22.0,30.7,487220
...,...,...,...,...,...,...,...,...,...,...
2127,2020-10-27,301,2,10,Not a public holiday,0.0,11.1,19.6,20.1,177267
2128,2020-10-28,302,3,10,Not a public holiday,0.0,9.5,20.9,26.7,311278
2129,2020-10-29,303,4,10,Not a public holiday,0.0,11.2,24.3,17.1,263334
2130,2020-10-30,304,5,10,Not a public holiday,0.0,12.9,18.6,21.0,313225


In [34]:
#using monthly_index, day of the week, year and day of the year as train data 
X_train = dataset_wt_weather_features[['monthly_index','day_of_week']]

X_train['Year'] = dataset_wt_weather_features.Date.dt.year

X_train['day_of_year'] = dataset_wt_weather_features.Date.dt.dayofyear

In [35]:
X_train = X_train[['Year', 'day_of_year', 'monthly_index','day_of_week' ]]
X_train

Unnamed: 0,Year,day_of_year,monthly_index,day_of_week
0,2015,1,1,4
1,2015,2,1,5
2,2015,3,1,6
3,2015,4,1,7
4,2015,5,1,1
...,...,...,...,...
2127,2020,301,10,2
2128,2020,302,10,3
2129,2020,303,10,4
2130,2020,304,10,5


#Predicting independent variables for Minimum temperature (Degree C) and Maximum temperature (Degree C) from 1-11-2020 to 31-01-2021

In [36]:
Y_train_min_temp = dataset_wt_weather_features['Minimum temperature (Degree C)']
Y_train_min_temp
Y_train_max_temp = dataset_wt_weather_features['Maximum temperature (Degree C)']
Y_train_max_temp

0       26.9
1       38.8
2       38.2
3       21.4
4       22.0
        ... 
2127    19.6
2128    20.9
2129    24.3
2130    18.6
2131    16.8
Name: Maximum temperature (Degree C), Length: 2132, dtype: float64

In [37]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np
#using standard scaler
scaler = StandardScaler()
#fit get the mean and standard deviation of the train features 
#while tranform applies the standardation formular
X_train_StandardScaler = scaler.fit_transform(X_train) 
X_train_StandardScaler

array([[-1.44730998e+00, -1.71507061e+00, -1.58631120e+00,
        -9.38637205e-04],
       [-1.44730998e+00, -1.70542213e+00, -1.58631120e+00,
         4.99354993e-01],
       [-1.44730998e+00, -1.69577366e+00, -1.58631120e+00,
         9.99648624e-01],
       ...,
       [ 1.53055614e+00,  1.19876862e+00,  1.06833203e+00,
        -9.38637205e-04],
       [ 1.53055614e+00,  1.20841709e+00,  1.06833203e+00,
         4.99354993e-01],
       [ 1.53055614e+00,  1.21806557e+00,  1.06833203e+00,
         9.99648624e-01]])

In [38]:
from datetime import datetime, timedelta
#generating the test data set
base = datetime.strptime('2020-11-1', '%Y-%m-%d')
date_list = [base + timedelta(days=x) for x in range(92)]
test_data = df(date_list)
#renaming a column 
test_data.rename(columns = {0:'Date'},inplace = True)
#Incorporating the monthly index. Representing January as 1 and December as 12.
test_data['monthly_index'] = test_data.Date.dt.month

#Incorporating the year
test_data['Year'] = test_data.Date.dt.year

#Incorporating the weekly index. Representing monday as 1 and sunday as 7.
test_data['day_of_week'] = test_data.Date.dt.dayofweek + 1

#Incorporating the day of the year
test_data['day_of_year'] = test_data.Date.dt.dayofyear

#rearranging columns for the test dataset
test_data = test_data[['Year', 'day_of_year', 'monthly_index','day_of_week' ]]

#displaying the test dataset
test_data


Unnamed: 0,Year,day_of_year,monthly_index,day_of_week
0,2020,306,11,7
1,2020,307,11,1
2,2020,308,11,2
3,2020,309,11,3
4,2020,310,11,4
...,...,...,...,...
87,2021,27,1,3
88,2021,28,1,4
89,2021,29,1,5
90,2021,30,1,6


In [39]:
#scaling the test dataset 
test_data_StandardScaler = scaler.fit_transform(test_data) 
test_data_StandardScaler

array([[-0.71287918,  0.51194302,  0.60852228,  1.47391105],
       [-0.71287918,  0.51852106,  0.60852228, -1.5063047 ],
       [-0.71287918,  0.5250991 ,  0.60852228, -1.00960208],
       [-0.71287918,  0.53167714,  0.60852228, -0.51289945],
       [-0.71287918,  0.53825517,  0.60852228, -0.01619682],
       [-0.71287918,  0.54483321,  0.60852228,  0.4805058 ],
       [-0.71287918,  0.55141125,  0.60852228,  0.97720843],
       [-0.71287918,  0.55798929,  0.60852228,  1.47391105],
       [-0.71287918,  0.56456733,  0.60852228, -1.5063047 ],
       [-0.71287918,  0.57114537,  0.60852228, -1.00960208],
       [-0.71287918,  0.57772341,  0.60852228, -0.51289945],
       [-0.71287918,  0.58430145,  0.60852228, -0.01619682],
       [-0.71287918,  0.59087948,  0.60852228,  0.4805058 ],
       [-0.71287918,  0.59745752,  0.60852228,  0.97720843],
       [-0.71287918,  0.60403556,  0.60852228,  1.47391105],
       [-0.71287918,  0.6106136 ,  0.60852228, -1.5063047 ],
       [-0.71287918,  0.

In [40]:
# Loading the weather test dataset 
test_dataset = pd.read_csv('Test_dataset_updated.csv') 

# dataset.dtypes
test_dataset

Unnamed: 0,Date,Rainfall,Solar Exposure,Minimum temperature,Maximum Temperature
0,01-11-2020,0.0,23.2,12.6,17.0
1,02-11-2020,0.0,26.8,9.6,29.5
2,03-11-2020,0.0,26.9,12.1,30.4
3,04-11-2020,0.0,5.5,21.9,22.8
4,05-11-2020,2.8,23.1,11.1,15.8
...,...,...,...,...,...
56,27-12-2020,0.0,3.3,16.2,31.9
57,28-12-2020,3.2,26.5,11.5,17.8
58,29-12-2020,0.0,30.7,9.0,21.3
59,30-12-2020,0.0,30.3,13.7,21.4


##KNN Algorithm

In [50]:
#KNN parameters
n_neighbors = [2,4,5,10,25]
leaf_size = [10,20,30,50,100,500]
metric = ['euclidean','chebyshev','manhattan']
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute'] 

###KNN for max temperature prediction

In [51]:
#KNN for max temperature
#declaring parameters for KNN algorithm to determine it performance on the test data
scores_ = []
neighbor = []
leaf_size_ = []
metric_ = []
weight = []
algorithm_ = []

for i in n_neighbors:
  for j in leaf_size:
    for k in metric:
      for l in weights:
        for m in algorithm:
          knn_maxi = KNeighborsRegressor(n_neighbors = i, leaf_size = j, metric = k, weights = l, algorithm = m)
          knn_maxi.fit(X_train_StandardScaler, Y_train_max_temp)
          scores_.append(mean_squared_error(test_dataset['Maximum Temperature'].values, knn_maxi.predict(test_data_StandardScaler[0:61])))
          neighbor.append(i)
          leaf_size_.append(j)
          metric_.append(k)
          weight.append(l)
          algorithm_.append(m)

In [52]:
#checking the minimun mean square error of the KNN algorithm used for maximum temperature prediction
min(scores_)

49.20865809836065

In [53]:
#getting the index of the minimun mean square error of the KNN algorithm from the list
index_max = np.argmin(scores_)
index_max

592

In [54]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
max_temp_knn_parameters = [neighbor[index_max], leaf_size_[index_max], metric_[index_max], weight[index_max], algorithm_[index_max]]
max_temp_knn_parameters

[25, 10, 'manhattan', 'uniform', 'auto']

In [57]:
# knn_max = KNeighborsRegressor(algorithm = 'auto', leaf_size = 10, metric = 'chebyshev', n_neighbors = 5, weights = 'uniform')
knn_max = KNeighborsRegressor(n_neighbors = neighbor[index_max], leaf_size = leaf_size_[index_max],  metric = metric_[index_max], weights = weight[index_max], algorithm = algorithm_[index_max])
knn_max.fit(X_train_StandardScaler, Y_train_max_temp)

KNeighborsRegressor(algorithm='auto', leaf_size=10, metric='manhattan',
                    metric_params=None, n_jobs=None, n_neighbors=25, p=2,
                    weights='uniform')

In [58]:
#printing the test data side by side with the predicted data
print(np.concatenate((test_dataset['Maximum Temperature'].values.reshape(len(test_dataset['Maximum Temperature']),1), knn_max.predict(test_data_StandardScaler)[0:61].reshape(len(knn_max.predict(test_data_StandardScaler)[0:61]),1)),1))

[[17.    17.416]
 [29.5   25.86 ]
 [30.4   24.528]
 [22.8   19.808]
 [15.8   17.72 ]
 [16.6   16.784]
 [16.1   16.784]
 [19.5   17.416]
 [29.    25.86 ]
 [32.    24.82 ]
 [28.3   19.708]
 [25.5   17.72 ]
 [20.6   16.784]
 [21.    16.784]
 [32.5   17.416]
 [24.3   25.86 ]
 [21.1   24.82 ]
 [29.5   20.   ]
 [34.4   17.312]
 [20.2   16.784]
 [25.7   16.784]
 [28.2   17.416]
 [18.8   26.488]
 [20.3   24.82 ]
 [30.5   19.972]
 [19.4   17.408]
 [36.    16.952]
 [22.9   16.128]
 [18.6   17.028]
 [27.2   26.84 ]
 [26.2   25.108]
 [18.6   20.176]
 [23.7   18.796]
 [22.9   17.524]
 [28.4   17.744]
 [20.5   17.68 ]
 [17.3   26.676]
 [17.1   25.3  ]
 [21.5   20.028]
 [17.7   18.552]
 [21.1   17.22 ]
 [28.1   17.74 ]
 [30.2   17.94 ]
 [33.7   25.388]
 [33.5   25.168]
 [24.6   20.124]
 [25.7   18.144]
 [18.4   17.5  ]
 [19.9   17.24 ]
 [18.9   17.784]
 [21.5   25.656]
 [20.4   25.38 ]
 [19.5   20.18 ]
 [18.7   18.244]
 [20.    17.244]
 [27.4   17.72 ]
 [31.9   17.756]
 [17.8   25.828]
 [21.3   25.38

In [102]:
import pickle
# save the model to disk
filename = 'finalized_knn_model_for_max_temp.sav'
pickle.dump(knn_max, open(filename, 'wb'))

###KNN for minimum Temperature prediction

In [59]:
#KNN for minimun temperature
#declaring parameters for KNN algorithm to determine it performance on the test data
scores_knn_min_temp = []
neighbor = []
leaf_size_ = []
metric_ = []
weight = []
algorithm_ = []

for i in n_neighbors:
  for j in leaf_size:
    for k in metric:
      for l in weights:
        for m in algorithm:
          # print(i,j,k,l,m)
          knn_mini = KNeighborsRegressor(n_neighbors = i, leaf_size = j, metric = k, weights = l, algorithm = m)
          knn_mini.fit(X_train_StandardScaler, Y_train_min_temp)
          scores_knn_min_temp.append(mean_squared_error(test_dataset['Minimum temperature'].values, knn_mini.predict(test_data_StandardScaler[0:61])))

          neighbor.append(i)
          leaf_size_.append(j)
          metric_.append(k)
          weight.append(l)
          algorithm_.append(m)

In [60]:
#checking the minimun mean square error of the KNN algorithm used for minimum temperature prediction
min(scores_knn_min_temp)

20.264958950819675

In [61]:
#getting the index of the minimun mean square error of the KNN algorithm from the list
index_min = np.argmin(scores_knn_min_temp)
index_min

592

In [62]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
min_temp_knn_parameters =  [neighbor[index_min], leaf_size_[index_min], metric_[index_min], weight[index_min], algorithm_[index_min]]
min_temp_knn_parameters

[25, 10, 'manhattan', 'uniform', 'auto']

In [64]:
knn_min = KNeighborsRegressor(n_neighbors = neighbor[index_min], leaf_size = leaf_size_[index_min],  metric =  metric_[index_min],  weights = weight[index_min], algorithm = algorithm_[index_min])

knn_min.fit(X_train_StandardScaler, Y_train_min_temp)

KNeighborsRegressor(algorithm='auto', leaf_size=10, metric='manhattan',
                    metric_params=None, n_jobs=None, n_neighbors=25, p=2,
                    weights='uniform')

In [65]:
#printing the test data side by side with the predicted data for minimum temperature
print(np.concatenate((test_dataset['Minimum temperature'].values.reshape(len(test_dataset['Minimum temperature']),1), knn_min.predict(test_data_StandardScaler)[0:61].reshape(len(knn_min.predict(test_data_StandardScaler)[0:61]),1)),1))

[[12.6    9.96 ]
 [ 9.6   14.692]
 [12.1   13.548]
 [21.9   10.62 ]
 [11.1    9.712]
 [ 9.5    8.848]
 [11.3    9.056]
 [ 8.4    9.96 ]
 [11.3   14.692]
 [16.1   13.116]
 [22.8   10.536]
 [19.3    9.712]
 [15.4    8.848]
 [13.5    9.056]
 [12.3    9.96 ]
 [18.2   14.692]
 [11.9   13.116]
 [10.8   10.68 ]
 [16.1    9.444]
 [15.6    8.848]
 [13.3    9.056]
 [16.1    9.96 ]
 [16.6   14.92 ]
 [13.5   13.116]
 [13.2   10.752]
 [15.8    9.116]
 [10.2    8.788]
 [16.     8.512]
 [14.4    9.512]
 [12.1   14.516]
 [16.8   13.22 ]
 [11.9   10.836]
 [10.6   10.232]
 [11.4    9.152]
 [13.9    9.38 ]
 [12.9   10.216]
 [11.3   15.204]
 [10.6   13.4  ]
 [11.7   10.764]
 [13.1   10.228]
 [ 9.6    9.152]
 [11.9    9.312]
 [16.     9.712]
 [17.7   14.452]
 [20.4   13.708]
 [15.8   10.904]
 [15.5    9.96 ]
 [12.5    9.336]
 [12.3    9.036]
 [ 9.9    8.856]
 [11.2   14.624]
 [13.8   13.536]
 [13.5   11.032]
 [13.4    9.532]
 [13.6    9.3  ]
 [10.2    8.932]
 [16.2    9.04 ]
 [11.5   14.892]
 [ 9.    13.53

In [103]:
import pickle
# save the model to disk
filename = 'finalized_knn_model_for_min_temp.sav'
pickle.dump(knn_min, open(filename, 'wb'))

##MLP

In [None]:
hidden_layer_sizes = []
for i in range(1,10):
  hidden_layer_sizes.append(((i,i,i)))

activation = ['tanh', 'relu', 'logistic', 'identity']
solver = ['lbfgs', 'sgd', 'adam']
alpha = [0.0001, 0.05, 0.01, .001]
learning_rate = ['constant','adaptive', 'invscaling']

In [None]:
hidden_layer_sizes

[(1, 1, 1),
 (2, 2, 2),
 (3, 3, 3),
 (4, 4, 4),
 (5, 5, 5),
 (6, 6, 6),
 (7, 7, 7),
 (8, 8, 8),
 (9, 9, 9)]

##MLP for maximum Temperature

In [None]:
#MLP for Maximum Temperature

#declaring parameters for MLP algorithm to determine it performance on the test data
scores_mlp_max_temp = []
hidden_layer_sizes_ = []
solver_ = []
activation_ = []
alpha_ = []
learning_rate_ = []


for i in hidden_layer_sizes:
  for j in activation:
    for k in solver:
      for l in alpha:
        for m in learning_rate:

          mlp_maxi = MLPRegressor(hidden_layer_sizes = i, activation = j, solver = k, alpha = l, learning_rate = m)
          mlp_maxi.fit(X_train_StandardScaler, Y_train_max_temp)
          scores_mlp_max_temp.append(mean_squared_error(test_dataset['Maximum Temperature'].values, mlp_maxi.predict(test_data_StandardScaler[0:61])))

          hidden_layer_sizes_.append(i)
          activation_.append(j)
          solver_.append(k)
          alpha_.append(l)
          learning_rate_.append(m)

In [None]:
#checking the minimun mean square error of the MLP algorithm used for maximum temperature prediction
min(scores_mlp_max_temp)

166.4169045536952

In [None]:
#getting the index of the minimun mean square error of the MLP algorithm from the list
index_max = np.argmin(scores_mlp_max_temp)
index_max

1186

In [None]:
#obtaining the parameters with the minimun mean square error of the mlp algorithm 
max_temp_mlp_parameters = [hidden_layer_sizes_[index_max], activation_[index_max], solver_[index_max], alpha_[index_max], learning_rate_[index_max]]
max_temp_mlp_parameters

[(9, 9, 9), 'tanh', 'adam', 0.001, 'adaptive']

In [86]:
mlp_max = MLPRegressor(hidden_layer_sizes = (2, 2, 2), activation = 'relu', solver = 'sgd' , alpha = 0.001, learning_rate = 'invscaling')
mlp_max.fit(X_train_StandardScaler, Y_train_min_temp)

MLPRegressor(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(2, 2, 2), learning_rate='invscaling',
             learning_rate_init=0.001, max_fun=15000, max_iter=200,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='sgd',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [None]:
#printing the test data side by side with the predicted data for maximum temperature
print(np.concatenate((test_dataset['Maximum Temperature'].values.reshape(len(test_dataset['Maximum Temperature']),1), mlp_max.predict(test_data_StandardScaler)[0:61].reshape(len(mlp_max.predict(test_data_StandardScaler)[0:61]),1)),1))

[[17.          7.4133913 ]
 [29.5         9.87434826]
 [30.4         9.28136362]
 [22.8         8.69064203]
 [15.8         8.09992043]
 [16.6         7.50919884]
 [16.1         7.16565577]
 [19.5         7.40137005]
 [29.          9.73929761]
 [32.          9.13429173]
 [28.3         8.54357013]
 [25.5         7.95284854]
 [20.6         7.36212694]
 [21.          7.15363452]
 [32.5         7.38934881]
 [24.3         9.60424697]
 [21.1         8.98721983]
 [29.5         8.39649824]
 [34.4         7.80577664]
 [20.2         7.21505505]
 [25.7         7.14161327]
 [28.2         7.37732756]
 [18.8         9.46919632]
 [20.3         8.84014794]
 [30.5         8.24942634]
 [19.4         7.65870475]
 [36.          7.06798316]
 [22.9         7.12959203]
 [18.6         7.36530631]
 [27.2         9.33414568]
 [26.2         8.92862147]
 [18.6         8.33789987]
 [23.7         7.74717828]
 [22.9         7.15645668]
 [28.4         7.32994731]
 [20.5         7.56566159]
 [17.3         9.37227117]
 

##MLP for minimum Temperature

In [None]:
#mlp for minimun temperature
#declaring parameters for MLP algorithm to determine it performance on the test data
scores_mlp_min_temp = []
hidden_layer_sizes_ = []
solver_ = []
activation_ = []
alpha_ = []
learning_rate_ = []


for i in hidden_layer_sizes:
  for j in activation:
    for k in solver:
      for l in alpha:
        for m in learning_rate:

          mlp_mini = MLPRegressor(hidden_layer_sizes = i, activation = j, solver = k, alpha = l, learning_rate = m)
          mlp_mini.fit(X_train_StandardScaler, Y_train_min_temp)
          scores_mlp_min_temp.append(mean_squared_error(test_dataset['Minimum temperature'].values, mlp_mini.predict(test_data_StandardScaler[0:61])))

          hidden_layer_sizes_.append(i)
          activation_.append(j)
          solver_.append(k)
          alpha_.append(l)
          learning_rate_.append(m)

In [None]:
#checking the minimun mean square error of the MLP algorithm used for minimum temperature prediction
min(scores_mlp_min_temp)

11.957084593444305

In [None]:
#getting the index of the minimun mean square error of the MLP algorithm from the list
index_min = np.argmin(scores_mlp_min_temp)
index_min

1182

In [None]:
#obtaining the parameters with the minimun mean square error of the MLP algorithm 
min_temp_mlp_parameters = [hidden_layer_sizes_[index_min], activation_[index_min], solver_[index_min], alpha_[index_min], learning_rate_[index_min]]
min_temp_mlp_parameters

[(9, 9, 9), 'tanh', 'adam', 0.01, 'constant']

In [84]:
mlp_min = MLPRegressor(hidden_layer_sizes = (14, 14, 14), activation = 'tanh', solver = 'adam' , alpha = 0.01, learning_rate = 'invscaling')
mlp_min.fit(X_train_StandardScaler, Y_train_min_temp)

MLPRegressor(activation='tanh', alpha=0.01, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='invscaling',
             learning_rate_init=0.001, max_fun=15000, max_iter=200,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [85]:
#printing the test data side by side with the predicted data for minimum temperature
print(np.concatenate((test_dataset['Minimum temperature'].values.reshape(len(test_dataset['Minimum temperature']),1), mlp_min.predict(test_data_StandardScaler)[0:61].reshape(len(mlp_min.predict(test_data_StandardScaler)[0:61]),1)),1))

[[12.6         4.26785885]
 [ 9.6        14.25947977]
 [12.1        14.04225066]
 [21.9        12.1163589 ]
 [11.1         8.2756625 ]
 [ 9.5         6.06032466]
 [11.3         4.7687585 ]
 [ 8.4         4.2593419 ]
 [11.3        14.25869722]
 [16.1        14.03689699]
 [22.8        12.09791181]
 [19.3         8.28738861]
 [15.4         6.08090953]
 [13.5         4.78072026]
 [12.3         4.25217967]
 [18.2        14.25786564]
 [11.9        14.03118786]
 [10.8        12.07829566]
 [16.1         8.2996916 ]
 [15.6         6.10280137]
 [13.3         4.79480018]
 [16.1         4.2464625 ]
 [16.6        14.25698443]
 [13.5        14.02512192]
 [13.2        12.05759734]
 [15.8         8.31253515]
 [10.2         6.12595602]
 [16.          4.81099542]
 [14.4         4.24227849]
 [12.1        14.25605285]
 [16.8        14.00815377]
 [11.9        11.99535809]
 [10.6         8.30983327]
 [11.4         6.11617406]
 [13.9         4.77415058]
 [12.9         4.16354813]
 [11.3        14.25461699]
 

##Decision Tree

###Maximum Temperature

In [66]:
splitter=['best','random']
criterion = ['mse', 'friedman_mse', 'mae']
max_depth = [1,2,3,4,5,10,20,30,40]
max_features =['auto','sqrt','log2'] 
min_samples_split= [2,3,4,5,10,20,30,40]
random_state= [1,2,3,4,5,6,7,8,9,42]

In [67]:
#DT for Maximum Temperature

#declaring parameters for DT algorithm to determine it performance on the test data
scores_dt_max_temp = []
splitter_ = []
criterion_ =  []
max_depth_ = []
max_features_ = []
min_samples_split_ = []
random_state_ = []


for i in splitter:
  for j in criterion:
    for k in max_depth:
      for l in max_features:
        for m in min_samples_split:
          for n in random_state:
            dt_maxi = DecisionTreeRegressor(splitter = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m,random_state = n )
            dt_maxi.fit(X_train_StandardScaler, Y_train_max_temp)
            scores_dt_max_temp.append(mean_squared_error(test_dataset['Maximum Temperature'].values, dt_maxi.predict(test_data_StandardScaler[0:61])))

            splitter_.append(i)
            criterion_.append(j)
            max_depth_.append(k)
            max_features_.append(l)
            min_samples_split_.append(m)
            random_state_.append(n)

In [68]:
#checking the minimun mean square error of the dt algorithm used for maximum temperature prediction
min(scores_dt_max_temp)

28.51407225156039

In [69]:
#getting the index of the minimun mean square error of the dt algorithm from the list
index_max = np.argmin(scores_dt_max_temp)
index_max

7555

In [87]:
#obtaining the parameters with the minimun mean square error of the DT algorithm 
max_temp_dt_parameters = [splitter_[index_max], criterion_[index_max], max_depth_[index_max], max_features_[index_max], min_samples_split_[index_max],random_state_[index_max]]
max_temp_dt_parameters

['random', 'mse', 5, 'sqrt', 5, 6]

In [88]:
dt_max = DecisionTreeRegressor(splitter = splitter_[index_max] , criterion = criterion_[index_max], max_depth = max_depth_[index_max], max_features = max_features_[index_max], min_samples_split = min_samples_split_[index_max],random_state = random_state_[index_max])
dt_max.fit(X_train_StandardScaler, Y_train_min_temp)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=6, splitter='random')

In [89]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Maximum Temperature'].values.reshape(len(test_dataset['Maximum Temperature']),1), dt_max.predict(test_data_StandardScaler)[0:61].reshape(len(dt_max.predict(test_data_StandardScaler)[0:61]),1)),1))

[[17.         14.04956522]
 [29.5        14.26387337]
 [30.4        14.26387337]
 [22.8        14.26387337]
 [15.8        14.26387337]
 [16.6        14.26387337]
 [16.1        14.26387337]
 [19.5        14.04956522]
 [29.         14.26387337]
 [32.         14.26387337]
 [28.3        14.26387337]
 [25.5        14.26387337]
 [20.6        14.26387337]
 [21.         14.26387337]
 [32.5        14.04956522]
 [24.3        14.26387337]
 [21.1        14.26387337]
 [29.5        14.26387337]
 [34.4        14.26387337]
 [20.2        14.26387337]
 [25.7        14.26387337]
 [28.2        14.04956522]
 [18.8        14.26387337]
 [20.3        14.26387337]
 [30.5        14.26387337]
 [19.4        14.26387337]
 [36.         14.26387337]
 [22.9        14.26387337]
 [18.6        14.04956522]
 [27.2        14.26387337]
 [26.2        14.26387337]
 [18.6        14.26387337]
 [23.7        14.26387337]
 [22.9        14.26387337]
 [28.4        14.26387337]
 [20.5        14.04956522]
 [17.3        14.26387337]
 

###Minimum Temperature

In [73]:
#DT for Minimum Temperature

#declaring parameters for DT algorithm to determine it performance on the test data
scores_dt_min_temp = []
splitter_ = []
criterion_ =  []
max_depth_ = []
max_features_ = []
min_samples_split_ = []
random_state_ = []


for i in splitter:
  for j in criterion:
    for k in max_depth:
      for l in max_features:
        for m in min_samples_split:
          for n in random_state:
            dt_mini = DecisionTreeRegressor(splitter = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m,random_state = n )
            dt_mini.fit(X_train_StandardScaler, Y_train_min_temp)
            scores_dt_min_temp.append(mean_squared_error(test_dataset['Minimum temperature'].values, dt_mini.predict(test_data_StandardScaler[0:61])))

            splitter_.append(i)
            criterion_.append(j)
            max_depth_.append(k)
            max_features_.append(l)
            min_samples_split_.append(m)
            random_state_.append(n)

In [74]:
#checking the minimun mean square error of the dt algorithm used for maximum temperature prediction
min(scores_dt_min_temp)

9.385081967213116

In [75]:
#getting the index of the minimun mean square error of the dt algorithm from the list
index_min = np.argmin(scores_dt_min_temp)
index_min

10801

In [76]:
#obtaining the parameters with the minimun mean square error of the DT algorithm 
min_temp_dt_parameters = [splitter_[index_min], criterion_[index_min], max_depth_[index_min], max_features_[index_min], min_samples_split_[index_min],random_state_[index_min]]
min_temp_dt_parameters

['random', 'mae', 1, 'auto', 2, 2]

In [90]:
#build a decision tree model using the hyper parameter with the least mean square error
dt_min = DecisionTreeRegressor(splitter = 'best', criterion = criterion_[index_min], max_depth = max_depth_[index_min], max_features = max_features_[index_min], min_samples_split =5,random_state = random_state_[index_min])
dt_min.fit(X_train_StandardScaler, Y_train_min_temp)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mae', max_depth=1,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=2, splitter='best')

In [91]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Minimum temperature'].values.reshape(len(test_dataset['Minimum temperature']),1), dt_min.predict(test_data_StandardScaler)[0:61].reshape(len(dt_min.predict(test_data_StandardScaler)[0:61]),1)),1))

[[12.6  9.9]
 [ 9.6 15.9]
 [12.1 15.9]
 [21.9  9.9]
 [11.1  9.9]
 [ 9.5  9.9]
 [11.3  9.9]
 [ 8.4  9.9]
 [11.3 15.9]
 [16.1 15.9]
 [22.8  9.9]
 [19.3  9.9]
 [15.4  9.9]
 [13.5  9.9]
 [12.3  9.9]
 [18.2 15.9]
 [11.9 15.9]
 [10.8  9.9]
 [16.1  9.9]
 [15.6  9.9]
 [13.3  9.9]
 [16.1  9.9]
 [16.6 15.9]
 [13.5 15.9]
 [13.2  9.9]
 [15.8  9.9]
 [10.2  9.9]
 [16.   9.9]
 [14.4  9.9]
 [12.1 15.9]
 [16.8 15.9]
 [11.9  9.9]
 [10.6  9.9]
 [11.4  9.9]
 [13.9  9.9]
 [12.9  9.9]
 [11.3 15.9]
 [10.6 15.9]
 [11.7  9.9]
 [13.1  9.9]
 [ 9.6  9.9]
 [11.9  9.9]
 [16.   9.9]
 [17.7 15.9]
 [20.4 15.9]
 [15.8  9.9]
 [15.5  9.9]
 [12.5  9.9]
 [12.3  9.9]
 [ 9.9  9.9]
 [11.2 15.9]
 [13.8 15.9]
 [13.5  9.9]
 [13.4  9.9]
 [13.6  9.9]
 [10.2  9.9]
 [16.2  9.9]
 [11.5 15.9]
 [ 9.  15.9]
 [13.7  9.9]
 [15.5  9.9]]


##Random Forest

###RF Maximum Temperature

In [94]:

criterion = ['mse', 'friedman_mse', 'mae']
max_depth = [10,20,30,40]
max_features =['auto','sqrt','log2'] 
min_samples_split= [10,20,30,40]
random_state= [1,2,3,42]
n_estimators =[10,50,100]



In [95]:
#RF for Maximum Temperature

#declaring parameters for RF algorithm to determine it performance on the test data
scores_rf_max_temp = []
n_estimators_ = []
criterion_ =  []
max_depth_ = []
max_features_ = []
min_samples_split_ = []
random_state_ = []


for i in n_estimators:
  for j in criterion:
    for k in max_depth:
      for l in max_features:
        for m in min_samples_split:
          for n in random_state:
            rf_maxi = RandomForestRegressor(n_estimators = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m,random_state = n )
            rf_maxi.fit(X_train_StandardScaler, Y_train_min_temp)
            scores_rf_max_temp.append(mean_squared_error(test_dataset['Maximum Temperature'].values, rf_maxi.predict(test_data_StandardScaler[0:61])))

            n_estimators_.append(i)
            criterion_.append(j)
            max_depth_.append(k)
            max_features_.append(l)
            min_samples_split_.append(m)
            random_state_.append(n)

In [104]:
#checking the minimun mean square error of the dt algorithm used for maximum temperature prediction
min(scores_rf_max_temp)

152.68955823125106

In [97]:
#getting the index of the minimun mean square error of the RF algorithm from the list
index_max = np.argmin(scores_rf_max_temp)
index_max

21

In [98]:
#obtaining the parameters with the minimun mean square error of the RF algorithm 
max_temp_rf_parameters = [n_estimators_[index_max], criterion_[index_max], max_depth_[index_max], max_features_[index_max],min_samples_split_[index_max],random_state_[index_max] ]
max_temp_rf_parameters


[10, 'mse', 10, 'sqrt', 20, 2]

In [99]:
rf_max = RandomForestRegressor(n_estimators = n_estimators_[index_max], criterion = criterion_[index_max], max_depth = max_depth_[index_max], max_features =  max_features_[index_max], min_samples_split = min_samples_split_[index_max],random_state = random_state_[index_max])
rf_max.fit(X_train_StandardScaler, Y_train_min_temp)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=20, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=2, verbose=0, warm_start=False)

In [100]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Maximum Temperature'].values.reshape(len(test_dataset['Maximum Temperature']),1), rf_max.predict(test_data_StandardScaler)[0:61].reshape(len(rf_max.predict(test_data_StandardScaler)[0:61]),1)),1))

[[17.         12.38693773]
 [29.5        16.88446392]
 [30.4        13.68804173]
 [22.8        12.5071571 ]
 [15.8        10.73177396]
 [16.6        11.06521443]
 [16.1        11.51281731]
 [19.5        12.38693773]
 [29.         16.88446392]
 [32.         13.68804173]
 [28.3        12.5071571 ]
 [25.5        10.73177396]
 [20.6        11.06521443]
 [21.         11.51281731]
 [32.5        12.38693773]
 [24.3        16.88446392]
 [21.1        13.68804173]
 [29.5        12.5071571 ]
 [34.4        10.73177396]
 [20.2        11.06521443]
 [25.7        11.51281731]
 [28.2        12.38693773]
 [18.8        16.88446392]
 [20.3        13.68804173]
 [30.5        12.5071571 ]
 [19.4        10.73177396]
 [36.         11.06521443]
 [22.9        11.51281731]
 [18.6        12.38693773]
 [27.2        16.88446392]
 [26.2        13.31337253]
 [18.6        12.28537836]
 [23.7        10.36491348]
 [22.9        10.58552788]
 [28.4        11.55957621]
 [20.5        11.74669667]
 [17.3        16.88446392]
 

In [101]:
#rf for Minimum Temperature

#declaring parameters for rf algorithm to determine it performance on the test data
scores_rf_min_temp = []
n_estimators_ = []
criterion_ =  []
max_depth_ = []
max_features_ = []
min_samples_split_ = []
random_state_ = []


for i in n_estimators:
  for j in criterion:
    for k in max_depth:
      for l in max_features:
        for m in min_samples_split:
          for n in random_state:
            rf_mini = RandomForestRegressor(n_estimators = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m,random_state = n )
            rf_mini.fit(X_train_StandardScaler, Y_train_min_temp)
            scores_rf_min_temp.append(mean_squared_error(test_dataset['Minimum temperature'].values, rf_mini.predict(test_data_StandardScaler[0:61])))

            n_estimators_.append(i)
            criterion_.append(j)
            max_depth_.append(k)
            max_features_.append(l)
            min_samples_split_.append(m)
            random_state_.append(n)

In [105]:
#checking the minimun mean square error of the dt algorithm used for maximum temperature prediction
min(scores_rf_min_temp)

14.254029856912899

In [106]:
#getting the index of the minimun mean square error of the dt algorithm from the list
index_min = np.argmin(scores_rf_min_temp)
index_min

21

In [107]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
min_temp_rf_parameters = [n_estimators_[index_max], criterion_[index_max], max_depth_[index_max], max_features_[index_max],min_samples_split_[index_max],random_state_[index_max] ]
min_temp_rf_parameters

[10, 'mse', 10, 'sqrt', 20, 2]

In [108]:
rf_min =  RandomForestRegressor(n_estimators = 10, criterion = 'mse', max_depth = 20, max_features = 'sqrt', min_samples_split = 10,random_state = 2 )
rf_min.fit(X_train_StandardScaler, Y_train_min_temp)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=2, verbose=0, warm_start=False)

In [109]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Minimum temperature'].values.reshape(len(test_dataset['Minimum temperature']),1), rf_min.predict(test_data_StandardScaler)[0:61].reshape(len(rf_min.predict(test_data_StandardScaler)[0:61]),1)),1))

[[12.6        11.64033964]
 [ 9.6        16.25947689]
 [12.1        13.95704742]
 [21.9        10.48035897]
 [11.1         8.51722305]
 [ 9.5         8.86416944]
 [11.3         8.89539438]
 [ 8.4        11.64033964]
 [11.3        16.25947689]
 [16.1        13.95704742]
 [22.8        10.48035897]
 [19.3         8.51722305]
 [15.4         8.86416944]
 [13.5         8.89539438]
 [12.3        11.64033964]
 [18.2        16.25947689]
 [11.9        13.95704742]
 [10.8        10.48035897]
 [16.1         8.51722305]
 [15.6         8.86416944]
 [13.3         8.89539438]
 [16.1        11.64033964]
 [16.6        16.25947689]
 [13.5        13.95704742]
 [13.2        10.48035897]
 [15.8         8.51722305]
 [10.2         8.86416944]
 [16.          8.89539438]
 [14.4        11.64033964]
 [12.1        16.25947689]
 [16.8        13.66671012]
 [11.9        10.20378205]
 [10.6         8.40519841]
 [11.4         8.96818376]
 [13.9         8.94402991]
 [12.9        11.34602839]
 [11.3        16.5715007 ]
 

##SVR

###SVR Maximum Temperature

In [110]:
kernel =['linear', 'poly', 'rbf']
gamma = ['auto', 'scale']
degree = [3,4,5,10]
C = [2.0,3.0,4.0]

In [124]:
#SVR for Maximum Temperature

#declaring parameters for SVR algorithm to determine it performance on the test data
scores_svr_max_temp = []
kernel_ = []
gamma_ =  []
degree_ = []
C_ = []



for i in kernel:
  for j in gamma:
    for k in degree:
      for l in C:
            svr_maxi = SVR(kernel = i, gamma = j, degree = k, C = l )
            svr_maxi.fit(X_train_StandardScaler, Y_train_max_temp)
            scores_svr_max_temp.append(mean_squared_error(test_dataset['Maximum Temperature'].values, svr_maxi.predict(test_data_StandardScaler[0:61])))

            kernel_.append(i)
            gamma_.append(j)
            degree_.append(k)
            C_.append(l)

In [125]:
#checking the minimun mean square error of the SVR algorithm used for maximum temperature prediction
min(scores_svr_max_temp)

51.15573156900438

In [126]:
#getting the index of the minimun mean square error of the dt algorithm from the list
index_max = np.argmin(scores_svr_max_temp)
index_max

24

In [127]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
max_temp_svr_parameters = [kernel_[index_max], gamma_[index_max], degree_[index_max], C_[index_max]]
max_temp_svr_parameters

['poly', 'auto', 3, 2.0]

In [128]:
svr_max = SVR(kernel = kernel_[index_max], gamma = gamma_[index_max], degree = degree_[index_max], C = C_[index_max])
svr_max.fit(X_train_StandardScaler, Y_train_max_temp)

SVR(C=2.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [129]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Maximum Temperature'].values.reshape(len(test_dataset['Maximum Temperature']),1), svr_max.predict(test_data_StandardScaler)[0:61].reshape(len(svr_max.predict(test_data_StandardScaler)[0:61]),1)),1))

[[17.         18.53564571]
 [29.5        18.65510191]
 [30.4        19.18022717]
 [22.8        19.33094762]
 [15.8        19.22257272]
 [16.6        18.97041197]
 [16.1        18.68977485]
 [19.5        18.49597082]
 [29.         18.70399344]
 [32.         19.21512497]
 [28.3        19.35144125]
 [25.5        19.22825174]
 [20.6        18.96086593]
 [21.         18.6645933 ]
 [32.5        18.45474334]
 [24.3        18.75610248]
 [21.1        19.25244582]
 [29.5        19.37356346]
 [34.4        19.23476488]
 [20.2        18.95135955]
 [25.7        18.63865696]
 [28.2        18.41196659]
 [18.8        18.81143236]
 [20.3        19.29219305]
 [30.5        19.39731759]
 [19.4        19.24211547]
 [36.         18.94189617]
 [22.9        18.61196915]
 [18.6        18.36764391]
 [27.2        18.86998642]
 [26.2        19.51721298]
 [18.6        19.53351205]
 [23.7        19.27343005]
 [22.9        18.85227646]
 [28.4        18.38536076]
 [20.5        17.98799242]
 [17.3        19.17606668]
 

###SVR Minimum Temperature

In [118]:
#svr for Minimum Temperature

#declaring parameters for svr algorithm to determine it performance on the test data
scores_svr_min_temp = []
kernel_ = []
gamma_ =  []
degree_ = []
C_ = []


for i in kernel:
  for j in gamma:
    for k in degree:
      for l in C:
            svr_mini =  SVR(kernel = i, gamma = j, degree = k, C = l )
            svr_mini.fit(X_train_StandardScaler, Y_train_min_temp)
            scores_svr_min_temp.append(mean_squared_error(test_dataset['Minimum temperature'].values, svr_mini.predict(test_data_StandardScaler[0:61])))

            kernel_.append(i)
            gamma_.append(j)
            degree_.append(k)
            C_.append(l)

In [130]:
#checking the minimun mean square error of the SVR algorithm used for maximum temperature prediction
min(scores_svr_min_temp)

13.27778084644603

In [131]:
#getting the index of the minimun mean square error of the SVR algorithm from the list
index_min = np.argmin(scores_svr_min_temp)
index_min

0

In [132]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
min_temp_svr_parameters = [kernel_[index_min], gamma_[index_min], degree_[index_min], C_[index_min]]
min_temp_svr_parameters

['linear', 'auto', 3, 2.0]

In [133]:
svr_min =  SVR(kernel = kernel_[index_min], gamma = gamma_[index_min], degree = degree_[index_min], C = C_[index_min])
svr_min.fit(X_train_StandardScaler, Y_train_min_temp)

SVR(C=2.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [134]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Minimum temperature'].values.reshape(len(test_dataset['Minimum temperature']),1), svr_min.predict(test_data_StandardScaler)[0:61].reshape(len(svr_min.predict(test_data_StandardScaler)[0:61]),1)),1))

[[12.6        10.18153025]
 [ 9.6        13.12661601]
 [12.1        12.63673467]
 [21.9        12.14685333]
 [11.1        11.65697199]
 [ 9.5        11.16709065]
 [11.3        10.67720931]
 [ 8.4        10.18732797]
 [11.3        13.13241373]
 [16.1        12.64253239]
 [22.8        12.15265105]
 [19.3        11.66276971]
 [15.4        11.17288837]
 [13.5        10.68300703]
 [12.3        10.19312569]
 [18.2        13.13821145]
 [11.9        12.64833011]
 [10.8        12.15844877]
 [16.1        11.66856743]
 [15.6        11.17868609]
 [13.3        10.68880475]
 [16.1        10.19892341]
 [16.6        13.14400917]
 [13.5        12.65412783]
 [13.2        12.16424649]
 [15.8        11.67436515]
 [10.2        11.18448381]
 [16.         10.69460247]
 [14.4        10.20472113]
 [12.1        13.14980689]
 [16.8        12.60409757]
 [11.9        12.11421622]
 [10.6        11.62433488]
 [11.4        11.13445354]
 [13.9        10.6445722 ]
 [12.9        10.15469086]
 [11.3        13.09977663]
 

In [136]:
import pickle
# save the model to disk
filename = 'finalized_svr_model_for_min_temp.sav'
pickle.dump(svr_min, open(filename, 'wb'))

In [135]:
#Using Sum of squared residuals to evaluate the performance of the models
from sklearn.metrics import r2_score
from prettytable import PrettyTable

rt = PrettyTable(["Algorithm", "r2 for minimum temperature","r2 for maximum temperature", "MSE for minimum temperature","MSE for maximum temperature",])
rt.align["Algorithm"] = "l" # Left align city names
rt.padding_width = 1 # One space between column edges and contents (default)
rt.add_row(["MLP",
            '{:.2f}'.format(r2_score(test_dataset['Minimum temperature'].values, mlp_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(r2_score(test_dataset['Maximum Temperature'].values, mlp_max.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Minimum temperature'].values, mlp_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Maximum Temperature'].values, mlp_max.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["Decision Tree",
            '{:.2f}'.format(r2_score(test_dataset['Minimum temperature'].values, dt_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(r2_score(test_dataset['Maximum Temperature'].values, dt_max.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Minimum temperature'].values, dt_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Maximum Temperature'].values, dt_max.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["KNN",
            '{:.2f}'.format(r2_score(test_dataset['Minimum temperature'].values, knn_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(r2_score(test_dataset['Maximum Temperature'].values, knn_max.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Minimum temperature'].values, knn_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Maximum Temperature'].values, knn_max.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["Random forest",
            '{:.2f}'.format(r2_score(test_dataset['Minimum temperature'].values, rf_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(r2_score(test_dataset['Maximum Temperature'].values, rf_max.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Minimum temperature'].values, rf_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Maximum Temperature'].values, rf_max.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["SVM",
            '{:.2f}'.format(r2_score(test_dataset['Minimum temperature'].values, svr_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(r2_score(test_dataset['Maximum Temperature'].values, svr_max.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Minimum temperature'].values, svr_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Maximum Temperature'].values, svr_max.predict(test_data_StandardScaler[0:61])))])

print(rt)

+---------------+----------------------------+----------------------------+-----------------------------+-----------------------------+
| Algorithm     | r2 for minimum temperature | r2 for maximum temperature | MSE for minimum temperature | MSE for maximum temperature |
+---------------+----------------------------+----------------------------+-----------------------------+-----------------------------+
| MLP           |           -3.36            |           -18.04           |            40.84            |            543.21           |
| Decision Tree |           -1.20            |           -3.11            |            20.65            |            117.31           |
| KNN           |           -1.16            |           -0.72            |            20.26            |            49.21            |
| Random forest |           -1.35            |           -4.35            |            22.05            |            152.69           |
| SVM           |           -0.42            |  

In [None]:
dataset_wt_weather_features.to_csv('dataset_with_public_holidays.csv')

# Daily global solar exposure

In [41]:
Y_train_sun = dataset_wt_weather_features['Daily global solar exposure (MJ/m*m)']
# clean
Y_train_sun = Y_train_sun.fillna(Y_train_sun.mean())

In [42]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train) 
X_train_StandardScaler

array([[-1.44730998e+00, -1.71507061e+00, -1.58631120e+00,
        -9.38637205e-04],
       [-1.44730998e+00, -1.70542213e+00, -1.58631120e+00,
         4.99354993e-01],
       [-1.44730998e+00, -1.69577366e+00, -1.58631120e+00,
         9.99648624e-01],
       ...,
       [ 1.53055614e+00,  1.19876862e+00,  1.06833203e+00,
        -9.38637205e-04],
       [ 1.53055614e+00,  1.20841709e+00,  1.06833203e+00,
         4.99354993e-01],
       [ 1.53055614e+00,  1.21806557e+00,  1.06833203e+00,
         9.99648624e-01]])

In [43]:
# test_date
# test_data_StandardScaler = scaler.fit_transform(test_date) 
test_data_StandardScaler

array([[-0.71287918,  0.51194302,  0.60852228,  1.47391105],
       [-0.71287918,  0.51852106,  0.60852228, -1.5063047 ],
       [-0.71287918,  0.5250991 ,  0.60852228, -1.00960208],
       [-0.71287918,  0.53167714,  0.60852228, -0.51289945],
       [-0.71287918,  0.53825517,  0.60852228, -0.01619682],
       [-0.71287918,  0.54483321,  0.60852228,  0.4805058 ],
       [-0.71287918,  0.55141125,  0.60852228,  0.97720843],
       [-0.71287918,  0.55798929,  0.60852228,  1.47391105],
       [-0.71287918,  0.56456733,  0.60852228, -1.5063047 ],
       [-0.71287918,  0.57114537,  0.60852228, -1.00960208],
       [-0.71287918,  0.57772341,  0.60852228, -0.51289945],
       [-0.71287918,  0.58430145,  0.60852228, -0.01619682],
       [-0.71287918,  0.59087948,  0.60852228,  0.4805058 ],
       [-0.71287918,  0.59745752,  0.60852228,  0.97720843],
       [-0.71287918,  0.60403556,  0.60852228,  1.47391105],
       [-0.71287918,  0.6106136 ,  0.60852228, -1.5063047 ],
       [-0.71287918,  0.

In [44]:
# SVR
scores = []
kernel = []
gamma = []
C = []
degree = []

kernels = ['linear', 'rbf']
gammas = [0.1, 1, 10, 100]
Cs = [0.1, 1, 10, 100]
degrees = [0, 1, 2, 3, 4, 5, 6]


for i in kernels:
  for j in gammas:
    for k in Cs:
        for l in degrees: 
          print(i,j,k,l)
          svm = SVR(kernel=i, gamma=j, C=k, degree=l)
          svm.fit(X_train_StandardScaler, Y_train_sun)
          scores.append(mean_squared_error(test_dataset['Solar Exposure'].values, svm.predict(test_data_StandardScaler[0:61])))
          kernel.append(i)
          gamma.append(j)
          C.append(k)
          degree.append(l)

linear 0.1 0.1 0
linear 0.1 0.1 1
linear 0.1 0.1 2
linear 0.1 0.1 3
linear 0.1 0.1 4
linear 0.1 0.1 5
linear 0.1 0.1 6
linear 0.1 1 0
linear 0.1 1 1
linear 0.1 1 2
linear 0.1 1 3
linear 0.1 1 4
linear 0.1 1 5
linear 0.1 1 6
linear 0.1 10 0
linear 0.1 10 1
linear 0.1 10 2
linear 0.1 10 3
linear 0.1 10 4
linear 0.1 10 5
linear 0.1 10 6
linear 0.1 100 0
linear 0.1 100 1
linear 0.1 100 2
linear 0.1 100 3
linear 0.1 100 4
linear 0.1 100 5
linear 0.1 100 6
linear 1 0.1 0
linear 1 0.1 1
linear 1 0.1 2
linear 1 0.1 3
linear 1 0.1 4
linear 1 0.1 5
linear 1 0.1 6
linear 1 1 0
linear 1 1 1
linear 1 1 2
linear 1 1 3
linear 1 1 4
linear 1 1 5
linear 1 1 6
linear 1 10 0
linear 1 10 1
linear 1 10 2
linear 1 10 3
linear 1 10 4
linear 1 10 5
linear 1 10 6
linear 1 100 0
linear 1 100 1
linear 1 100 2
linear 1 100 3
linear 1 100 4
linear 1 100 5
linear 1 100 6
linear 10 0.1 0
linear 10 0.1 1
linear 10 0.1 2
linear 10 0.1 3
linear 10 0.1 4
linear 10 0.1 5
linear 10 0.1 6
linear 10 1 0
linear 10 1 1
linear

In [45]:
min(scores)
import numpy as np
index = np.argmin(scores)
index

217

In [46]:
p = [kernel[217], gamma[217], C[217], degree[217]]
p

['rbf', 100, 100, 0]

In [47]:
# KNN 
scores = []
neighbor = []
leaf_size = []
metric = []
weight = []
algorithm = []

n_neighbors = [2,4,5,10,25]
leaf_sizes = [10,20,30,50,100,500]
metrics = ['euclidean','chebyshev','manhattan']
weights = ['uniform', 'distance']
algorithms = ['auto', 'ball_tree', 'kd_tree', 'brute'] 

for i in n_neighbors:
  for j in leaf_sizes:
    for k in metrics:
      for l in weights:
        for m in algorithms:
          print(i,j,k,l)
          knn = KNeighborsRegressor(n_neighbors = i, leaf_size = j, metric = k, weights = l, algorithm = m)
          knn.fit(X_train_StandardScaler, Y_train_sun)
          scores.append(mean_squared_error(test_dataset['Solar Exposure'].values, knn.predict(test_data_StandardScaler[0:61])))
          neighbor.append(i)
          leaf_size.append(j)
          metric.append(k)
          weight.append(l)
          algorithm.append(m)

2 10 euclidean uniform
2 10 euclidean uniform
2 10 euclidean uniform
2 10 euclidean uniform
2 10 euclidean distance
2 10 euclidean distance
2 10 euclidean distance
2 10 euclidean distance
2 10 chebyshev uniform
2 10 chebyshev uniform
2 10 chebyshev uniform
2 10 chebyshev uniform
2 10 chebyshev distance
2 10 chebyshev distance
2 10 chebyshev distance
2 10 chebyshev distance
2 10 manhattan uniform
2 10 manhattan uniform
2 10 manhattan uniform
2 10 manhattan uniform
2 10 manhattan distance
2 10 manhattan distance
2 10 manhattan distance
2 10 manhattan distance
2 20 euclidean uniform
2 20 euclidean uniform
2 20 euclidean uniform
2 20 euclidean uniform
2 20 euclidean distance
2 20 euclidean distance
2 20 euclidean distance
2 20 euclidean distance
2 20 chebyshev uniform
2 20 chebyshev uniform
2 20 chebyshev uniform
2 20 chebyshev uniform
2 20 chebyshev distance
2 20 chebyshev distance
2 20 chebyshev distance
2 20 chebyshev distance
2 20 manhattan uniform
2 20 manhattan uniform
2 20 manhattan

In [48]:
min(scores)
import numpy as np
index = np.argmin(scores)
index

432

In [49]:
p = [neighbor[432], leaf_size[432], metric[432], weight[432], algorithm[432]]
p

[10, 10, 'euclidean', 'uniform', 'auto']

In [51]:
# Decision Tree
scores = []
splitter = []
criterion =  []
max_depth = []
max_feature = []
min_samples_split = []

splitters =['best','random']
criterions = ['mse', 'friedman_mse', 'mae']
max_depths = [1,2,3,4,5,10,20,30,40]
max_features =['auto','sqrt','log2'] 
min_samples_splits = [2,3,4,5,10,20,30,40]

for i in splitters:
  for j in criterions:
    for k in max_depths:
      for l in max_features:
        for m in min_samples_splits:
            print(i,j,k,l,m)
            tree = DecisionTreeRegressor(splitter = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m)
            tree.fit(X_train_StandardScaler, Y_train_sun)
            scores.append(mean_squared_error(test_dataset['Solar Exposure'].values, tree.predict(test_data_StandardScaler[0:61])))

            splitter.append(i)
            criterion.append(j)
            max_depth.append(k)
            max_feature.append(l)
            min_samples_split.append(m)

edman_mse 40 sqrt 4
best friedman_mse 40 sqrt 5
best friedman_mse 40 sqrt 10
best friedman_mse 40 sqrt 20
best friedman_mse 40 sqrt 30
best friedman_mse 40 sqrt 40
best friedman_mse 40 log2 2
best friedman_mse 40 log2 3
best friedman_mse 40 log2 4
best friedman_mse 40 log2 5
best friedman_mse 40 log2 10
best friedman_mse 40 log2 20
best friedman_mse 40 log2 30
best friedman_mse 40 log2 40
best mae 1 auto 2
best mae 1 auto 3
best mae 1 auto 4
best mae 1 auto 5
best mae 1 auto 10
best mae 1 auto 20
best mae 1 auto 30
best mae 1 auto 40
best mae 1 sqrt 2
best mae 1 sqrt 3
best mae 1 sqrt 4
best mae 1 sqrt 5
best mae 1 sqrt 10
best mae 1 sqrt 20
best mae 1 sqrt 30
best mae 1 sqrt 40
best mae 1 log2 2
best mae 1 log2 3
best mae 1 log2 4
best mae 1 log2 5
best mae 1 log2 10
best mae 1 log2 20
best mae 1 log2 30
best mae 1 log2 40
best mae 2 auto 2
best mae 2 auto 3
best mae 2 auto 4
best mae 2 auto 5
best mae 2 auto 10
best mae 2 auto 20
best mae 2 auto 30
best mae 2 auto 40
best mae 2 sqrt 

In [52]:
min(scores)
import numpy as np
index = np.argmin(scores)
index

655

In [53]:
p = [splitter[874], criterion[874], max_depth[874], max_feature[874], min_samples_split[874]]
p

['random', 'friedman_mse', 1, 'sqrt', 4]

In [54]:
# Random forest
scores = []
n_estimator = []
criterion =  []
max_depth = []
max_feature = []
min_samples_split = []


criterions = ['mse', 'friedman_mse']
max_depths = [10,20,30,40]
max_features =['auto','sqrt','log2'] 
min_samples_splits = [10,20,30,40]
n_estimators =[10,50,100]


for i in n_estimators:
  for j in criterions:
    for k in max_depths:
      for l in max_features:
        for m in min_samples_splits:
            print(i,j,k,l,m)
            rf = RandomForestRegressor(n_estimators = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m)
            rf.fit(X_train_StandardScaler, Y_train_sun)
            scores.append(mean_squared_error(test_dataset['Solar Exposure'].values, rf.predict(test_data_StandardScaler[0:61])))

            n_estimator.append(i)
            criterion.append(j)
            max_depth.append(k)
            max_feature.append(l)
            min_samples_split.append(m)

10 mse 10 auto 10
10 mse 10 auto 20
10 mse 10 auto 30
10 mse 10 auto 40
10 mse 10 sqrt 10
10 mse 10 sqrt 20
10 mse 10 sqrt 30
10 mse 10 sqrt 40
10 mse 10 log2 10
10 mse 10 log2 20
10 mse 10 log2 30
10 mse 10 log2 40
10 mse 20 auto 10
10 mse 20 auto 20
10 mse 20 auto 30
10 mse 20 auto 40
10 mse 20 sqrt 10
10 mse 20 sqrt 20
10 mse 20 sqrt 30
10 mse 20 sqrt 40
10 mse 20 log2 10
10 mse 20 log2 20
10 mse 20 log2 30
10 mse 20 log2 40
10 mse 30 auto 10
10 mse 30 auto 20
10 mse 30 auto 30
10 mse 30 auto 40
10 mse 30 sqrt 10
10 mse 30 sqrt 20
10 mse 30 sqrt 30
10 mse 30 sqrt 40
10 mse 30 log2 10
10 mse 30 log2 20
10 mse 30 log2 30
10 mse 30 log2 40
10 mse 40 auto 10
10 mse 40 auto 20
10 mse 40 auto 30
10 mse 40 auto 40
10 mse 40 sqrt 10
10 mse 40 sqrt 20
10 mse 40 sqrt 30
10 mse 40 sqrt 40
10 mse 40 log2 10
10 mse 40 log2 20
10 mse 40 log2 30
10 mse 40 log2 40
10 friedman_mse 10 auto 10
10 friedman_mse 10 auto 20
10 friedman_mse 10 auto 30
10 friedman_mse 10 auto 40
10 friedman_mse 10 sqrt 10
1

In [61]:
min(scores)
import numpy as np
index = np.argmin(scores)
index

11

In [62]:
p = [n_estimator[index], criterion[index], max_depth[index], max_feature[index], min_samples_split[index]]
p

[10, 'mse', 10, 'log2', 40]

In [63]:
# MLP

In [64]:
#  ML models
mlp_reg_sun = MLPRegressor(random_state=42)
dct_sun = DecisionTreeRegressor(splitter = 'random', criterion = 'friedman_mse', max_depth = 1, max_features = 'sqrt', min_samples_split = 4)
knn_sun = KNeighborsRegressor(n_neighbors = 10, leaf_size = 10, metric = 'euclidean', weights = 'uniform', algorithm = 'auto')
rf_sun = RandomForestRegressor(n_estimators = 10, criterion = 'mse', max_depth = 10, max_features = 'log2', min_samples_split = 40)
svm_sun = SVR(kernel = 'rbf', gamma = 100, C = 100, degree = 0) #Using Radial-basis function kernel


In [65]:
mlp_reg_sun.fit(X_train_StandardScaler, Y_train_sun)
dct_sun.fit(X_train_StandardScaler, Y_train_sun)
knn_sun.fit(X_train_StandardScaler, Y_train_sun)
rf_sun.fit(X_train_StandardScaler, Y_train_sun)
svm_sun.fit(X_train_StandardScaler, Y_train_sun)

SVR(C=100, degree=0, gamma=100)

In [71]:
import pickle
# save the model to disk
filename = 'Model/finalized_svr_model_for_Solar_exposure'
pickle.dump(svm_sun, open(filename, 'wb'))
pickle.dump(scaler, open('Model/scaler', 'wb'))

In [67]:
y_pred_sun_mlp = mlp_reg_sun.predict(test_data_StandardScaler)
y_pred_sun_dt = dct_sun.predict(test_data_StandardScaler)
y_pred_sun_knn = knn_sun.predict(test_data_StandardScaler)
y_pred_sun_rf = rf_sun.predict(test_data_StandardScaler)
y_pred_sun_svm = svm_sun.predict(test_data_StandardScaler)
y_pred_sun_svm

array([14.62870348, 14.69507108, 14.85034217, 14.73607821, 14.72058458,
       14.5140782 , 14.71754881, 14.59702633, 14.67865717, 14.79044037,
       14.78571527, 14.68294277, 14.69095409, 14.77097901, 14.69068854,
       14.82133097, 14.67050163, 14.70238419, 14.70933398, 14.81070682,
       14.78483956, 14.79625479, 14.90499357, 14.6061133 , 14.64079831,
       14.79006547, 14.78921833, 14.74837717, 14.83688028, 14.83591302,
       14.04229734, 15.05581268, 14.78822008, 13.79951688, 14.09773424,
       14.36060057, 13.86110505, 13.30055777, 14.63362959, 13.91729984,
       14.38769534, 15.06200646, 13.74470897, 14.69575331, 14.63832991,
       14.04899821, 13.78182523, 14.90533309, 15.53173522, 14.11054206,
       15.08426944, 15.49088818, 14.36110172, 14.50999629, 15.04522825,
       15.09605409, 15.31002026, 14.57787237, 14.54658819, 15.40942774,
       14.98770937, 14.98288529, 13.48720361, 15.1299186 , 15.65894359,
       15.3141106 , 15.31391179, 14.96124783, 15.16072096, 13.51

In [68]:
from sklearn.metrics import r2_score
from prettytable import PrettyTable

rt = PrettyTable(["Algorithm", "r2", "MSE"])
rt.align["Algorithm"] = "l" # Left align city names
rt.padding_width = 1 # One space between column edges and contents (default)
rt.add_row(["MLP",
            '{:.2f}'.format(r2_score(test_dataset['Solar Exposure'].values, y_pred_sun_mlp[0:61])),
            '{:.2f}'.format(mean_squared_error(test_dataset['Solar Exposure'].values, y_pred_sun_mlp[0:61]))])

rt.add_row(["Decision Tree",
            '{:.2f}'.format(r2_score(test_dataset['Solar Exposure'].values, y_pred_sun_dt[0:61])),
            '{:.2f}'.format(mean_squared_error(test_dataset['Solar Exposure'].values, y_pred_sun_dt[0:61]))])

rt.add_row(["KNN",
            '{:.2f}'.format(r2_score(test_dataset['Solar Exposure'].values, y_pred_sun_knn[0:61])),
            '{:.2f}'.format(mean_squared_error(test_dataset['Solar Exposure'].values, y_pred_sun_knn[0:61]))])

rt.add_row(["Random Forest",
            '{:.2f}'.format(r2_score(test_dataset['Solar Exposure'].values, y_pred_sun_rf[0:61])),
            '{:.2f}'.format(mean_squared_error(test_dataset['Solar Exposure'].values, y_pred_sun_rf[0:61]))])

rt.add_row(["SVR",
            '{:.2f}'.format(r2_score(test_dataset['Solar Exposure'].values, y_pred_sun_svm[0:61])),
            '{:.2f}'.format(mean_squared_error(test_dataset['Solar Exposure'].values, y_pred_sun_svm[0:61]))])

print(rt)

+---------------+-------+--------+
| Algorithm     |   r2  |  MSE   |
+---------------+-------+--------+
| MLP           | -1.37 | 163.58 |
| Decision Tree | -1.34 | 161.63 |
| KNN           | -1.51 | 173.31 |
| Random Forest | -1.51 | 173.33 |
| SVR           | -0.94 | 133.86 |
+---------------+-------+--------+


#Predicting Daily Rainfall

*In the subsequent section, I will be using the 5 regression models(KNN, Decision Tree, Multilayer Perceptron, SVR and Random Forest Regressor) to predict the Daily Rainfall from November 2020 - January 2021. The predictions would be compared with the actual data for daily rainfall(for November 2020 and December 2020). The model having the lowest MAE(Mean absolute error would be determined as the best fit for predicting the daily rainfall).*

In [151]:
#Rainfall data
y_train_rainfall = dataset_wt_weather_features['Rainfall amount (millimetres)']

In [152]:
#Dealing with NA values
y_train_rainfall = y_train_rainfall.fillna(y_train_rainfall.mean())

##KNN Regressor

In [None]:
#KNN parameters
n_neighbors = [1,2,3,4,5,6,7,8,9,10]
leaf_size = [10,20,30,50,100,500]
metric = ['euclidean','chebyshev','manhattan']
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute'] 
iteration_knn = 1
total_knn = 1440

In [None]:
#KNN for rainfall
#declaring parameters for KNN algorithm to determine it performance on the test data
import time

scores_rainfall_knn = []
neighbor = []
leaf_size_ = []
metric_ = []
weight = []
algorithm_ = []


for i in n_neighbors:
  for j in leaf_size:
    for k in metric:
      for l in weights:
        for m in algorithm:
          start = time.process_time()
          knn_rainfall = KNeighborsRegressor(n_neighbors = i, leaf_size = j, metric = k, weights = l, algorithm = m)
          knn_rainfall.fit(X_train_StandardScaler, y_train_rainfall)
          scores_rainfall_knn.append(mean_squared_error(test_dataset['Rainfall '].values, knn_rainfall.predict(test_data_StandardScaler[0:61])))
          neighbor.append(i)
          leaf_size_.append(j)
          metric_.append(k)
          weight.append(l)
          algorithm_.append(m)

          print("Iteration no.", iteration_knn, "of", total_knn)
          print("Estimated time left", round((total_knn - iteration_knn)*round((time.process_time() - start),2),2), "seconds")
          iteration_knn = iteration_knn + 1

Iteration no. 1 of 1440
Estimated time left 0.0 seconds
Iteration no. 2 of 1440
Estimated time left 0.0 seconds
Iteration no. 3 of 1440
Estimated time left 0.0 seconds
Iteration no. 4 of 1440
Estimated time left 14.36 seconds
Iteration no. 5 of 1440
Estimated time left 14.35 seconds
Iteration no. 6 of 1440
Estimated time left 14.34 seconds
Iteration no. 7 of 1440
Estimated time left 14.33 seconds
Iteration no. 8 of 1440
Estimated time left 14.32 seconds
Iteration no. 9 of 1440
Estimated time left 14.31 seconds
Iteration no. 10 of 1440
Estimated time left 14.3 seconds
Iteration no. 11 of 1440
Estimated time left 14.29 seconds
Iteration no. 12 of 1440
Estimated time left 14.28 seconds
Iteration no. 13 of 1440
Estimated time left 14.27 seconds
Iteration no. 14 of 1440
Estimated time left 14.26 seconds
Iteration no. 15 of 1440
Estimated time left 14.25 seconds
Iteration no. 16 of 1440
Estimated time left 14.24 seconds
Iteration no. 17 of 1440
Estimated time left 14.23 seconds
Iteration no.

In [None]:
#checking the Minimum mean square error of the KNN algorithm used for daily rainfall
min(scores_rainfall_knn)

22.37080897260598

In [None]:
#getting the index of the minimun mean square error of the KNN algorithm from the list
index_rainfall = np.argmin(scores_rainfall_knn)
index_rainfall

1168

In [None]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
min_rainfall_knn_parameters = [neighbor[1168], leaf_size_[1168], metric_[1168], weight[1168], algorithm_[1168]]
min_rainfall_knn_parameters

[9, 10, 'manhattan', 'uniform', 'auto']

In [None]:
#Creating the KNN regressor with the parameters with least error 
knn_rainfall_ = KNeighborsRegressor(algorithm = 'auto', leaf_size = 10, metric = 'manhattan', n_neighbors = 9, weights = 'uniform')
knn_rainfall_.fit(X_train_StandardScaler, y_train_rainfall)

KNeighborsRegressor(algorithm='auto', leaf_size=10, metric='manhattan',
                    metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                    weights='uniform')

In [None]:
#printing the test data side by side with the predicted data for daily rainfall
print(np.concatenate((test_dataset['Rainfall '].values.reshape(len(test_dataset['Rainfall ']),1), knn_rainfall_.predict(test_data_StandardScaler)[0:61].reshape(len(knn_rainfall_.predict(test_data_StandardScaler)[0:61]),1)),1))

[[ 0.          0.88888889]
 [ 0.          1.46666667]
 [ 0.          4.37777778]
 [ 0.          3.13333333]
 [ 2.8         2.66666667]
 [ 0.2         1.93333333]
 [ 1.6         3.17777778]
 [ 0.          1.68254865]
 [ 0.          1.42222222]
 [ 0.          4.13333333]
 [ 0.          3.24444444]
 [ 4.          2.66666667]
 [ 0.          1.93333333]
 [ 0.2         3.17777778]
 [ 0.          1.68254865]
 [ 0.          1.46666667]
 [ 0.          5.86666667]
 [ 0.          3.26666667]
 [ 0.          1.93333333]
 [ 0.          2.13333333]
 [ 0.          3.51111111]
 [ 0.          1.53333333]
 [30.2         1.46666667]
 [ 6.          5.86666667]
 [ 0.          3.26666667]
 [ 0.          1.93333333]
 [ 0.          2.13333333]
 [ 0.          3.35555556]
 [ 2.8         1.48888889]
 [ 1.          1.46666667]
 [ 0.          4.86666667]
 [ 0.2         3.86666667]
 [ 0.          1.53333333]
 [ 0.          2.57777778]
 [ 0.          4.41111111]
 [ 1.4         2.48888889]
 [ 2.6         1.86666667]
 

##Multi-layer perceptron regressor

In [None]:
#Parameters for the MLP regressor 
hidden_layer_sizes = []
for i in range(1,21):
  hidden_layer_sizes.append(((i,i,i)))

activation = ['tanh', 'relu', 'logistic', 'identity']
solver = ['lbfgs', 'sgd', 'adam']
alpha = [0.0001, 0.05, 0.01, .001]
learning_rate = ['constant','adaptive', 'invscaling']
iteration_MLP = 1
total_MLP = 2880

In [None]:
#The hidden layer sizes for MLP
hidden_layer_sizes

[(1, 1, 1),
 (2, 2, 2),
 (3, 3, 3),
 (4, 4, 4),
 (5, 5, 5),
 (6, 6, 6),
 (7, 7, 7),
 (8, 8, 8),
 (9, 9, 9),
 (10, 10, 10),
 (11, 11, 11),
 (12, 12, 12),
 (13, 13, 13),
 (14, 14, 14),
 (15, 15, 15),
 (16, 16, 16),
 (17, 17, 17),
 (18, 18, 18),
 (19, 19, 19),
 (20, 20, 20)]

In [None]:
#MLP for Rainfall

#declaring parameters for MLP algorithm to determine it performance on the test data
scores_mlp_rainfall = []
hidden_layer_sizes_ = []
solver_ = []
activation_ = []
alpha_ = []
learning_rate_ = []
time_left = []

from statistics import mean

for i in hidden_layer_sizes:
  for j in activation:
    for k in solver:
      for l in alpha:
        for m in learning_rate:

          start = time.process_time()


          mlp_rainfall = MLPRegressor(hidden_layer_sizes = i, activation = j, solver = k, alpha = l, learning_rate = m)
          mlp_rainfall.fit(X_train_StandardScaler, y_train_rainfall)

          scores_mlp_rainfall.append(mean_squared_error(test_dataset['Rainfall '].values, mlp_rainfall.predict(test_data_StandardScaler[0:61])))
          

          hidden_layer_sizes_.append(i)
          activation_.append(j)
          solver_.append(k)
          alpha_.append(l)
          learning_rate_.append(m)
          time_left.append(time.process_time() - start)

          print("Iteration no.", iteration_MLP, "of", total_MLP)
          print("Estimated time left", round((round((total_MLP - iteration_MLP)*mean(time_left),2)/60),2), "minutes")
          iteration_MLP = iteration_MLP + 1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration no. 381 of 2880
Estimated time left 21.1 minutes
Iteration no. 382 of 2880
Estimated time left 21.06 minutes
Iteration no. 383 of 2880
Estimated time left 21.04 minutes
Iteration no. 384 of 2880
Estimated time left 21.07 minutes
Iteration no. 385 of 2880
Estimated time left 21.07 minutes
Iteration no. 386 of 2880
Estimated time left 21.08 minutes
Iteration no. 387 of 2880
Estimated time left 21.07 minutes
Iteration no. 388 of 2880
Estimated time left 21.05 minutes
Iteration no. 389 of 2880
Estimated time left 21.05 minutes
Iteration no. 390 of 2880
Estimated time left 21.05 minutes
Iteration no. 391 of 2880
Estimated time left 21.06 minutes
Iteration no. 392 of 2880
Estimated time left 21.1 minutes
Iteration no. 393 of 2880
Estimated time left 21.09 minutes
Iteration no. 394 of 2880
Estimated time left 21.09 minutes
Iteration no. 395 of 2880
Estimated time left 21.07 minutes
Iteration no. 396 of 2880
Estimated t

In [None]:
#checking the Minimum mean square error of the MLP algorithm used for daily rainfall
min(scores_mlp_rainfall)

18.189981133765087

In [None]:
#getting the index of the minimum mean square error of the MLP algorithm from the list
index_rainfall = np.argmin(scores_mlp_rainfall)
index_rainfall

623

In [None]:
#getting the index of the minimun mean square error of the MLP algorithm from the list
min_rainfall_mlp_parameters = [hidden_layer_sizes_[1153], activation_[1153], solver_[1153], alpha_[1153], learning_rate_[1153]]
min_rainfall_mlp_parameters

[(9, 9, 9), 'tanh', 'lbfgs', 0.0001, 'adaptive']

In [None]:
#Creating the MLP regressor with the parameters with least error 
mlp_min_rainfall = MLPRegressor(hidden_layer_sizes = (9,9,9), activation = 'tanh', solver = 'lbfgs' , alpha = 0.0001, learning_rate = 'adaptive')
mlp_min_rainfall.fit(X_train_StandardScaler, y_train_rainfall)

MLPRegressor(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(9, 9, 9), learning_rate='adaptive',
             learning_rate_init=0.001, max_fun=15000, max_iter=200,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='lbfgs',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [None]:
#printing the test data side by side with the predicted data for daily rainfall
print(np.concatenate((test_dataset['Rainfall '].values.reshape(len(test_dataset['Rainfall ']),1), mlp_min_rainfall.predict(test_data_StandardScaler)[0:61].reshape(len(mlp_min_rainfall.predict(test_data_StandardScaler)[0:61]),1)),1))

[[ 0.          1.19318876]
 [ 0.          0.92340708]
 [ 0.          1.38769256]
 [ 0.          1.07178388]
 [ 2.8         1.57487852]
 [ 0.2         1.79118886]
 [ 1.6         2.1143853 ]
 [ 0.          1.19464464]
 [ 0.          0.93127278]
 [ 0.          1.38704391]
 [ 0.          1.14049773]
 [ 4.          1.62454745]
 [ 0.          1.79125767]
 [ 0.2         1.9927287 ]
 [ 0.          1.19631626]
 [ 0.          0.94939965]
 [ 0.          1.38638057]
 [ 0.          2.06807223]
 [ 0.          1.63167915]
 [ 0.          1.79133148]
 [ 0.          1.91420394]
 [ 0.          1.19823728]
 [30.2         0.99428849]
 [ 6.          1.3857213 ]
 [ 0.         16.25905215]
 [ 0.          1.59626845]
 [ 0.          1.79141144]
 [ 0.          1.86471727]
 [ 2.8         1.20044738]
 [ 1.          1.10653464]
 [ 0.          1.3862217 ]
 [ 0.2         1.13634144]
 [ 0.          1.13347966]
 [ 0.          1.79114753]
 [ 0.          1.78876505]
 [ 1.4         1.25196807]
 [ 2.6         1.38533618]
 

##Decision Tree Regressor

In [148]:
splitter=['best','random']
criterion = ['mse', 'friedman_mse', 'mae']
max_depth = [100,200,None]
max_features =['auto','sqrt','log2', None] 
min_samples_split= [2,3,4,5,10]
random_state = []
for i in range(1,101):
  random_state.append(i)

dct_iterations = 1
total_dct = 36000

In [154]:
#DT for rainfall
import time
from statistics import mean
#declaring parameters for DT algorithm to determine it performance on the test data
scores_dt_rainfall = []
splitter_ = []
criterion_ =  []
max_depth_ = []
max_features_ = []
min_samples_split_ = []
random_state_ = []
time_left = []


for i in splitter:
  for j in criterion:
    for k in max_depth:
      for l in max_features:
        for m in min_samples_split:
          for n in random_state:
            start = time.process_time()
            
            dt_rainfall = DecisionTreeRegressor(splitter = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m,random_state = n)
            dt_rainfall.fit(X_train_StandardScaler, y_train_rainfall)
            scores_dt_rainfall.append(mean_squared_error(test_dataset['Rainfall '].values, dt_rainfall.predict(test_data_StandardScaler[0:61])))

            splitter_.append(i)
            criterion_.append(j)
            max_depth_.append(k)
            max_features_.append(l)
            min_samples_split_.append(m)
            random_state_.append(n)

            time_left.append(time.process_time() - start)
            print("Iteration no.", dct_iterations, "of", total_dct)
            print("Estimated time left", round((round((total_dct - dct_iterations)*mean(time_left),2)/60),2), "minutes")
            dct_iterations = dct_iterations + 1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration no. 33501 of 36000
Estimated time left 0.36 minutes
Iteration no. 33502 of 36000
Estimated time left 0.36 minutes
Iteration no. 33503 of 36000
Estimated time left 0.36 minutes
Iteration no. 33504 of 36000
Estimated time left 0.36 minutes
Iteration no. 33505 of 36000
Estimated time left 0.36 minutes
Iteration no. 33506 of 36000
Estimated time left 0.36 minutes
Iteration no. 33507 of 36000
Estimated time left 0.36 minutes
Iteration no. 33508 of 36000
Estimated time left 0.36 minutes
Iteration no. 33509 of 36000
Estimated time left 0.36 minutes
Iteration no. 33510 of 36000
Estimated time left 0.36 minutes
Iteration no. 33511 of 36000
Estimated time left 0.36 minutes
Iteration no. 33512 of 36000
Estimated time left 0.36 minutes
Iteration no. 33513 of 36000
Estimated time left 0.36 minutes
Iteration no. 33514 of 36000
Estimated time left 0.36 minutes
Iteration no. 33515 of 36000
Estimated time left 0.36 minutes
Itera

In [155]:
#checking the Minimum mean square error of the Decision Tree algorithm used for daily rainfall
min(scores_dt_rainfall)

16.199732847601698

In [156]:
#getting the index of the minimum mean square error of the Decision Tree algorithm from the list
index_rainfall = np.argmin(scores_dt_rainfall)
index_rainfall

18966

In [157]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
min_rainfall_dct_parameters = [splitter_[index_rainfall], criterion_[index_rainfall], max_depth_[index_rainfall], max_features_[index_rainfall], min_samples_split_[index_rainfall],random_state_[index_rainfall]]
min_rainfall_dct_parameters

['random', 'mse', 100, 'sqrt', 10, 67]

In [174]:
#Creating the Decision Tree regressor with the parameters with least error 
dt_rainfall_ = DecisionTreeRegressor(splitter = 'random', criterion = 'mse', max_depth = 100, max_features = 'sqrt', min_samples_split = 3, random_state = 67)
dt_rainfall_.fit(X_train_StandardScaler, y_train_rainfall)




DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=100,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=3,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=67, splitter='random')

In [175]:
#printing the test data side by side with the predicted data for daily rainfall
print(np.concatenate((test_dataset['Rainfall '].values.reshape(len(test_dataset['Rainfall ']),1), dt_rainfall_.predict(test_data_StandardScaler)[0:61].reshape(len(dt_rainfall_.predict(test_data_StandardScaler)[0:61]),1)),1))

[[ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 2.8  0. ]
 [ 0.2  0. ]
 [ 1.6  0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 4.   0. ]
 [ 0.   0. ]
 [ 0.2  0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [30.2  0. ]
 [ 6.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 2.8  0. ]
 [ 1.   0. ]
 [ 0.   0. ]
 [ 0.2  0.2]
 [ 0.   1.2]
 [ 0.   0.4]
 [ 0.   0. ]
 [ 1.4  0. ]
 [ 2.6  0. ]
 [ 6.   0. ]
 [ 0.   0.2]
 [ 0.6  1.2]
 [ 0.   0.4]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0.2]
 [ 0.   1.2]
 [ 7.6  0.4]
 [ 0.2  0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [13.2  0. ]
 [ 6.   0.2]
 [ 1.   1.2]
 [ 0.   0.4]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 3.2  0. ]
 [ 0.   0. ]
 [ 0.   2. ]
 [ 0.   1.5]]


In [176]:
mean_squared_error(test_dataset['Rainfall '].values, dt_rainfall_.predict(test_data_StandardScaler[0:61]))

21.445409836065572

##Support Vector Regressor

In [None]:
kernel =['linear', 'poly', 'rbf']
gamma = ['auto', 'scale']
degree = [1,2,3,4,5,10]
C = [0.05,0.5,1.0,2.0,3.0,4.0,10]
total_svm = 252
iter_svm = 1

In [None]:
#SVR for Daily rainfall

#declaring parameters for SVR algorithm to determine it performance on the test data
import time
from statistics import mean
scores_svr_rainfall = []
kernel_ = []
gamma_ =  []
degree_ = []
C_ = []
time_left = []


for i in kernel:
  for j in gamma:
    for k in degree:
      for l in C:
            start = time.process_time()
            svr_rainfall = SVR(kernel = i, gamma = j, degree = k, C = l )
            svr_rainfall.fit(X_train_StandardScaler, y_train_rainfall)
            scores_svr_rainfall.append(mean_squared_error(test_dataset['Rainfall '].values, svr_rainfall.predict(test_data_StandardScaler[0:61])))
            
            kernel_.append(i)
            gamma_.append(j)
            degree_.append(k)
            C_.append(l)

            
            time_left.append(time.process_time() - start)
            print("Iteration no.", iter_svm, "of", total_svm)
            print("Estimated time left", round((round((total_svm - iter_svm)*mean(time_left),2)/60),2), "minutes")
            iter_svm = iter_svm + 1

Iteration no. 1 of 252
Estimated time left 0.74 minutes
Iteration no. 2 of 252
Estimated time left 0.73 minutes
Iteration no. 3 of 252
Estimated time left 0.74 minutes
Iteration no. 4 of 252
Estimated time left 0.77 minutes
Iteration no. 5 of 252
Estimated time left 0.81 minutes
Iteration no. 6 of 252
Estimated time left 0.85 minutes
Iteration no. 7 of 252
Estimated time left 0.94 minutes
Iteration no. 8 of 252
Estimated time left 0.9 minutes
Iteration no. 9 of 252
Estimated time left 0.87 minutes
Iteration no. 10 of 252
Estimated time left 0.86 minutes
Iteration no. 11 of 252
Estimated time left 0.85 minutes
Iteration no. 12 of 252
Estimated time left 0.86 minutes
Iteration no. 13 of 252
Estimated time left 0.87 minutes
Iteration no. 14 of 252
Estimated time left 0.91 minutes
Iteration no. 15 of 252
Estimated time left 0.88 minutes
Iteration no. 16 of 252
Estimated time left 0.86 minutes
Iteration no. 17 of 252
Estimated time left 0.85 minutes
Iteration no. 18 of 252
Estimated time le

In [None]:
min(scores_svr_rainfall)

20.93960024807976

In [None]:
#getting the index of the minimum mean square error of the SVM algorithm from the list
index_rainfall = np.argmin(scores_svr_rainfall)
index_rainfall

174

In [None]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
min_rainfall_svr_parameters = [kernel_[index_rainfall], gamma_[index_rainfall], degree_[index_rainfall], C_[index_rainfall]]
min_rainfall_svr_parameters

['rbf', 'auto', 1, 10]

In [None]:
svr_rainfall_ = SVR(kernel = 'rbf', gamma = 'auto', degree = 1, C = 10)
svr_rainfall_.fit(X_train_StandardScaler, y_train_rainfall)

SVR(C=10, cache_size=200, coef0=0.0, degree=1, epsilon=0.1, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [None]:
#printing the test data side by side with the predicted data for daily rainfall
print(np.concatenate((test_dataset['Rainfall '].values.reshape(len(test_dataset['Rainfall ']),1), svr_rainfall_.predict(test_data_StandardScaler)[0:61].reshape(len(svr_rainfall_.predict(test_data_StandardScaler)[0:61]),1)),1))

[[ 0.          0.25754582]
 [ 0.          0.18606707]
 [ 0.          0.1677954 ]
 [ 0.          0.16856531]
 [ 2.8         0.19922731]
 [ 0.2         0.25658142]
 [ 1.6         0.30024872]
 [ 0.          0.26502945]
 [ 0.          0.18233119]
 [ 0.          0.16435133]
 [ 0.          0.1668532 ]
 [ 4.          0.19978111]
 [ 0.          0.25915718]
 [ 0.2         0.30470071]
 [ 0.          0.27173694]
 [ 0.          0.17896649]
 [ 0.          0.16139806]
 [ 0.          0.16558218]
 [ 0.          0.20052407]
 [ 0.          0.26152465]
 [ 0.          0.30855557]
 [ 0.          0.27764845]
 [30.2         0.17597171]
 [ 6.          0.15892692]
 [ 0.          0.1647369 ]
 [ 0.          0.20144004]
 [ 0.          0.26367223]
 [ 0.          0.31180443]
 [ 2.8         0.28274895]
 [ 1.          0.17334232]
 [ 0.          0.16706643]
 [ 0.2         0.16208741]
 [ 0.          0.17447383]
 [ 0.          0.20776306]
 [ 0.          0.23847995]
 [ 1.4         0.21601484]
 [ 2.6         0.17961254]
 

##Random Forest Regressor

In [None]:
n_estimators = [10,20,30,40,50,100,200]
criterion = ['mae', 'mse']
max_depth = [50,100,200,None]
max_features = ['auto','sqrt','log2', None] 
random_state = []
for i in range(1,50):
  random_state.append(i)

rf_iteration = 1
total_rf = 10976

In [None]:
from sklearn.ensemble import RandomForestRegressor

#Random forest for rainfall

#declaring parameters for DT algorithm to determine it performance on the test data
scores_rf_rainfall = []
n_estimators_ = []
criterion_ =  []
max_depth_ = []
max_features_ = []
random_state_ = []
time_left = []


for i in n_estimators:
  for j in criterion:
    for k in max_depth:
      for l in max_features:
          for n in random_state:
            start = time.process_time()
            
            rf_rainfall = RandomForestRegressor(n_estimators = i, criterion = j, max_depth = k, max_features = l,random_state = n)
            rf_rainfall.fit(X_train_StandardScaler, y_train_rainfall)
            scores_rf_rainfall.append(mean_squared_error(test_dataset['Rainfall '].values, rf_rainfall.predict(test_data_StandardScaler[0:61])))

            n_estimators_.append(i)
            criterion_.append(j)
            max_depth_.append(k)
            max_features_.append(l)
            random_state_.append(n)

            time_left.append(time.process_time() - start)
            print("Iteration no.", rf_iteration, "of", total_rf)
            print("Estimated time left", round((round((total_rf - rf_iteration)*mean(time_left),2)/60),2), "minutes")
            rf_iteration = rf_iteration + 1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration no. 8477 of 10976
Estimated time left 18.37 minutes
Iteration no. 8478 of 10976
Estimated time left 18.36 minutes
Iteration no. 8479 of 10976
Estimated time left 18.36 minutes
Iteration no. 8480 of 10976
Estimated time left 18.36 minutes
Iteration no. 8481 of 10976
Estimated time left 18.35 minutes
Iteration no. 8482 of 10976
Estimated time left 18.35 minutes
Iteration no. 8483 of 10976
Estimated time left 18.35 minutes
Iteration no. 8484 of 10976
Estimated time left 18.34 minutes
Iteration no. 8485 of 10976
Estimated time left 18.34 minutes
Iteration no. 8486 of 10976
Estimated time left 18.34 minutes
Iteration no. 8487 of 10976
Estimated time left 18.34 minutes
Iteration no. 8488 of 10976
Estimated time left 18.33 minutes
Iteration no. 8489 of 10976
Estimated time left 18.33 minutes
Iteration no. 8490 of 10976
Estimated time left 18.33 minutes
Iteration no. 8491 of 10976
Estimated time left 18.32 minutes
Itera

In [None]:
min(scores_rf_rainfall)

20.079055737704913

In [None]:
#getting the index of the minimum mean square error of the Random Forest algorithm from the list
index_rainfall = np.argmin(scores_rf_rainfall)
index_rainfall

25

In [None]:
#Parameters with lowest MAE
min_rainfall_rf_parameters = [n_estimators_[index_rainfall], criterion_[index_rainfall], max_depth_[index_rainfall], max_features_[index_rainfall], random_state_[index_rainfall]]
min_rainfall_rf_parameters

[10, 'mse', 100, 'auto', 26]

In [None]:
rf_rainfall_ = RandomForestRegressor(n_estimators=10, criterion = 'mse', max_depth = 100, max_features = 'auto', random_state = 26)
rf_rainfall_.fit(X_train_StandardScaler, y_train_rainfall)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=100, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=26, verbose=0, warm_start=False)

In [None]:
#printing the test data side by side with the predicted data for daily rainfall
print(np.concatenate((test_dataset['Rainfall '].values.reshape(len(test_dataset['Rainfall ']),1), rf_rainfall_.predict(test_data_StandardScaler)[0:61].reshape(len(rf_rainfall_.predict(test_data_StandardScaler)[0:61]),1)),1))

[[ 0.          2.3       ]
 [ 0.          2.94      ]
 [ 0.          5.44      ]
 [ 0.          4.06      ]
 [ 2.8         2.8       ]
 [ 0.2         4.26      ]
 [ 1.6         6.16      ]
 [ 0.          2.39429379]
 [ 0.          2.36      ]
 [ 0.          2.8       ]
 [ 0.          2.5       ]
 [ 4.          1.68      ]
 [ 0.          3.92      ]
 [ 0.2         0.4       ]
 [ 0.          0.24      ]
 [ 0.          0.66      ]
 [ 0.          0.14      ]
 [ 0.          0.62      ]
 [ 0.          0.82      ]
 [ 0.          0.66      ]
 [ 0.          0.7       ]
 [ 0.          0.54      ]
 [30.2         0.62      ]
 [ 6.          0.42      ]
 [ 0.          0.48      ]
 [ 0.          0.3       ]
 [ 0.          0.36      ]
 [ 0.          0.28      ]
 [ 2.8         0.4       ]
 [ 1.          1.22      ]
 [ 0.          0.4       ]
 [ 0.2         0.38      ]
 [ 0.          3.94      ]
 [ 0.          5.7       ]
 [ 0.          7.1       ]
 [ 1.4         5.34      ]
 [ 2.6         3.88      ]
 

In [None]:
#Visualizing the metrics for the algorithms
from sklearn.metrics import r2_score
from prettytable import PrettyTable

rt = PrettyTable(["Algorithm", "r2", "MSE"])
rt.align["Algorithm"] = "l" # Left align city names
rt.padding_width = 1 # One space between column edges and contents (default)
rt.add_row(["MLP",
            '{:.2f}'.format(r2_score(test_dataset['Rainfall '].values, mlp_rainfall.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Rainfall '].values, mlp_rainfall.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["KNN",
            '{:.2f}'.format(r2_score(test_dataset['Rainfall '].values, knn_rainfall_.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Rainfall '].values,knn_rainfall_.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["Random Forest",
            '{:.2f}'.format(r2_score(test_dataset['Rainfall '].values, rf_rainfall_.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Rainfall '].values, rf_rainfall_.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["Decision Tree",
            '{:.2f}'.format(r2_score(test_dataset['Rainfall '].values, dt_rainfall_.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Rainfall '].values, dt_rainfall_.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["SVR",
            '{:.2f}'.format(r2_score(test_dataset['Rainfall '].values, svr_rainfall_.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Rainfall '].values, svr_rainfall_.predict(test_data_StandardScaler[0:61])))])

print(rt)

+---------------+-------+-------+
| Algorithm     |   r2  |  MSE  |
+---------------+-------+-------+
| MLP           | -0.02 | 19.61 |
| KNN           | -0.16 | 22.37 |
| Random Forest | -0.94 | 37.28 |
| Decision Tree |  0.03 | 18.71 |
| SVR           | -0.09 | 20.94 |
+---------------+-------+-------+
