In [1]:
#!/usr/bin/env python

# make sure to install these packages before running:
!pip install sodapy

Collecting sodapy
  Downloading https://files.pythonhosted.org/packages/9e/74/95fb7d45bbe7f1de43caac45d7dd4807ef1e15881564a00eef489a3bb5c6/sodapy-2.1.0-py2.py3-none-any.whl
Installing collected packages: sodapy
Successfully installed sodapy-2.1.0


In [2]:
#import the libraries 
import numpy as np
import pandas as pd
from pandas import DataFrame as df, Series as se
#import the library for the API
from sodapy import Socrata

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.melbourne.vic.gov.au", None)


#getting the pedestrian count data from city of melbourne website from 2009-05-01 to 
#2020-10-31
results = client.get("b2ak-trbp", limit=3391522)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)




In [4]:
results_df

Unnamed: 0,id,date_time,year,month,mdate,day,time,sensor_id,sensor_name,hourly_counts
0,2887628,2019-11-01T17:00:00.000,2019,November,1,Friday,17,34,Flinders St-Spark La,300
1,2887629,2019-11-01T17:00:00.000,2019,November,1,Friday,17,39,Alfred Place,604
2,2887630,2019-11-01T17:00:00.000,2019,November,1,Friday,17,37,Lygon St (East),216
3,2887631,2019-11-01T17:00:00.000,2019,November,1,Friday,17,40,Lonsdale St-Spring St (West),627
4,2887632,2019-11-01T17:00:00.000,2019,November,1,Friday,17,36,Queen St (West),774
...,...,...,...,...,...,...,...,...,...,...
3391517,3391518,2020-10-31T23:00:00.000,2020,October,31,Saturday,23,67,Flinders Ln -Degraves St (South),0
3391518,3391519,2020-10-31T23:00:00.000,2020,October,31,Saturday,23,68,Flinders Ln -Degraves St (North),0
3391519,3391520,2020-10-31T23:00:00.000,2020,October,31,Saturday,23,69,Flinders Ln -Degraves St (Crossing),0
3391520,3391521,2020-10-31T23:00:00.000,2020,October,31,Saturday,23,70,Errol Street (East),0


In [5]:
#Examining the data types of the features of the dataset  
results_df.dtypes

id               object
date_time        object
year             object
month            object
mdate            object
day              object
time             object
sensor_id        object
sensor_name      object
hourly_counts    object
dtype: object

In [6]:
results_df['date'] = results_df['mdate'] + '-' + results_df['month'] + '-' + results_df['year']
results_df.drop(columns=['id',	'date_time','year',	'month',	'mdate',	'day',	'time' ], inplace = True)

In [7]:
#converting 'sensor_id' and 'hourly_counts' features from object datatypes to int
results_df[['sensor_id','hourly_counts']] = results_df[['sensor_id','hourly_counts']].astype('int')
results_df

Unnamed: 0,sensor_id,sensor_name,hourly_counts,date
0,34,Flinders St-Spark La,300,1-November-2019
1,39,Alfred Place,604,1-November-2019
2,37,Lygon St (East),216,1-November-2019
3,40,Lonsdale St-Spring St (West),627,1-November-2019
4,36,Queen St (West),774,1-November-2019
...,...,...,...,...
3391517,67,Flinders Ln -Degraves St (South),0,31-October-2020
3391518,68,Flinders Ln -Degraves St (North),0,31-October-2020
3391519,69,Flinders Ln -Degraves St (Crossing),0,31-October-2020
3391520,70,Errol Street (East),0,31-October-2020


In [8]:
#Examining the features after data type conversion
results_df.dtypes

sensor_id         int64
sensor_name      object
hourly_counts     int64
date             object
dtype: object

In [9]:
#converting date_time feature from object datatypes to date
results_df['date'] = results_df['date'].astype('datetime64[ns]')
results_df

Unnamed: 0,sensor_id,sensor_name,hourly_counts,date
0,34,Flinders St-Spark La,300,2019-11-01
1,39,Alfred Place,604,2019-11-01
2,37,Lygon St (East),216,2019-11-01
3,40,Lonsdale St-Spring St (West),627,2019-11-01
4,36,Queen St (West),774,2019-11-01
...,...,...,...,...
3391517,67,Flinders Ln -Degraves St (South),0,2020-10-31
3391518,68,Flinders Ln -Degraves St (North),0,2020-10-31
3391519,69,Flinders Ln -Degraves St (Crossing),0,2020-10-31
3391520,70,Errol Street (East),0,2020-10-31


In [10]:
#Examining the features after data type conversion
results_df.dtypes

sensor_id                 int64
sensor_name              object
hourly_counts             int64
date             datetime64[ns]
dtype: object

In [11]:
#extracting data from 2015 till date
results_df = results_df[results_df['date'] > '2014-12-31']

In [12]:
results_df

Unnamed: 0,sensor_id,sensor_name,hourly_counts,date
0,34,Flinders St-Spark La,300,2019-11-01
1,39,Alfred Place,604,2019-11-01
2,37,Lygon St (East),216,2019-11-01
3,40,Lonsdale St-Spring St (West),627,2019-11-01
4,36,Queen St (West),774,2019-11-01
...,...,...,...,...
3391517,67,Flinders Ln -Degraves St (South),0,2020-10-31
3391518,68,Flinders Ln -Degraves St (North),0,2020-10-31
3391519,69,Flinders Ln -Degraves St (Crossing),0,2020-10-31
3391520,70,Errol Street (East),0,2020-10-31


In [13]:
#Aggregating hourly pedestrian count for each by sensor-id
new_results_df = pd.DataFrame(results_df.groupby(['date', 'sensor_id'])['hourly_counts'].sum())
new_results_df = new_results_df.reset_index()
new_results_df

Unnamed: 0,date,sensor_id,hourly_counts
0,2015-01-01,2,21217
1,2015-01-01,3,32695
2,2015-01-01,4,36958
3,2015-01-01,5,31224
4,2015-01-01,6,20457
...,...,...,...
98662,2020-10-31,67,4718
98663,2020-10-31,68,2447
98664,2020-10-31,69,3428
98665,2020-10-31,70,3035


In [14]:
#renaming the hourly_count column
new_results_df = new_results_df.rename(columns={"hourly_counts": "Total_Pedestrian_Count"})
new_results_df

Unnamed: 0,date,sensor_id,Total_Pedestrian_Count
0,2015-01-01,2,21217
1,2015-01-01,3,32695
2,2015-01-01,4,36958
3,2015-01-01,5,31224
4,2015-01-01,6,20457
...,...,...,...
98662,2020-10-31,67,4718
98663,2020-10-31,68,2447
98664,2020-10-31,69,3428
98665,2020-10-31,70,3035


In [15]:
#Aggregating hourly pedestrian count for each day
new_results_df_per_day = pd.DataFrame(new_results_df.groupby(['date'])['Total_Pedestrian_Count'].sum())
new_results_df_per_day = new_results_df_per_day.reset_index()
new_results_df_per_day = new_results_df_per_day.rename(columns={"Total_Pedestrian_Count": "Total_Pedestrian_Count_per_day"})
new_results_df_per_day

Unnamed: 0,date,Total_Pedestrian_Count_per_day
0,2015-01-01,471563
1,2015-01-02,389048
2,2015-01-03,291761
3,2015-01-04,382364
4,2015-01-05,487220
...,...,...
2126,2020-10-27,177267
2127,2020-10-28,311278
2128,2020-10-29,263334
2129,2020-10-30,313225


In [16]:
# Loading the weather dataset 
dataset = pd.read_csv('export_df.csv') #export_df_csv as created by Hung son on Trello card https://trello.com/c/7dcc09P9
#converting date_time feature from object datatypes to date
dataset['date'] = dataset['date'].astype('datetime64[ns]')

dataset.dtypes

date                                    datetime64[ns]
Rainfall amount (millimetres)                  float64
Minimum temperature (Degree C)                 float64
Maximum temperature (Degree C)                 float64
Daily global solar exposure (MJ/m*m)           float64
dtype: object

In [17]:
dataset.isna().sum()

date                                    0
Rainfall amount (millimetres)           8
Minimum temperature (Degree C)          0
Maximum temperature (Degree C)          0
Daily global solar exposure (MJ/m*m)    1
dtype: int64

In [18]:
dataset_wt_weather_features = dataset.merge(new_results_df_per_day, left_on='date', right_on='date')

dataset_wt_weather_features

Unnamed: 0,date,Rainfall amount (millimetres),Minimum temperature (Degree C),Maximum temperature (Degree C),Daily global solar exposure (MJ/m*m),Total_Pedestrian_Count_per_day
0,2015-01-01,0.0,13.3,26.9,23.6,471563
1,2015-01-02,0.0,15.4,38.8,26.8,389048
2,2015-01-03,0.0,20.0,38.2,26.5,291761
3,2015-01-04,4.6,16.3,21.4,25.2,382364
4,2015-01-05,0.0,15.0,22.0,30.7,487220
...,...,...,...,...,...,...
2126,2020-10-27,0.0,11.1,19.6,20.1,177267
2127,2020-10-28,0.0,9.5,20.9,26.7,311278
2128,2020-10-29,0.0,11.2,24.3,17.1,263334
2129,2020-10-30,0.0,12.9,18.6,21.0,313225


In [19]:
#writing the dataset into a csv file
import csv  

# field names  
fields = []  
for col in dataset_wt_weather_features.columns: 
    fields.append(col) 

# data rows of csv file  
rows = dataset_wt_weather_features.values  
    
# name of the csv file  
filename = "dataset_wt_weather_features.csv"
    
# writing to the csv file  
with open(filename, 'w') as csvfile:

    # creating a csv writer object  
    csvwriter = csv.writer(csvfile)  
        
    # writing the fields  
    csvwriter.writerow(fields)  
        
    # writing the data rows  
    csvwriter.writerows(rows) 

In [20]:
#Incorporating the weekly index. Representing monday as 1 and sunday as 7.
dataset_wt_weather_features['day_of_week'] = dataset_wt_weather_features.date.dt.dayofweek
dataset_wt_weather_features['day_of_week'] = dataset_wt_weather_features['day_of_week'] + 1

In [21]:
#Incorporating the monthly index. Representing January as 1 and december as 12.
dataset_wt_weather_features['monthly_index'] = dataset_wt_weather_features.date.dt.month

In [22]:
#Incorporating the day of the year index. 
dataset_wt_weather_features['day_of_year'] = dataset_wt_weather_features.date.dt.dayofyear

In [23]:
#Public Holiday dataset
public = pd.read_csv('Public_Holidays.csv')
holidays = public[['Holiday']]

In [24]:
#View the dataset
holidays

Unnamed: 0,Holiday
0,2015-01-01
1,2015-01-26
2,2015-03-14
3,2015-03-25
4,2015-03-26
...,...
74,2020-04-12
75,2020-04-13
76,2020-04-25
77,2020-06-08


In [25]:
#converting Holiday feature from object datatypes to date
holidays['Holiday'] = holidays['Holiday'].astype('datetime64[ns]')
holidays['Public_Holiday'] = 'Public Holiday'

In [26]:
#Validating the changes in datatype
holidays.dtypes

Holiday           datetime64[ns]
Public_Holiday            object
dtype: object

In [27]:
#Incorporating the public holiday information in the dataset
dataset_wt_weather_features = dataset_wt_weather_features.set_index("date").join(holidays.set_index("Holiday"), how="outer")

In [28]:
#Resetting the index
dataset_wt_weather_features.reset_index(inplace = True)

In [29]:
#Renaming the columns
dataset_wt_weather_features.rename(columns = {'index':'Date'}, inplace = True)

In [30]:
#Replacing the na values as 'not a holiday'
dataset_wt_weather_features['Public_Holiday'].fillna('Not a public holiday', inplace=True)

In [31]:
#Rearranging the columns 
dataset_wt_weather_features = dataset_wt_weather_features[['Date', 'day_of_year', 'day_of_week', 'monthly_index','Public_Holiday','Rainfall amount (millimetres)','Minimum temperature (Degree C)','Maximum temperature (Degree C)','Daily global solar exposure (MJ/m*m)', 'Total_Pedestrian_Count_per_day']]

In [32]:
#displaying the dataset
dataset_wt_weather_features

Unnamed: 0,Date,day_of_year,day_of_week,monthly_index,Public_Holiday,Rainfall amount (millimetres),Minimum temperature (Degree C),Maximum temperature (Degree C),Daily global solar exposure (MJ/m*m),Total_Pedestrian_Count_per_day
0,2015-01-01,1,4,1,Public Holiday,0.0,13.3,26.9,23.6,471563
1,2015-01-02,2,5,1,Not a public holiday,0.0,15.4,38.8,26.8,389048
2,2015-01-03,3,6,1,Not a public holiday,0.0,20.0,38.2,26.5,291761
3,2015-01-04,4,7,1,Not a public holiday,4.6,16.3,21.4,25.2,382364
4,2015-01-05,5,1,1,Not a public holiday,0.0,15.0,22.0,30.7,487220
...,...,...,...,...,...,...,...,...,...,...
2127,2020-10-27,301,2,10,Not a public holiday,0.0,11.1,19.6,20.1,177267
2128,2020-10-28,302,3,10,Not a public holiday,0.0,9.5,20.9,26.7,311278
2129,2020-10-29,303,4,10,Not a public holiday,0.0,11.2,24.3,17.1,263334
2130,2020-10-30,304,5,10,Not a public holiday,0.0,12.9,18.6,21.0,313225


In [33]:
#using monthly_index, day of the week, year and day of the year as train data 
X_train = dataset_wt_weather_features[['monthly_index','day_of_week']]

X_train['Year'] = dataset_wt_weather_features.Date.dt.year

X_train['day_of_year'] = dataset_wt_weather_features.Date.dt.dayofyear

In [34]:
X_train = X_train[['Year', 'day_of_year', 'monthly_index','day_of_week' ]]
X_train

Unnamed: 0,Year,day_of_year,monthly_index,day_of_week
0,2015,1,1,4
1,2015,2,1,5
2,2015,3,1,6
3,2015,4,1,7
4,2015,5,1,1
...,...,...,...,...
2127,2020,301,10,2
2128,2020,302,10,3
2129,2020,303,10,4
2130,2020,304,10,5


#Predicting independent variables for Minimum temperature (Degree C) and Maximum temperature (Degree C) from 1-11-2020 to 31-01-2021

In [35]:
Y_train_min_temp = dataset_wt_weather_features['Minimum temperature (Degree C)']
Y_train_min_temp
Y_train_max_temp = dataset_wt_weather_features['Maximum temperature (Degree C)']
Y_train_max_temp

0       26.9
1       38.8
2       38.2
3       21.4
4       22.0
        ... 
2127    19.6
2128    20.9
2129    24.3
2130    18.6
2131    16.8
Name: Maximum temperature (Degree C), Length: 2132, dtype: float64

In [36]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np
#using standard scaler
scaler = StandardScaler()
#fit get the mean and standard deviation of the train features 
#while tranform applies the standardation formular
X_train_StandardScaler = scaler.fit_transform(X_train) 
X_train_StandardScaler

array([[-1.44730998e+00, -1.71507061e+00, -1.58631120e+00,
        -9.38637205e-04],
       [-1.44730998e+00, -1.70542213e+00, -1.58631120e+00,
         4.99354993e-01],
       [-1.44730998e+00, -1.69577366e+00, -1.58631120e+00,
         9.99648624e-01],
       ...,
       [ 1.53055614e+00,  1.19876862e+00,  1.06833203e+00,
        -9.38637205e-04],
       [ 1.53055614e+00,  1.20841709e+00,  1.06833203e+00,
         4.99354993e-01],
       [ 1.53055614e+00,  1.21806557e+00,  1.06833203e+00,
         9.99648624e-01]])

In [37]:
from datetime import datetime, timedelta
#generating the test data set
base = datetime.strptime('2020-11-1', '%Y-%m-%d')
date_list = [base + timedelta(days=x) for x in range(92)]
test_data = df(date_list)
#renaming a column 
test_data.rename(columns = {0:'Date'},inplace = True)
#Incorporating the monthly index. Representing January as 1 and December as 12.
test_data['monthly_index'] = test_data.Date.dt.month

#Incorporating the year
test_data['Year'] = test_data.Date.dt.year

#Incorporating the weekly index. Representing monday as 1 and sunday as 7.
test_data['day_of_week'] = test_data.Date.dt.dayofweek + 1

#Incorporating the day of the year
test_data['day_of_year'] = test_data.Date.dt.dayofyear

#rearranging columns for the test dataset
test_data = test_data[['Year', 'day_of_year', 'monthly_index','day_of_week' ]]

#displaying the test dataset
test_data


Unnamed: 0,Year,day_of_year,monthly_index,day_of_week
0,2020,306,11,7
1,2020,307,11,1
2,2020,308,11,2
3,2020,309,11,3
4,2020,310,11,4
...,...,...,...,...
87,2021,27,1,3
88,2021,28,1,4
89,2021,29,1,5
90,2021,30,1,6


In [38]:
#scaling the test dataset 
test_data_StandardScaler = scaler.fit_transform(test_data) 
test_data_StandardScaler

array([[-0.71287918,  0.51194302,  0.60852228,  1.47391105],
       [-0.71287918,  0.51852106,  0.60852228, -1.5063047 ],
       [-0.71287918,  0.5250991 ,  0.60852228, -1.00960208],
       [-0.71287918,  0.53167714,  0.60852228, -0.51289945],
       [-0.71287918,  0.53825517,  0.60852228, -0.01619682],
       [-0.71287918,  0.54483321,  0.60852228,  0.4805058 ],
       [-0.71287918,  0.55141125,  0.60852228,  0.97720843],
       [-0.71287918,  0.55798929,  0.60852228,  1.47391105],
       [-0.71287918,  0.56456733,  0.60852228, -1.5063047 ],
       [-0.71287918,  0.57114537,  0.60852228, -1.00960208],
       [-0.71287918,  0.57772341,  0.60852228, -0.51289945],
       [-0.71287918,  0.58430145,  0.60852228, -0.01619682],
       [-0.71287918,  0.59087948,  0.60852228,  0.4805058 ],
       [-0.71287918,  0.59745752,  0.60852228,  0.97720843],
       [-0.71287918,  0.60403556,  0.60852228,  1.47391105],
       [-0.71287918,  0.6106136 ,  0.60852228, -1.5063047 ],
       [-0.71287918,  0.

In [39]:
# Loading the weather test dataset 
test_dataset = pd.read_csv('Test_dataset_updated.csv') 

# dataset.dtypes
test_dataset

Unnamed: 0,Date,Rainfall,Solar Exposure,Minimum temperature,Maximum Temperature
0,01-11-2020,0.0,23.2,12.6,17.0
1,02-11-2020,0.0,26.8,9.6,29.5
2,03-11-2020,0.0,26.9,12.1,30.4
3,04-11-2020,0.0,5.5,21.9,22.8
4,05-11-2020,2.8,23.1,11.1,15.8
...,...,...,...,...,...
56,27-12-2020,0.0,3.3,16.2,31.9
57,28-12-2020,3.2,26.5,11.5,17.8
58,29-12-2020,0.0,30.7,9.0,21.3
59,30-12-2020,0.0,30.3,13.7,21.4


##KNN Algorithm

In [40]:
#KNN parameters
n_neighbors = [2,4,5,10,25]
leaf_size = [10,20,30,50,100,500]
metric = ['euclidean','chebyshev','manhattan']
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute'] 

###KNN for max temperature prediction

In [41]:
#KNN for max temperature
#declaring parameters for KNN algorithm to determine it performance on the test data
scores_ = []
neighbor = []
leaf_size_ = []
metric_ = []
weight = []
algorithm_ = []

for i in n_neighbors:
  for j in leaf_size:
    for k in metric:
      for l in weights:
        for m in algorithm:
          knn_maxi = KNeighborsRegressor(n_neighbors = i, leaf_size = j, metric = k, weights = l, algorithm = m)
          knn_maxi.fit(X_train_StandardScaler, Y_train_max_temp)
          scores_.append(mean_squared_error(test_dataset['Maximum Temperature'].values, knn_maxi.predict(test_data_StandardScaler[0:61])))
          neighbor.append(i)
          leaf_size_.append(j)
          metric_.append(k)
          weight.append(l)
          algorithm_.append(m)

In [42]:
#checking the minimun mean square error of the KNN algorithm used for maximum temperature prediction
min(scores_)

73.16152786885245

In [43]:
#getting the index of the minimun mean square error of the KNN algorithm from the list
index_max = np.argmin(scores_)
index_max

296

In [44]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
max_temp_knn_parameters = [neighbor[296], leaf_size_[296], metric_[296], weight[296], algorithm_[296]]
max_temp_knn_parameters

[5, 10, 'chebyshev', 'uniform', 'auto']

In [45]:
knn_max = KNeighborsRegressor(algorithm = 'auto', leaf_size = 10, metric = 'chebyshev', n_neighbors = 5, weights = 'uniform')
knn_max.fit(X_train_StandardScaler, Y_train_max_temp)

KNeighborsRegressor(algorithm='auto', leaf_size=10, metric='chebyshev',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [46]:
#printing the test data side by side with the predicted data
print(np.concatenate((test_dataset['Maximum Temperature'].values.reshape(len(test_dataset['Maximum Temperature']),1), knn_max.predict(test_data_StandardScaler)[0:61].reshape(len(knn_max.predict(test_data_StandardScaler)[0:61]),1)),1))

[[17.   17.2 ]
 [29.5  17.42]
 [30.4  17.76]
 [22.8  16.96]
 [15.8  17.82]
 [16.6  14.9 ]
 [16.1  15.02]
 [19.5  17.2 ]
 [29.   17.42]
 [32.   17.76]
 [28.3  16.96]
 [25.5  17.82]
 [20.6  15.32]
 [21.   15.  ]
 [32.5  17.28]
 [24.3  17.3 ]
 [21.1  16.9 ]
 [29.5  17.12]
 [34.4  17.6 ]
 [20.2  15.64]
 [25.7  15.94]
 [28.2  16.04]
 [18.8  16.94]
 [20.3  17.02]
 [30.5  17.04]
 [19.4  15.96]
 [36.   15.66]
 [22.9  15.94]
 [18.6  16.04]
 [27.2  16.94]
 [26.2  17.36]
 [18.6  17.74]
 [23.7  16.7 ]
 [22.9  16.06]
 [28.4  17.7 ]
 [20.5  17.02]
 [17.3  16.22]
 [17.1  17.36]
 [21.5  17.74]
 [17.7  16.7 ]
 [21.1  16.06]
 [28.1  17.7 ]
 [30.2  17.02]
 [33.7  16.4 ]
 [33.5  17.36]
 [24.6  17.74]
 [25.7  16.7 ]
 [18.4  16.06]
 [19.9  17.7 ]
 [18.9  17.02]
 [21.5  16.4 ]
 [20.4  17.36]
 [19.5  17.62]
 [18.7  16.7 ]
 [20.   16.06]
 [27.4  17.7 ]
 [31.9  17.02]
 [17.8  16.22]
 [21.3  17.36]
 [21.4  17.74]
 [21.2  17.96]]


###KNN for minimum Temperature prediction

In [47]:
#KNN for minimun temperature
#declaring parameters for KNN algorithm to determine it performance on the test data
scores_knn_min_temp = []
neighbor = []
leaf_size_ = []
metric_ = []
weight = []
algorithm_ = []

for i in n_neighbors:
  for j in leaf_size:
    for k in metric:
      for l in weights:
        for m in algorithm:
          # print(i,j,k,l,m)
          knn_mini = KNeighborsRegressor(n_neighbors = i, leaf_size = j, metric = k, weights = l, algorithm = m)
          knn_mini.fit(X_train_StandardScaler, Y_train_min_temp)
          scores_knn_min_temp.append(mean_squared_error(test_dataset['Minimum temperature'].values, knn_mini.predict(test_data_StandardScaler[0:61])))

          neighbor.append(i)
          leaf_size_.append(j)
          metric_.append(k)
          weight.append(l)
          algorithm_.append(m)

In [48]:
#checking the minimun mean square error of the KNN algorithm used for minimum temperature prediction
min(scores_knn_min_temp)

23.77878032786885

In [49]:
#getting the index of the minimun mean square error of the KNN algorithm from the list
index_min = np.argmin(scores_knn_min_temp)
index_min

297

In [50]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
min_temp_knn_parameters = [neighbor[297], leaf_size_[297], metric_[297], weight[297], algorithm_[297]]
min_temp_knn_parameters

[5, 10, 'chebyshev', 'uniform', 'ball_tree']

In [51]:
knn_min = KNeighborsRegressor(algorithm = 'ball_tree', leaf_size = 10, metric = 'chebyshev', n_neighbors = 5, weights = 'uniform')
knn_min.fit(X_train_StandardScaler, Y_train_min_temp)

KNeighborsRegressor(algorithm='ball_tree', leaf_size=10, metric='chebyshev',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [52]:
#printing the test data side by side with the predicted data for minimum temperature
print(np.concatenate((test_dataset['Minimum temperature'].values.reshape(len(test_dataset['Minimum temperature']),1), knn_min.predict(test_data_StandardScaler)[0:61].reshape(len(knn_min.predict(test_data_StandardScaler)[0:61]),1)),1))

[[12.6   9.24]
 [ 9.6   8.9 ]
 [12.1   9.22]
 [21.9   9.64]
 [11.1   9.02]
 [ 9.5   8.64]
 [11.3   7.98]
 [ 8.4   9.24]
 [11.3   8.9 ]
 [16.1   9.22]
 [22.8   9.64]
 [19.3   9.02]
 [15.4  11.16]
 [13.5   9.12]
 [12.3   9.18]
 [18.2   9.88]
 [11.9   9.2 ]
 [10.8   9.82]
 [16.1   9.4 ]
 [15.6  11.42]
 [13.3   9.76]
 [16.1   9.64]
 [16.6   9.54]
 [13.5   9.02]
 [13.2   9.5 ]
 [15.8   9.9 ]
 [10.2  10.32]
 [16.    9.76]
 [14.4   9.64]
 [12.1   9.54]
 [16.8   9.18]
 [11.9   9.3 ]
 [10.6  11.1 ]
 [11.4  10.74]
 [13.9  10.84]
 [12.9   9.96]
 [11.3  10.08]
 [10.6   9.18]
 [11.7   9.3 ]
 [13.1  11.1 ]
 [ 9.6  10.74]
 [11.9  10.84]
 [16.    9.96]
 [17.7  10.46]
 [20.4   9.18]
 [15.8   9.74]
 [15.5  11.1 ]
 [12.5  10.74]
 [12.3  10.84]
 [ 9.9   9.96]
 [11.2  10.46]
 [13.8   9.18]
 [13.5   9.74]
 [13.4  11.1 ]
 [13.6  10.74]
 [10.2  10.84]
 [16.2   9.96]
 [11.5  10.46]
 [ 9.    9.18]
 [13.7   9.74]
 [15.5  10.72]]


##MLP

In [53]:
hidden_layer_sizes = []
for i in range(1,10):
  hidden_layer_sizes.append(((i,i,i)))

activation = ['tanh', 'relu', 'logistic', 'identity']
solver = ['lbfgs', 'sgd', 'adam']
alpha = [0.0001, 0.05, 0.01, .001]
learning_rate = ['constant','adaptive', 'invscaling']

In [54]:
hidden_layer_sizes

[(1, 1, 1),
 (2, 2, 2),
 (3, 3, 3),
 (4, 4, 4),
 (5, 5, 5),
 (6, 6, 6),
 (7, 7, 7),
 (8, 8, 8),
 (9, 9, 9)]

##MLP for maximum Temperature

In [55]:
#MLP for Maximum Temperature

#declaring parameters for MLP algorithm to determine it performance on the test data
scores_mlp_max_temp = []
hidden_layer_sizes_ = []
solver_ = []
activation_ = []
alpha_ = []
learning_rate_ = []


for i in hidden_layer_sizes:
  for j in activation:
    for k in solver:
      for l in alpha:
        for m in learning_rate:

          mlp_maxi = MLPRegressor(hidden_layer_sizes = i, activation = j, solver = k, alpha = l, learning_rate = m)
          mlp_maxi.fit(X_train_StandardScaler, Y_train_min_temp)
          scores_mlp_max_temp.append(mean_squared_error(test_dataset['Maximum Temperature'].values, mlp_maxi.predict(test_data_StandardScaler[0:61])))

          hidden_layer_sizes_.append(i)
          activation_.append(j)
          solver_.append(k)
          alpha_.append(l)
          learning_rate_.append(m)

In [56]:
#checking the minimun mean square error of the MLP algorithm used for maximum temperature prediction
min(scores_mlp_max_temp)

166.4169045536952

In [57]:
#getting the index of the minimun mean square error of the MLP algorithm from the list
index_max = np.argmin(scores_mlp_max_temp)
index_max

1186

In [58]:
#obtaining the parameters with the minimun mean square error of the mlp algorithm 
max_temp_mlp_parameters = [hidden_layer_sizes_[index_max], activation_[index_max], solver_[index_max], alpha_[index_max], learning_rate_[index_max]]
max_temp_mlp_parameters

[(9, 9, 9), 'tanh', 'adam', 0.001, 'adaptive']

In [59]:
mlp_max = MLPRegressor(hidden_layer_sizes = (2, 2, 2), activation = 'relu', solver = 'sgd' , alpha = 0.001, learning_rate = 'invscaling')
mlp_max.fit(X_train_StandardScaler, Y_train_min_temp)

MLPRegressor(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(2, 2, 2), learning_rate='invscaling',
             learning_rate_init=0.001, max_fun=15000, max_iter=200,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='sgd',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [60]:
#printing the test data side by side with the predicted data for maximum temperature
print(np.concatenate((test_dataset['Maximum Temperature'].values.reshape(len(test_dataset['Maximum Temperature']),1), mlp_max.predict(test_data_StandardScaler)[0:61].reshape(len(mlp_max.predict(test_data_StandardScaler)[0:61]),1)),1))

[[17.          7.4133913 ]
 [29.5         9.87434826]
 [30.4         9.28136362]
 [22.8         8.69064203]
 [15.8         8.09992043]
 [16.6         7.50919884]
 [16.1         7.16565577]
 [19.5         7.40137005]
 [29.          9.73929761]
 [32.          9.13429173]
 [28.3         8.54357013]
 [25.5         7.95284854]
 [20.6         7.36212694]
 [21.          7.15363452]
 [32.5         7.38934881]
 [24.3         9.60424697]
 [21.1         8.98721983]
 [29.5         8.39649824]
 [34.4         7.80577664]
 [20.2         7.21505505]
 [25.7         7.14161327]
 [28.2         7.37732756]
 [18.8         9.46919632]
 [20.3         8.84014794]
 [30.5         8.24942634]
 [19.4         7.65870475]
 [36.          7.06798316]
 [22.9         7.12959203]
 [18.6         7.36530631]
 [27.2         9.33414568]
 [26.2         8.92862147]
 [18.6         8.33789987]
 [23.7         7.74717828]
 [22.9         7.15645668]
 [28.4         7.32994731]
 [20.5         7.56566159]
 [17.3         9.37227117]
 

##MLP for minimum Temperature

In [61]:
#mlp for minimun temperature
#declaring parameters for MLP algorithm to determine it performance on the test data
scores_mlp_min_temp = []
hidden_layer_sizes_ = []
solver_ = []
activation_ = []
alpha_ = []
learning_rate_ = []


for i in hidden_layer_sizes:
  for j in activation:
    for k in solver:
      for l in alpha:
        for m in learning_rate:

          mlp_mini = MLPRegressor(hidden_layer_sizes = i, activation = j, solver = k, alpha = l, learning_rate = m)
          mlp_mini.fit(X_train_StandardScaler, Y_train_min_temp)
          scores_mlp_min_temp.append(mean_squared_error(test_dataset['Minimum temperature'].values, mlp_mini.predict(test_data_StandardScaler[0:61])))

          hidden_layer_sizes_.append(i)
          activation_.append(j)
          solver_.append(k)
          alpha_.append(l)
          learning_rate_.append(m)

In [62]:
#checking the minimun mean square error of the MLP algorithm used for minimum temperature prediction
min(scores_mlp_min_temp)

11.957084593444305

In [63]:
#getting the index of the minimun mean square error of the MLP algorithm from the list
index_min = np.argmin(scores_mlp_min_temp)
index_min

1182

In [64]:
#obtaining the parameters with the minimun mean square error of the MLP algorithm 
min_temp_mlp_parameters = [hidden_layer_sizes_[index_min], activation_[index_min], solver_[index_min], alpha_[index_min], learning_rate_[index_min]]
min_temp_mlp_parameters

[(9, 9, 9), 'tanh', 'adam', 0.01, 'constant']

In [65]:
mlp_min = MLPRegressor(hidden_layer_sizes = (14, 14, 14), activation = 'tanh', solver = 'adam' , alpha = 0.01, learning_rate = 'invscaling')
mlp_min.fit(X_train_StandardScaler, Y_train_min_temp)

MLPRegressor(activation='tanh', alpha=0.01, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='invscaling',
             learning_rate_init=0.001, max_fun=15000, max_iter=200,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [66]:
#printing the test data side by side with the predicted data for minimum temperature
print(np.concatenate((test_dataset['Minimum temperature'].values.reshape(len(test_dataset['Minimum temperature']),1), mlp_min.predict(test_data_StandardScaler)[0:61].reshape(len(mlp_min.predict(test_data_StandardScaler)[0:61]),1)),1))

[[12.6         8.25142525]
 [ 9.6         8.43105225]
 [12.1         8.51405043]
 [21.9         8.55252307]
 [11.1         8.54657304]
 [ 9.5         8.49932538]
 [11.3         8.41630269]
 [ 8.4         8.30435535]
 [11.3         8.50002185]
 [16.1         8.58305945]
 [22.8         8.62027996]
 [19.3         8.61205347]
 [15.4         8.56189765]
 [13.5         8.47574806]
 [12.3         8.36081028]
 [18.2         8.57335329]
 [11.9         8.65636256]
 [10.8         8.69217334]
 [16.1         8.68146366]
 [15.6         8.62817802]
 [13.3         8.53868968]
 [16.1         8.42057609]
 [16.6         8.65107723]
 [13.5         8.73393079]
 [13.2         8.76812077]
 [15.8         8.75467616]
 [10.2         8.69800265]
 [16.          8.60493503]
 [14.4         8.48343788]
 [12.1         8.73320568]
 [16.8         9.28424848]
 [11.9         9.38081207]
 [10.6         9.41420492]
 [11.4         9.38694452]
 [13.9         9.3058751 ]
 [12.9         9.18071873]
 [11.3         9.24693026]
 

##Decision Tree

###Maximum Temperature

In [67]:
splitter=['best','random']
criterion = ['mse', 'friedman_mse', 'mae']
max_depth = [1,2,3,4,5,10,20,30,40]
max_features =['auto','sqrt','log2'] 
min_samples_split= [2,3,4,5,10,20,30,40]
random_state= [1,2,3,4,5,6,7,8,9,42]

In [68]:
#DT for Maximum Temperature

#declaring parameters for DT algorithm to determine it performance on the test data
scores_dt_max_temp = []
splitter_ = []
criterion_ =  []
max_depth_ = []
max_features_ = []
min_samples_split_ = []
random_state_ = []


for i in splitter:
  for j in criterion:
    for k in max_depth:
      for l in max_features:
        for m in min_samples_split:
          for n in random_state:
            dt_maxi = DecisionTreeRegressor(splitter = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m,random_state = n )
            dt_maxi.fit(X_train_StandardScaler, Y_train_min_temp)
            scores_dt_max_temp.append(mean_squared_error(test_dataset['Maximum Temperature'].values, dt_maxi.predict(test_data_StandardScaler[0:61])))

            splitter_.append(i)
            criterion_.append(j)
            max_depth_.append(k)
            max_features_.append(l)
            min_samples_split_.append(m)
            random_state_.append(n)

In [69]:
#checking the minimun mean square error of the dt algorithm used for maximum temperature prediction
min(scores_dt_max_temp)

168.72816673736958

In [70]:
#getting the index of the minimun mean square error of the dt algorithm from the list
index_max = np.argmin(scores_dt_max_temp)
index_max

6804

In [71]:
#obtaining the parameters with the minimun mean square error of the DT algorithm 
max_temp_dt_parameters = [splitter_[index_max], criterion_[index_max], max_depth_[index_max], max_features_[index_max], min_samples_split_[index_max],random_state_[index_max]]
max_temp_dt_parameters

['random', 'mse', 2, 'sqrt', 2, 5]

In [144]:
dt_max = DecisionTreeRegressor(splitter = 'random', criterion = 'mse', max_depth = 2, max_features = 'sqrt', min_samples_split = 2,random_state = 5)
dt_max.fit(X_train_StandardScaler, Y_train_min_temp)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=5, splitter='random')

In [145]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Maximum Temperature'].values.reshape(len(test_dataset['Maximum Temperature']),1), dt_max.predict(test_data_StandardScaler)[0:61].reshape(len(dt_max.predict(test_data_StandardScaler)[0:61]),1)),1))

[[17.         11.81222057]
 [29.5        11.81222057]
 [30.4        11.81222057]
 [22.8        11.81222057]
 [15.8        11.81222057]
 [16.6        11.81222057]
 [16.1        11.81222057]
 [19.5        11.81222057]
 [29.         11.81222057]
 [32.         11.81222057]
 [28.3        11.81222057]
 [25.5        11.81222057]
 [20.6        11.81222057]
 [21.         11.81222057]
 [32.5        11.81222057]
 [24.3        11.81222057]
 [21.1        11.81222057]
 [29.5        11.81222057]
 [34.4        11.81222057]
 [20.2        11.81222057]
 [25.7        11.81222057]
 [28.2        11.81222057]
 [18.8        11.81222057]
 [20.3        11.81222057]
 [30.5        11.81222057]
 [19.4        11.81222057]
 [36.         11.81222057]
 [22.9        11.81222057]
 [18.6        11.81222057]
 [27.2        11.81222057]
 [26.2        11.81222057]
 [18.6        11.81222057]
 [23.7        11.81222057]
 [22.9        11.81222057]
 [28.4        11.81222057]
 [20.5        11.81222057]
 [17.3        11.81222057]
 

###Minimum Temperature

In [74]:
#DT for Minimum Temperature

#declaring parameters for DT algorithm to determine it performance on the test data
scores_dt_min_temp = []
splitter_ = []
criterion_ =  []
max_depth_ = []
max_features_ = []
min_samples_split_ = []
random_state_ = []


for i in splitter:
  for j in criterion:
    for k in max_depth:
      for l in max_features:
        for m in min_samples_split:
          for n in random_state:
            dt_mini = DecisionTreeRegressor(splitter = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m,random_state = n )
            dt_mini.fit(X_train_StandardScaler, Y_train_min_temp)
            scores_dt_min_temp.append(mean_squared_error(test_dataset['Minimum temperature'].values, dt_mini.predict(test_data_StandardScaler[0:61])))

            splitter_.append(i)
            criterion_.append(j)
            max_depth_.append(k)
            max_features_.append(l)
            min_samples_split_.append(m)
            random_state_.append(n)

In [75]:
#checking the minimun mean square error of the dt algorithm used for maximum temperature prediction
min(scores_dt_min_temp)

12.496963981512133

In [76]:
#getting the index of the minimun mean square error of the dt algorithm from the list
index_min = np.argmin(scores_dt_min_temp)
index_min

6804

In [77]:
#obtaining the parameters with the minimun mean square error of the DT algorithm 
min_temp_dt_parameters = [splitter_[index_min], criterion_[index_min], max_depth_[index_min], max_features_[index_min], min_samples_split_[index_min],random_state_[index_min]]
min_temp_dt_parameters

['random', 'mse', 2, 'sqrt', 2, 5]

In [134]:
dt_min = DecisionTreeRegressor(splitter = 'random', criterion = 'mse', max_depth = 2, max_features = 'sqrt', min_samples_split = 2,random_state = 5)
dt_min.fit(X_train_StandardScaler, Y_train_min_temp)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=5, splitter='random')

In [135]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Minimum temperature'].values.reshape(len(test_dataset['Minimum temperature']),1), dt_min.predict(test_data_StandardScaler)[0:61].reshape(len(dt_min.predict(test_data_StandardScaler)[0:61]),1)),1))

[[12.6        11.81222057]
 [ 9.6        11.81222057]
 [12.1        11.81222057]
 [21.9        11.81222057]
 [11.1        11.81222057]
 [ 9.5        11.81222057]
 [11.3        11.81222057]
 [ 8.4        11.81222057]
 [11.3        11.81222057]
 [16.1        11.81222057]
 [22.8        11.81222057]
 [19.3        11.81222057]
 [15.4        11.81222057]
 [13.5        11.81222057]
 [12.3        11.81222057]
 [18.2        11.81222057]
 [11.9        11.81222057]
 [10.8        11.81222057]
 [16.1        11.81222057]
 [15.6        11.81222057]
 [13.3        11.81222057]
 [16.1        11.81222057]
 [16.6        11.81222057]
 [13.5        11.81222057]
 [13.2        11.81222057]
 [15.8        11.81222057]
 [10.2        11.81222057]
 [16.         11.81222057]
 [14.4        11.81222057]
 [12.1        11.81222057]
 [16.8        11.81222057]
 [11.9        11.81222057]
 [10.6        11.81222057]
 [11.4        11.81222057]
 [13.9        11.81222057]
 [12.9        11.81222057]
 [11.3        11.81222057]
 

##Random Forest

###RF Maximum Temperature

In [80]:

criterion = ['mse', 'friedman_mse', 'mae']
max_depth = [10,20,30,40]
max_features =['auto','sqrt','log2'] 
min_samples_split= [10,20,30,40]
random_state= [1,2,3,42]
n_estimators =[10,50,100]



In [81]:
#RF for Maximum Temperature

#declaring parameters for RF algorithm to determine it performance on the test data
scores_rf_max_temp = []
n_estimators_ = []
criterion_ =  []
max_depth_ = []
max_features_ = []
min_samples_split_ = []
random_state_ = []


for i in n_estimators:
  for j in criterion:
    for k in max_depth:
      for l in max_features:
        for m in min_samples_split:
          for n in random_state:
            rf_maxi = RandomForestRegressor(n_estimators = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m,random_state = n )
            rf_maxi.fit(X_train_StandardScaler, Y_train_min_temp)
            scores_rf_max_temp.append(mean_squared_error(test_dataset['Maximum Temperature'].values, rf_maxi.predict(test_data_StandardScaler[0:61])))

            n_estimators_.append(i)
            criterion_.append(j)
            max_depth_.append(k)
            max_features_.append(l)
            min_samples_split_.append(m)
            random_state_.append(n)

In [82]:
#checking the minimun mean square error of the dt algorithm used for maximum temperature prediction
min(scores_rf_max_temp)

228.01545883121128

In [83]:
#getting the index of the minimun mean square error of the RF algorithm from the list
index_max = np.argmin(scores_rf_max_temp)
index_max

65

In [84]:
#obtaining the parameters with the minimun mean square error of the RF algorithm 
max_temp_rf_parameters = [n_estimators_[index_max], criterion_[index_max], max_depth_[index_max], max_features_[index_max],min_samples_split_[index_max],random_state_[index_max] ]
max_temp_rf_parameters


[10, 'mse', 20, 'sqrt', 10, 2]

In [88]:
rf_max = RandomForestRegressor(n_estimators = 10, criterion = 'mse', max_depth = 20, max_features =  'sqrt', min_samples_split = 10,random_state = 2 )
rf_max.fit(X_train_StandardScaler, Y_train_min_temp)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=2, verbose=0, warm_start=False)

In [141]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Maximum Temperature'].values.reshape(len(test_dataset['Maximum Temperature']),1), rf_max.predict(test_data_StandardScaler)[0:61].reshape(len(rf_max.predict(test_data_StandardScaler)[0:61]),1)),1))

[[17.          9.7545321 ]
 [29.5         8.8941302 ]
 [30.4         8.53142795]
 [22.8         8.76217795]
 [15.8         7.95516135]
 [16.6         8.92757295]
 [16.1         9.00849917]
 [19.5         8.87178589]
 [29.          8.1866302 ]
 [32.          8.53142795]
 [28.3         8.66301128]
 [25.5         8.10321691]
 [20.6         9.0756285 ]
 [21.          9.69250711]
 [32.5         9.51151605]
 [24.3         8.08746354]
 [21.1         9.21116604]
 [29.5        10.20760835]
 [34.4        10.03963216]
 [20.2         9.85305275]
 [25.7        10.04159802]
 [28.2         9.86368388]
 [18.8         9.69581061]
 [20.3        10.47067208]
 [30.5        11.02608874]
 [19.4        10.11430303]
 [36.          9.77219777]
 [22.9         9.91823943]
 [18.6        10.05992319]
 [27.2         9.8144324 ]
 [26.2        10.6433663 ]
 [18.6        10.55574725]
 [23.7        11.0652381 ]
 [22.9        10.52557576]
 [28.4        10.49130973]
 [20.5         9.04652215]
 [17.3         9.49232434]
 

In [90]:
#rf for Minimum Temperature

#declaring parameters for rf algorithm to determine it performance on the test data
scores_rf_min_temp = []
n_estimators_ = []
criterion_ =  []
max_depth_ = []
max_features_ = []
min_samples_split_ = []
random_state_ = []


for i in n_estimators:
  for j in criterion:
    for k in max_depth:
      for l in max_features:
        for m in min_samples_split:
          for n in random_state:
            rf_mini = RandomForestRegressor(n_estimators = i, criterion = j, max_depth = k, max_features = l, min_samples_split = m,random_state = n )
            rf_mini.fit(X_train_StandardScaler, Y_train_min_temp)
            scores_rf_min_temp.append(mean_squared_error(test_dataset['Minimum temperature'].values, rf_mini.predict(test_data_StandardScaler[0:61])))

            n_estimators_.append(i)
            criterion_.append(j)
            max_depth_.append(k)
            max_features_.append(l)
            min_samples_split_.append(m)
            random_state_.append(n)

In [91]:
#checking the minimun mean square error of the dt algorithm used for maximum temperature prediction
min(scores_rf_min_temp)

26.713782690054888

In [93]:
#getting the index of the minimun mean square error of the dt algorithm from the list
index_min = np.argmin(scores_rf_min_temp)
index_min

65

In [94]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
min_temp_rf_parameters = [n_estimators_[index_max], criterion_[index_max], max_depth_[index_max], max_features_[index_max],min_samples_split_[index_max],random_state_[index_max] ]
min_temp_rf_parameters

[10, 'mse', 20, 'sqrt', 10, 2]

In [95]:
rf_min =  RandomForestRegressor(n_estimators = 10, criterion = 'mse', max_depth = 20, max_features = 'sqrt', min_samples_split = 10,random_state = 2 )
rf_min.fit(X_train_StandardScaler, Y_train_min_temp)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=40, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=40, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [138]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Minimum temperature'].values.reshape(len(test_dataset['Minimum temperature']),1), rf_min.predict(test_data_StandardScaler)[0:61].reshape(len(rf_min.predict(test_data_StandardScaler)[0:61]),1)),1))

[[12.6     8.065 ]
 [ 9.6     8.163 ]
 [12.1     8.3655]
 [21.9     8.529 ]
 [11.1     7.5035]
 [ 9.5     7.6645]
 [11.3     7.569 ]
 [ 8.4     7.458 ]
 [11.3     7.646 ]
 [16.1     7.52  ]
 [22.8     7.631 ]
 [19.3     7.273 ]
 [15.4     7.4705]
 [13.5     7.6925]
 [12.3     7.642 ]
 [18.2     7.834 ]
 [11.9     7.9665]
 [10.8     8.1895]
 [16.1     8.224 ]
 [15.6     8.491 ]
 [13.3     8.465 ]
 [16.1     8.7175]
 [16.6     8.723 ]
 [13.5     8.8995]
 [13.2     8.9995]
 [15.8     8.7605]
 [10.2     8.777 ]
 [16.      8.7665]
 [14.4     8.7045]
 [12.1     8.878 ]
 [16.8     9.355 ]
 [11.9     9.4905]
 [10.6     9.4775]
 [11.4     9.466 ]
 [13.9     9.747 ]
 [12.9     9.6825]
 [11.3     9.566 ]
 [10.6     9.5515]
 [11.7     9.5785]
 [13.1     9.647 ]
 [ 9.6     9.506 ]
 [11.9     9.405 ]
 [16.      9.3655]
 [17.7     9.3795]
 [20.4     9.3265]
 [15.8     9.4405]
 [15.5     9.544 ]
 [12.5     9.6575]
 [12.3     9.54  ]
 [ 9.9     9.5645]
 [11.2     9.3445]
 [13.8     9.2805]
 [13.5     9

##SVR

###SVR Maximum Temperature

In [116]:
kernel =['linear', 'poly', 'rbf']
gamma = ['auto', 'scale']
degree = [3,4,5,10]
C = [2.0,3.0,4.0]

In [117]:
#SVR for Maximum Temperature

#declaring parameters for SVR algorithm to determine it performance on the test data
scores_svr_max_temp = []
kernel_ = []
gamma_ =  []
degree_ = []
C_ = []



for i in kernel:
  for j in gamma:
    for k in degree:
      for l in C:
            svr_maxi = SVR(kernel = i, gamma = j, degree = k, C = l )
            svr_maxi.fit(X_train_StandardScaler, Y_train_min_temp)
            scores_svr_max_temp.append(mean_squared_error(test_dataset['Maximum Temperature'].values, svr_maxi.predict(test_data_StandardScaler[0:61])))

            kernel_.append(i)
            gamma_.append(j)
            degree_.append(k)
            C_.append(l)

In [118]:
#checking the minimun mean square error of the dt algorithm used for maximum temperature prediction
min(scores_svr_max_temp)

197.35316205396575

In [119]:
#getting the index of the minimun mean square error of the dt algorithm from the list
index_max = np.argmin(scores_svr_max_temp)
index_max

26

In [120]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
max_temp_svr_parameters = [kernel_[index_max], gamma_[index_max], degree_[index_max], C_[index_max]]
max_temp_svr_parameters

['poly', 'auto', 3, 4.0]

In [121]:
svr_max = SVR(kernel = 'poly', gamma = 'auto', degree = 3, C = 4)
svr_max.fit(X_train_StandardScaler, Y_train_min_temp)

SVR(C=4, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [140]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Maximum Temperature'].values.reshape(len(test_dataset['Maximum Temperature']),1), svr_max.predict(test_data_StandardScaler)[0:61].reshape(len(svr_max.predict(test_data_StandardScaler)[0:61]),1)),1))

[[17.         10.23051324]
 [29.5        10.40065254]
 [30.4        10.69282371]
 [22.8        10.92426415]
 [15.8        11.04446636]
 [16.6        11.00292284]
 [16.1        10.74912612]
 [19.5        10.23256869]
 [29.         10.35646832]
 [32.         10.68303356]
 [28.3        10.93856427]
 [25.5        11.07255294]
 [20.6        11.03449209]
 [21.         10.77387421]
 [32.5        10.24019183]
 [24.3        10.31454391]
 [21.1        10.67607377]
 [29.5        10.95626528]
 [34.4        11.10461095]
 [20.2        11.07060328]
 [25.7        10.8037348 ]
 [28.2        10.25349799]
 [18.8        10.27499467]
 [20.3        10.67205967]
 [30.5        10.97748251]
 [19.4        11.14075571]
 [36.         11.11137177]
 [22.9        10.8388232 ]
 [18.6        10.27260251]
 [27.2        10.23793591]
 [26.2        10.57377023]
 [18.6        10.86207291]
 [23.7        10.99478747]
 [22.9        10.92140641]
 [28.4        10.59142223]
 [20.5         9.95432745]
 [17.3        10.15013797]
 

###SVR Minimum Temperature

In [123]:
#svr for Minimum Temperature

#declaring parameters for svr algorithm to determine it performance on the test data
scores_svr_min_temp = []
kernel_ = []
gamma_ =  []
degree_ = []
C_ = []


for i in kernel:
  for j in gamma:
    for k in degree:
      for l in C:
            svr_mini =  SVR(kernel = i, gamma = j, degree = k, C = l )
            svr_mini.fit(X_train_StandardScaler, Y_train_min_temp)
            scores_svr_min_temp.append(mean_squared_error(test_dataset['Minimum temperature'].values, svr_mini.predict(test_data_StandardScaler[0:61])))

            kernel_.append(i)
            gamma_.append(j)
            degree_.append(k)
            C_.append(l)

In [124]:
#checking the minimun mean square error of the dt algorithm used for maximum temperature prediction
min(scores_svr_min_temp)

17.714977345357404

In [125]:
#getting the index of the minimun mean square error of the dt algorithm from the list
index_min = np.argmin(scores_svr_min_temp)
index_min

26

In [126]:
#obtaining the parameters with the minimun mean square error of the KNN algorithm 
min_temp_svr_parameters = [kernel_[index_min], gamma_[index_min], degree_[index_min], C_[index_min]]
min_temp_svr_parameters

['poly', 'auto', 3, 4.0]

In [127]:
svr_min =  SVR(kernel = 'poly', gamma = 'auto', degree = 3, C = 4)
svr_min.fit(X_train_StandardScaler, Y_train_min_temp)

SVR(C=4, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [139]:
#printing the test data side by side with the predicted data for Maximum temperature
print(np.concatenate((test_dataset['Minimum temperature'].values.reshape(len(test_dataset['Minimum temperature']),1), svr_min.predict(test_data_StandardScaler)[0:61].reshape(len(svr_min.predict(test_data_StandardScaler)[0:61]),1)),1))

[[12.6        10.23051324]
 [ 9.6        10.40065254]
 [12.1        10.69282371]
 [21.9        10.92426415]
 [11.1        11.04446636]
 [ 9.5        11.00292284]
 [11.3        10.74912612]
 [ 8.4        10.23256869]
 [11.3        10.35646832]
 [16.1        10.68303356]
 [22.8        10.93856427]
 [19.3        11.07255294]
 [15.4        11.03449209]
 [13.5        10.77387421]
 [12.3        10.24019183]
 [18.2        10.31454391]
 [11.9        10.67607377]
 [10.8        10.95626528]
 [16.1        11.10461095]
 [15.6        11.07060328]
 [13.3        10.8037348 ]
 [16.1        10.25349799]
 [16.6        10.27499467]
 [13.5        10.67205967]
 [13.2        10.97748251]
 [15.8        11.14075571]
 [10.2        11.11137177]
 [16.         10.8388232 ]
 [14.4        10.27260251]
 [12.1        10.23793591]
 [16.8        10.57377023]
 [11.9        10.86207291]
 [10.6        10.99478747]
 [11.4        10.92140641]
 [13.9        10.59142223]
 [12.9         9.95432745]
 [11.3        10.15013797]
 

In [129]:
#Using Sum of squared residuals to evaluate the performance of the models
from sklearn.metrics import r2_score
from prettytable import PrettyTable

rt = PrettyTable(["Algorithm", "r2 for minimum temperature","r2 for maximum temperature", "MSE for minimum temperature","MSE for maximum temperature",])
rt.align["Algorithm"] = "l" # Left align city names
rt.padding_width = 1 # One space between column edges and contents (default)
rt.add_row(["MLP",
            '{:.2f}'.format(r2_score(test_dataset['Minimum temperature'].values, mlp_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(r2_score(test_dataset['Maximum Temperature'].values, mlp_max.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Minimum temperature'].values, mlp_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Maximum Temperature'].values, mlp_max.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["Decision Tree",
            '{:.2f}'.format(r2_score(test_dataset['Minimum temperature'].values, dt_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(r2_score(test_dataset['Maximum Temperature'].values, dt_max.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Minimum temperature'].values, dt_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Maximum Temperature'].values, dt_max.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["KNN",
            '{:.2f}'.format(r2_score(test_dataset['Minimum temperature'].values, knn_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(r2_score(test_dataset['Maximum Temperature'].values, knn_max.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Minimum temperature'].values, knn_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Maximum Temperature'].values, knn_max.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["Random forest",
            '{:.2f}'.format(r2_score(test_dataset['Minimum temperature'].values, rf_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(r2_score(test_dataset['Maximum Temperature'].values, rf_max.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Minimum temperature'].values, rf_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Maximum Temperature'].values, rf_max.predict(test_data_StandardScaler[0:61])))])

rt.add_row(["SVM",
            '{:.2f}'.format(r2_score(test_dataset['Minimum temperature'].values, svr_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(r2_score(test_dataset['Maximum Temperature'].values, svr_max.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Minimum temperature'].values, svr_min.predict(test_data_StandardScaler[0:61]))),
            '{:.2f}'.format(mean_squared_error(test_dataset['Maximum Temperature'].values, svr_max.predict(test_data_StandardScaler[0:61])))])

print(rt)

+---------------+----------------------------+----------------------------+-----------------------------+-----------------------------+
| Algorithm     | r2 for minimum temperature | r2 for maximum temperature | MSE for minimum temperature | MSE for maximum temperature |
+---------------+----------------------------+----------------------------+-----------------------------+-----------------------------+
| MLP           |           -2.25            |           -8.53            |            30.47            |            271.98           |
| Decision Tree |           -0.33            |           -4.91            |            12.50            |            168.73           |
| KNN           |           -1.54            |           -1.56            |            23.78            |            73.16            |
| Random forest |           -2.53            |           -6.99            |            33.06            |            228.02           |
| SVM           |           -0.89            |  

In [130]:
dataset_wt_weather_features.to_csv('dataset_with_public_holidays.csv')

# Daily global solar exposure

In [131]:
Y_train_sun = dataset_wt_weather_features['Daily global solar exposure (MJ/m*m)']
# clean
Y_train_sun = Y_train_sun.fillna(Y_train_sun.mean())

In [132]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train) 
X_train_StandardScaler

array([[-1.44730998e+00, -1.71507061e+00, -1.58631120e+00,
        -9.38637205e-04],
       [-1.44730998e+00, -1.70542213e+00, -1.58631120e+00,
         4.99354993e-01],
       [-1.44730998e+00, -1.69577366e+00, -1.58631120e+00,
         9.99648624e-01],
       ...,
       [ 1.53055614e+00,  1.19876862e+00,  1.06833203e+00,
        -9.38637205e-04],
       [ 1.53055614e+00,  1.20841709e+00,  1.06833203e+00,
         4.99354993e-01],
       [ 1.53055614e+00,  1.21806557e+00,  1.06833203e+00,
         9.99648624e-01]])

In [133]:
# test_date
# test_date_StandardScaler = scaler.fit_transform(test_date) 
test_date_StandardScaler

NameError: ignored

In [None]:
#  ML models
mlp_reg_sun = MLPRegressor(random_state=42)
dct_sun = DecisionTreeRegressor(splitter='random',  max_depth=36, min_samples_split=140, random_state=42)
knn_sun = KNeighborsRegressor(metric = 'manhattan', n_neighbors = 4)
rf_sun = RandomForestRegressor(random_state=42)
svm_sun = SVR(kernel = 'rbf', gamma = 100, C = 100, degree = 0) #Using Radial-basis function kernel


In [None]:
mlp_reg_sun.fit(X_train_StandardScaler, Y_train_sun)
dct_sun.fit(X_train_StandardScaler, Y_train_sun)
knn_sun.fit(X_train_StandardScaler, Y_train_sun)
rf_sun.fit(X_train_StandardScaler, Y_train_sun)
svm_sun.fit(X_train_StandardScaler, Y_train_sun)

In [None]:
y_pred_sun_mlp = mlp_reg_sun.predict(test_date_StandardScaler)
y_pred_sun_dt = dct_sun.predict(test_date_StandardScaler)
y_pred_sun_knn = knn_sun.predict(test_date_StandardScaler)
y_pred_sun_rf = rf_sun.predict(test_date_StandardScaler)
y_pred_sun_svm = svm_sun.predict(test_date_StandardScaler)
y_pred_sun_svm

In [None]:
# from sklearn.metrics import r2_score
# from prettytable import PrettyTable

rt = PrettyTable(["Algorithm", "r2", "MSE"])
rt.align["Algorithm"] = "l" # Left align city names
rt.padding_width = 1 # One space between column edges and contents (default)
rt.add_row(["MLP",
            '{:.2f}'.format(r2_score(test_dataset['Solar Exposure'].values, y_pred_sun_mlp[0:61])),
            '{:.2f}'.format(mean_squared_error(test_dataset['Solar Exposure'].values, y_pred_sun_mlp[0:61]))])

rt.add_row(["Decision Tree",
            '{:.2f}'.format(r2_score(test_dataset['Solar Exposure'].values, y_pred_sun_dt[0:61])),
            '{:.2f}'.format(mean_squared_error(test_dataset['Solar Exposure'].values, y_pred_sun_dt[0:61]))])

rt.add_row(["KNN",
            '{:.2f}'.format(r2_score(test_dataset['Solar Exposure'].values, y_pred_sun_knn[0:61])),
            '{:.2f}'.format(mean_squared_error(test_dataset['Solar Exposure'].values, y_pred_sun_knn[0:61]))])

rt.add_row(["Random Forest",
            '{:.2f}'.format(r2_score(test_dataset['Solar Exposure'].values, y_pred_sun_rf[0:61])),
            '{:.2f}'.format(mean_squared_error(test_dataset['Solar Exposure'].values, y_pred_sun_rf[0:61]))])

rt.add_row(["SVM",
            '{:.2f}'.format(r2_score(test_dataset['Solar Exposure'].values, y_pred_sun_svm[0:61])),
            '{:.2f}'.format(mean_squared_error(test_dataset['Solar Exposure'].values, y_pred_sun_svm[0:61]))])

print(rt)

In [None]:
scores = []
kernel = []
gamma = []
C = []
degree = []

kernels = ['linear', 'rbf']
gammas = [0.1, 1, 10, 100]
Cs = [0.1, 1, 10, 100]
degrees = [0, 1, 2, 3, 4, 5, 6]


for i in kernels:
  for j in gammas:
    for k in Cs:
        for l in degrees: 
          print(i,j,k,l)
          svm = SVR(kernel=i, gamma=j, C=k, degree=l)
          svm.fit(X_train_StandardScaler, Y_train_sun)
          scores.append(mean_squared_error(test_dataset['Solar Exposure'].values, svm.predict(test_date_StandardScaler[0:61])))
          kernel.append(i)
          gamma.append(j)
          C.append(k)
          degree.append(l)

In [None]:
min(scores)
import numpy as np
index = np.argmin(scores)
index

In [None]:
p = [kernel[217], gamma[217], C[217], degree[217]]
p