In [1]:
#---------------------------------------------------------------------------------------------LIBRARIES--------------------------------------------------------------------------------------------                                                                                         #import OpenCV2 library for image processing and algorithms
import math
import csv 
import numpy as np                                                                                     #import numpy mathematical library
import pandas as pd
import pickle
import random
import matplotlib.pyplot as plt      #import matplotlib library for plotting
from micromlgen import port

from IPython.core.display import display, HTML                                    
display(HTML("<style>.container { width:100% !important; }</style>"))                                  #change width of Jupyer Notebook to use the whole window resolution availab

# Fitting Random Forest Regression to the dataset
# import the regressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

In [12]:
#20 years (2000-2019) of temperature, humidity, pressure averages for each calendar day
train_set_name = "weather_data_2020_2021"
train_set_path = "./" + train_set_name + ".csv"
train_dataset = pd.read_csv(train_set_path,header=None)
test_dataset = pd.read_csv('./weather_data_2019.csv',header=None) 

print("Your Train Dataset Path is: ", train_set_path)                                                                       #display dataset name to user

#training features
temperature_train = train_dataset.iloc[1:, 4:5]
feels_like_train = train_dataset.iloc[1:, 7:8]
dew_point_train = train_dataset.iloc[1:, 8:9]
humidity_train = train_dataset.iloc[1:, 9:10]
pressure_train = train_dataset.iloc[1:, 19:20] 
uv_index_train = train_dataset.iloc[1:, 24:25] 

#testing data
temperature_test = test_dataset.iloc[1:, 4:5]
feels_like_test = test_dataset.iloc[1:, 7:8]
dew_point_test = test_dataset.iloc[1:, 8:9]
humidity_test = test_dataset.iloc[1:, 9:10]
pressure_test = test_dataset.iloc[1:, 19:20]
uv_index_test = test_dataset.iloc[1:, 24:25] 

X_train = pd.concat([temperature_train, feels_like_train, dew_point_train, humidity_train, pressure_train, uv_index_train], axis=1)
X_train.replace(([np.inf, -np.inf], np.nan), inplace=True)                                                            #replace any infinite values with nan
X_train = X_train.to_numpy()  

y_train = train_dataset.iloc[1:, 10:11].values

#https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
test_data = pd.concat([temperature_test, feels_like_test, dew_point_test, humidity_test, pressure_test, uv_index_test], axis=1)        #load the first 3 columns for temp, hum and pressure, convert to numpy array
test_data.replace(([np.inf, -np.inf], np.nan), inplace=True)                                                            #replace any infinite values with nan
test_array = test_data.to_numpy()

#change all nan values in all datasets with the mean of the rest of the data
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_train)
imp.fit(y_train)
imp.fit(test_array)

disp = int(input("Please choose 1 to display the dataset or any button to cotinue without displaying!"))
if disp == 1:
    display(X_train)
    display(y_train)
    
    display(test_data)
else: print("Not displaying dataset!")   
    
#------------------------------------------------------------------------------------------
model_name = "decisiontree_r_optimised_" + train_set_name + "_6f.p"
choice = int(input("Please choose 1 for Training and 2 for GridSearch. Pressing 3 will load the optimised Decision Tree Regressor model!"))
if choice == 1:
    # create a regressor object - hyperparameters found on dataset based on GridSearchCV
    regressor = DecisionTreeRegressor(random_state = 0)

    # fit the regressor with X and Y data
    regressor.fit(X_train, y_train)
    
    csv_name = "output_dtr_default_" + train_set_name + "_6f.csv"
    with open(csv_name, "a", newline='') as fp:
        for row in test_array:
            result = float(str(regressor.predict(row.reshape(1, -1))).replace('[','').replace(']',''))     #remove double brackets(list of lists) by converting to string, then convert back to float
            wr = csv.writer(fp, dialect='excel')
            wr.writerow([result])                                                                      #write the result to the specific row based on iteration number
                                                                                                       #result needs to be in a list
            print("Current row is: ", row)
            print("Your result is: ", result)
            
elif choice == 2:
    decision_tree_regressor = DecisionTreeRegressor()
    parameters={"splitter":["best","random"],
               "max_depth" : [1,3,5,7,9,11,12],
               "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
               "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5],
               "max_features":["auto","log2","sqrt",None],
               "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90],
               "random_state":[0]}
    grid = GridSearchCV(decision_tree_regressor,param_grid=parameters,scoring='neg_mean_squared_error', cv=3,verbose=1)
    
    grid.fit(X_train, y_train)
    pickle.dump(grid, open(model_name, "wb"))
    
elif choice == 3:
    rfr = pickle.load(open(model_name, "rb"))
    csv_name = "output_dtr_optimised_" + train_set_name + "_6f.csv"
    with open(csv_name, "a", newline='') as fp:
        for row in test_array:
            result = float(str(rfr.predict(row.reshape(1, -1))).replace('[','').replace(']',''))
            wr = csv.writer(fp, dialect='excel')
            wr.writerow([result])
            
            print("Current row is: ", row)
            print("Your result is: ", result)
    fp.close()

Your Train Dataset Path is:  ./weather_data_2020_2021.csv
Please choose 1 to display the dataset or any button to cotinue without displaying!3
Not displaying dataset!
Please choose 1 for Training and 2 for GridSearch. Pressing 3 will load the optimised Decision Tree Regressor model!3
Current row is:  ['8.5' '6.9' '4.6' '77.03' '1035.7' '1']
Your result is:  0.14188679
Current row is:  ['5.1' '3.4' '0.4' '71.83' '1043.1' '1']
Your result is:  0.14188679
Current row is:  ['4.7' '3.7' '-1.2' '65.73' '1043.2' '1']
Your result is:  0.14188679
Current row is:  ['3' '1.4' '-1.3' '74' '1042.3' '1']
Your result is:  0.14188679
Current row is:  ['4.4' '2.2' '0.8' '77.73' '1039.1' '1']
Your result is:  0.14188679
Current row is:  ['7.3' '5.8' '4.2' '80.9' '1036.8' '1']
Your result is:  0.14188679
Current row is:  ['9.3' '8.1' '6.6' '83.65' '1032.3' '1']
Your result is:  0.14188679
Current row is:  ['6.9' '4.1' '2' '71.22' '1028.1' '2']
Your result is:  0.14188679
Current row is:  ['3.9' '0.7' '0.