In [135]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

import pandas as pd

import pickle

In [136]:
dataset = pd.read_csv("training.csv")

In [137]:
fields_to_delete = ['Unnamed: 0', 'Date', 'precip', 'snow', 'snowdepth', 'windspeed', 'winddir', 'conditions', 'solar_production']
fields_to_keep = ['temp', 'humidity', 'visibility', 'cloudcover', 'solarradiation', 'solarenergy', 'uvindex']

In [138]:
dates = dataset['Date']

timestamps = pd.to_datetime(dates)

hours = timestamps.dt.hour
months = timestamps.dt.month

dataset = dataset.dropna()

dataset = dataset.drop(fields_to_delete, axis=1)
dataset['hour'] = hours
dataset['month'] = months

dataset

Unnamed: 0,temp,humidity,visibility,cloudcover,solarradiation,solarenergy,uvindex,hour,month
0,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,2,7
1,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,2,7
2,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,2,7
3,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,2,7
4,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,2,7
...,...,...,...,...,...,...,...,...,...
17851,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,1,9
17852,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,1,9
17853,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,1,9
17854,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,1,9


In [139]:
train_set = dataset.sample(frac=0.8, random_state=42)
test_set = dataset.drop(train_set.index)

In [149]:
X_train = train_set.copy()
X_test = test_set.copy()

y_train = pd.DataFrame()
y_test = pd.DataFrame()

y_train = y_train.assign(**X_train[['visibility', 'solarradiation', 'solarenergy', 'uvindex']])
y_test = y_test.assign(**X_test[['visibility', 'solarradiation', 'solarenergy', 'uvindex']])

X_train = X_train.drop(columns=['visibility', 'solarradiation', 'solarenergy', 'uvindex'])
X_test = X_test.drop(columns=['visibility', 'solarradiation', 'solarenergy', 'uvindex'])

X_train

Unnamed: 0,temp,humidity,cloudcover,hour,month
2546,27.888889,39.01,21.4,22,4
17535,11.888889,74.34,68.0,23,9
12295,23.388889,43.03,2.4,18,9
14974,16.888889,45.21,25.0,1,9
16278,10.944444,96.90,88.0,14,9
...,...,...,...,...,...
1616,26.611111,30.73,2.4,16,1
16697,13.888889,63.15,94.3,1,9
11767,20.888889,73.34,68.0,22,6
11741,23.333333,59.56,54.8,20,6


In [151]:
y_train

Unnamed: 0,visibility,solarradiation,solarenergy,uvindex
2546,21.7,60.0,0.2,1.0
17535,6.2,30.0,0.1,0.0
12295,18.6,21.0,0.1,0.0
14974,6.2,14.0,0.1,0.0
16278,6.2,14.0,0.1,0.0
...,...,...,...,...
1616,28.0,930.0,3.3,9.0
16697,6.2,9.0,0.0,0.0
11767,5.9,9.0,0.0,0.0
11741,6.2,30.0,0.1,0.0


In [144]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
r2_score = model.score(X_test, y_test)

pickle.dump(model, open("predict_missing_model.pickle", "wb"))
r2_score

0.999998229364789

In [150]:
predicted = model.predict(X_test)
print(predicted)

[[16.75928464 95.17410895  0.3801521   0.37960376]
 [16.75928464 95.17410895  0.3801521   0.37960376]
 [16.75928464 95.17410895  0.3801521   0.37960376]
 ...
 [ 2.10407768 16.32310475  0.15508814 -0.27134432]
 [ 2.10407768 16.32310475  0.15508814 -0.27134432]
 [ 2.10407768 16.32310475  0.15508814 -0.27134432]]


In [145]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
r2_score = model.score(X_test, y_test)
r2_score

0.3267123844458155

In [148]:
model = MLPRegressor(hidden_layer_sizes=(10,), activation='relu', solver='adam', max_iter=5000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
r2_score = model.score(X_test, y_test)
r2_score

0.480096058577977