In [1]:
import pickle

import pandas as pd

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

In [2]:
fields_to_delete = ['Unnamed: 0', 'Date', 'precip', 'snow', 'snowdepth', 'windspeed', 'winddir', 'conditions']
fields_to_keep = ['temp', 'humidity', 'visibility', 'cloudcover', 'solarradiation', 'solarenergy', 'uvindex']
fields_to_predict_missing = ['temp', 'humidity', 'cloudcover', 'hour', 'month']

# Load training set and clean the data for training

Missing values for the columns {visibility, solarradiation, solarenergy, uvindex} get predicted by another Model (RandomForestRegressor)

In [3]:
dataset = pd.read_csv("training.csv")

In [4]:
# Remove irrelevant columns from Dataset
dates = dataset['Date']

timestamps = pd.to_datetime(dates)

hours = timestamps.dt.hour
months = timestamps.dt.month

dataset = dataset.drop(fields_to_delete, axis=1)
dataset['month'] = months
dataset['hour'] = hours

dataset

Unnamed: 0,temp,humidity,visibility,cloudcover,solarradiation,solarenergy,uvindex,solar_production,month,hour
0,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,0.0,7,2
1,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,0.0,7,2
2,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,0.0,7,2
3,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,0.0,7,2
4,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,0.0,7,2
...,...,...,...,...,...,...,...,...,...,...
17851,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,0.0,9,1
17852,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,0.0,9,1
17853,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,0.0,9,1
17854,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,0.0,9,1


In [5]:
dataset = dataset.dropna(subset=fields_to_predict_missing)

In [6]:
# Import pretrained model to predict missing values of the fields: Visibility, Solarradiation, Solarenergy, Uvindex
loaded_model = pickle.load(open("predict_missing_model.pickle", "rb"))

In [7]:
set_predict_missing = pd.DataFrame()
set_predict_missing = set_predict_missing.assign(**dataset[fields_to_predict_missing])
set_predict_missing

Unnamed: 0,temp,humidity,cloudcover,hour,month
0,21.277778,51.52,9.5,2,7
1,21.277778,51.52,9.5,2,7
2,21.277778,51.52,9.5,2,7
3,21.277778,51.52,9.5,2,7
4,21.277778,51.52,9.5,2,7
...,...,...,...,...,...
17851,10.888889,76.90,88.0,1,9
17852,10.888889,76.90,88.0,1,9
17853,10.888889,76.90,88.0,1,9
17854,10.888889,76.90,88.0,1,9


In [8]:
predicted_missing = loaded_model.predict(set_predict_missing)
predicted_missing_df = pd.DataFrame(predicted_missing, columns=['visibility', 'solarradiation', 'solarenergy', 'uvindex'])
predicted_missing_df

Unnamed: 0,visibility,solarradiation,solarenergy,uvindex
0,21.7,33.0,0.1,0.0
1,21.7,33.0,0.1,0.0
2,21.7,33.0,0.1,0.0
3,21.7,33.0,0.1,0.0
4,21.7,33.0,0.1,0.0
...,...,...,...,...
17259,5.6,144.0,0.5,1.0
17260,5.6,144.0,0.5,1.0
17261,5.6,144.0,0.5,1.0
17262,5.6,144.0,0.5,1.0


In [9]:
dataset

Unnamed: 0,temp,humidity,visibility,cloudcover,solarradiation,solarenergy,uvindex,solar_production,month,hour
0,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,0.0,7,2
1,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,0.0,7,2
2,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,0.0,7,2
3,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,0.0,7,2
4,21.277778,51.52,21.7,9.5,33.0,0.1,0.0,0.0,7,2
...,...,...,...,...,...,...,...,...,...,...
17851,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,0.0,9,1
17852,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,0.0,9,1
17853,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,0.0,9,1
17854,10.888889,76.90,5.6,88.0,144.0,0.5,1.0,0.0,9,1


In [10]:
# Replace the NaN values of the Dataset by the predicted value
nas = dataset.isna()
for index, row in nas.iterrows():
    try:
        if row['visibility']:
            dataset['visibility'][index] = predicted_missing_df['visibility'][index]
        if row['solarradiation']:
            dataset['solarradiation'][index] = predicted_missing_df['solarradiation'][index]
        if row['solarenergy']:
            dataset['solarenergy'][index] = predicted_missing_df['solarenergy'][index]
        if row['uvindex']:
            dataset['uvindex'][index] = predicted_missing_df['uvindex'][index]
    except KeyError:
        continue
dataset = dataset.dropna(subset=fields_to_keep)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['visibility'][index] = predicted_missing_df['visibility'][index]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['solarradiation'][index] = predicted_missing_df['solarradiation'][index]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['solarenergy'][index] = predicted_missing_df['solarenergy'][index]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stab

# Train the DNN with the training_set

In [11]:
train_set = dataset.sample(frac=0.8, random_state=42)
test_set = dataset.drop(train_set.index)

In [12]:
X_train = train_set.copy()
X_test = test_set.copy()

y_train = X_train.pop('solar_production')
y_test = X_test.pop('solar_production')

In [13]:
X_train

Unnamed: 0,temp,humidity,visibility,cloudcover,solarradiation,solarenergy,uvindex,month,hour
16773,11.444444,79.50,6.200,52.7,7.00,0.000,0.00,9,7
3684,17.000000,68.97,14.260,0.0,202.31,0.732,2.01,8,21
2374,18.944444,59.57,12.400,0.0,175.60,0.660,1.60,4,7
8984,15.944444,83.10,19.220,0.0,175.60,0.660,1.60,8,6
3037,21.333333,69.35,21.700,0.0,898.00,3.200,9.00,6,15
...,...,...,...,...,...,...,...,...,...
1731,19.000000,60.37,19.871,0.0,121.46,0.470,1.06,2,2
10146,14.888889,80.00,5.200,98.0,12.19,0.034,0.00,1,7
9188,21.944444,63.53,15.500,7.9,62.89,0.267,0.48,8,23
8397,19.666667,68.91,19.220,0.0,175.60,0.660,1.60,8,5


In [14]:
model = mlp = MLPRegressor(
    hidden_layer_sizes=(350, 350, 350),
    activation='tanh',
    solver='adam',
    alpha=1e-5,
    learning_rate_init=0.001,
    max_iter=500
)

In [15]:
model.fit(X_train, y_train)

In [16]:
# init Series used for r2 evaluation
res = pd.Series(dtype='float64')
Y_true_tot = pd.Series(dtype='float64')
Y_pred_tot = pd.Series(dtype='float64')

In [17]:
Y_guess = model.predict(X_test)

Y_true_tot = pd.concat((Y_true_tot, y_test))
Y_pred_tot = pd.concat((Y_pred_tot, pd.DataFrame(Y_guess)))
res = pd.concat((res, abs(y_test - Y_guess)))

In [18]:
r2_score(Y_true_tot, Y_pred_tot)

0.9224614656562934

In [446]:
# Only execute whe we got a good result
pickle.dump(model, open("modelX-score.pickle", "wb"))

# Now load the test_set and write the predictions to a csv

In [19]:
# Load testset
test_set = pd.read_csv("test_students.csv")
test_set

Unnamed: 0.1,Unnamed: 0,Date,temp,humidity,precip,snow,snowdepth,windspeed,winddir,visibility,cloudcover,solarradiation,solarenergy,uvindex,conditions
0,0,2022-01-10 02:00:00,8.833333,90.81,0.0,0.0,0.0,2.3,200.0,6.2,50.0,,,,Partially cloudy
1,1,2022-01-10 02:05:00,8.833333,90.81,0.0,0.0,0.0,2.3,200.0,6.2,50.0,,,,Partially cloudy
2,2,2022-01-10 02:10:00,8.833333,90.81,0.0,0.0,0.0,2.3,200.0,6.2,50.0,,,,Partially cloudy
3,3,2022-01-10 02:15:00,8.833333,90.81,0.0,0.0,0.0,2.3,200.0,6.2,50.0,,,,Partially cloudy
4,4,2022-01-10 02:20:00,8.833333,90.81,0.0,0.0,0.0,2.3,200.0,6.2,50.0,,,,Partially cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2299,2299,2022-10-10 01:35:00,,,,,,,,,,,,,
2300,2300,2022-10-10 01:40:00,,,,,,,,,,,,,
2301,2301,2022-10-10 01:45:00,,,,,,,,,,,,,
2302,2302,2022-10-10 01:50:00,,,,,,,,,,,,,


In [20]:
dates = test_set['Date']
timestamps = pd.to_datetime(dates)

hours = timestamps.dt.hour
months = timestamps.dt.month

id_col = test_set['Unnamed: 0']

test_set = test_set.drop(fields_to_delete, axis=1)
for col in test_set.columns:
    if col == 'Date':
        continue
    test_set[col].fillna(test_set[col].mean(), inplace=True)
test_set['month'] = months
test_set['hour'] = hours

X_test = test_set.copy()
X_test

Unnamed: 0,temp,humidity,visibility,cloudcover,solarradiation,solarenergy,uvindex,month,hour
0,8.833333,90.810000,6.200000,50.000000,294.382022,1.060674,2.842697,1,2
1,8.833333,90.810000,6.200000,50.000000,294.382022,1.060674,2.842697,1,2
2,8.833333,90.810000,6.200000,50.000000,294.382022,1.060674,2.842697,1,2
3,8.833333,90.810000,6.200000,50.000000,294.382022,1.060674,2.842697,1,2
4,8.833333,90.810000,6.200000,50.000000,294.382022,1.060674,2.842697,1,2
...,...,...,...,...,...,...,...,...,...
2299,15.100292,73.682421,6.089535,53.099474,294.382022,1.060674,2.842697,10,1
2300,15.100292,73.682421,6.089535,53.099474,294.382022,1.060674,2.842697,10,1
2301,15.100292,73.682421,6.089535,53.099474,294.382022,1.060674,2.842697,10,1
2302,15.100292,73.682421,6.089535,53.099474,294.382022,1.060674,2.842697,10,1


In [21]:
# Load saved model if necessary, else comment this block
model = pickle.load(open("model1-952.pickle", "rb"))

In [22]:
predicted = model.predict(X_test)
predicted

array([0.12427078, 0.12427078, 0.12427078, ..., 0.0107426 , 0.0107426 ,
       0.0107426 ])

In [23]:
# Set values below 0 to 0 (it predicts sometimes values really close to 0 but negative like -0.006 and so on)
for i in range(len(predicted)):
    if predicted[i] < 0:
        predicted[i] = 0.0
predicted

array([0.12427078, 0.12427078, 0.12427078, ..., 0.0107426 , 0.0107426 ,
       0.0107426 ])

In [24]:
df = pd.DataFrame({'id': id_col, 'predicted': predicted})

In [25]:
df.to_csv("V2_DEFFHERREAUX.csv", index=False)