In [3]:
import pandas as pd
from sklearn.metrics import r2_score
import suncalc

from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV


latitude = 46.5095
longitude = 6.6243

In [4]:
training_set = pd.read_csv("training.csv")
test_set = pd.read_csv("test_students.csv")
x = training_set.drop(['solar_production'], axis=1).copy()
y = training_set['solar_production'].copy()

In [5]:
x['Date'] = pd.to_datetime(x['Date'])
x.sort_values(by='Date', ascending=True)
x['Date'] = x['Date'] - pd.Timedelta(hours=2)
x['timestamp'] = x['Date'].apply(lambda x: int(x.timestamp()))
x['day'] = x['Date'].dt.day
x['month'] = x['Date'].dt.month
x['hour'] = x['Date'].dt.hour
x['minute'] = x['Date'].dt.minute
x['timeofday'] = x['minute'] + x['hour'] * 60
x['timeofyear'] = (x['Date'].dt.dayofyear * 24 + x['hour']) * 60 + x['minute']

In [6]:
latitude = 46.5095
longitude = 6.6243

sun_pos = pd.DataFrame.from_dict(
    x['Date'].apply(lambda d: suncalc.get_position(d.to_pydatetime(), longitude, latitude)).to_dict(), orient='index')
sun_time = pd.DataFrame.from_dict(
    x['Date'].apply(lambda d: suncalc.get_times(d.to_pydatetime(), longitude, latitude)).to_dict(), orient='index')
sun_time = sun_time.applymap(pd.to_datetime)
sun_time = sun_time.apply(lambda x: x.dt.second + x.dt.minute * 60 + x.dt.hour * 3600)
sun_time = sun_time.subtract(x['timeofday'], axis=0)
x = pd.concat([x, sun_pos, sun_time], axis=1)

In [7]:
x = x.drop(['Unnamed: 0', 'Date', 'conditions'], axis=1)

# Create a boolean mask that is true for rows where column A is not NaN
mask = x['temp'].notna()

# Use the mask to select the rows where column A is not NaN and column B has NaN
x.loc[mask & x['uvindex'].isna(), 'uvindex'] = 0
x.loc[mask & x['solarenergy'].isna(), 'solarenergy'] = 0

In [8]:
scaler = MinMaxScaler()
scaler.fit(x)
x = pd.DataFrame(scaler.transform(x), columns=x.columns)

imp = KNNImputer(n_neighbors=3, weights="distance")
x = pd.DataFrame(imp.fit_transform(x), columns=x.columns)

In [9]:
c1 = -42.379
c2 = 2.04901523
c3 = 10.14333127
c4 = -0.22475541
c5 = -6.83783e-03
c6 = -5.481717e-02
c7 = 1.22874e-03
c8 = 8.5282e-04
c9 = -1.99e-06
x['heat_index'] = c1 + c2 * x['temp'] + c3 * x['humidity'] + c4 * x['temp'] * x['humidity'] + c5 * x[
    'temp'] ** 2 + c6 * x['humidity'] ** 2 + c7 * x['temp'] ** 2 * x['humidity'] + c8 * x['temp'] * x[
                      'humidity'] ** 2 + c9 * x['temp'] ** 2 * x['humidity'] ** 2

In [10]:
x = x.drop(['snow', 'snowdepth', 'winddir', 'minute', 'hour', 'month',  'visibility', 'windspeed', 'solarradiation'], axis=1)

x = x.drop(sun_time.columns, axis=1)
x = x.drop(sun_pos.columns, axis=1)

In [11]:
# sklearn-genetic-opt
from sklearn_genetic import GASearchCV
from sklearn_genetic import ExponentialAdapter
from sklearn_genetic.space import Continuous, Categorical, Integer
from sklearn.model_selection import KFold

mutation_adapter = ExponentialAdapter(initial_value=0.8, end_value=0.2, adaptive_rate=0.1)
crossover_adapter = ExponentialAdapter(initial_value=0.2, end_value=0.8, adaptive_rate=0.1)
from sklearn.model_selection import train_test_split, StratifiedKFold
cv = KFold(n_splits=3, shuffle=False)


param_grid = {
    'loss': Categorical(['huber']),
    'learning_rate': Continuous(0.04, 0.06, distribution='log-uniform'),
    'n_estimators': Integer(60, 80),
    'subsample': Continuous(0.01, 0.5, distribution='log-uniform'),
    'criterion': Categorical(['friedman_mse']),
    'min_samples_split': Integer(30,50),
    'min_samples_leaf': Integer(40,60),
    'max_depth': Integer(4, 10),
    'min_impurity_decrease': Continuous(0.2, 0.5, distribution='log-uniform'),
    'max_leaf_nodes': Integer(5,15),
    'random_state': Integer(42,42)
}

model = GradientBoostingRegressor()

grid_search = GASearchCV(estimator=model,
                         cv=cv,
                         scoring='r2',
                         population_size=20,
                         generations=25,
                         mutation_probability=mutation_adapter,
                         crossover_probability=crossover_adapter,
                         param_grid=param_grid,
                         n_jobs=-1)

In [12]:
grid_search.fit(x, y)
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.809562	0.0351015  	0.835518   	0.686627   
1  	40    	0.826018	0.005771   	0.835518   	0.81291    
2  	40    	0.829815	0.00466053 	0.836292   	0.815953   
3  	40    	0.831911	0.00115365 	0.832726   	0.828233   
4  	40    	0.832279	0.00076675 	0.833566   	0.830447   
5  	40    	0.832889	0.000426695	0.833587   	0.832273   
6  	40    	0.833374	0.000647968	0.83457    	0.832353   
7  	40    	0.834376	0.000982133	0.835946   	0.833566   
8  	40    	0.835583	0.00101119 	0.83714    	0.833824   
9  	40    	0.836139	0.000683415	0.83714    	0.833765   
10 	40    	0.836556	0.000545942	0.83714    	0.835613   
11 	40    	0.836735	0.000629241	0.83714    	0.834967   
12 	40    	0.83714 	0          	0.83714    	0.83714    
13 	40    	0.83714 	0          	0.83714    	0.83714    
14 	40    	0.83714 	0          	0.83714    	0.83714    
15 	40    	0.83714 	0          	0.83714    	0.83714    
16 	40    	0.83714 	0          	0.83714    	0.83

In [14]:
model = GradientBoostingRegressor(tol=1e-10, learning_rate=0.07, loss='huber', min_samples_leaf=15, min_samples_split=50, random_state=42)
model.fit(x, y)

In [15]:
res = pd.DataFrame(cross_validate(estimator=model,
                                  X=x,
                                  y=y,
                                  cv=10,
                                  scoring=['r2']))
res.median()

fit_time      2.181290
score_time    0.002000
test_r2       0.786743
dtype: float64

In [16]:
res.median()

In [None]:
0.7902499128274068