In [9]:
import os
from datetime import datetime as dt
from datetime import timedelta

import pandas as pd
import sklearn.metrics as met

# Directory that holds the pickled dataframes.
# The machine-specific absolute path is kept as the fallback so existing
# setups keep working; set the SONNTAGSFRAGE_DATAFRAMES_PATH environment
# variable to point somewhere else without editing the notebook.
PATH_DATAFRAMES = os.environ.get(
    'SONNTAGSFRAGE_DATAFRAMES_PATH',
    '/Users/andreasditte/Desktop/Private_Projekte/Sonntagsfrage/src/dataframes',
)

In [10]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [11]:
def load_df_from_file(filename, path=PATH_DATAFRAMES):
    """
        Reads a pickled object from disk via ``pd.read_pickle``.

        The location is assembled as ``<path>/<filename>.pkl``, so the filename
        is passed without the ``.pkl`` extension.

        NOTE: unpickling can execute arbitrary code; only load files that come
        from a trusted source.

        :param filename: Name of the pickle file, without the ``.pkl`` suffix.
        :param path: Directory containing the file (defaults to PATH_DATAFRAMES).
        :return: Whatever ``pd.read_pickle`` yields for that file; callers treat it as a pandas.DataFrame.
    """
    pickle_location = f"{path}/{filename}.pkl"
    return pd.read_pickle(pickle_location)

In [12]:
# Load the 'generate_predictions_finish' pickle through the helper, using the
# helper's default dataframe directory.
df_generate_predictions_finish = load_df_from_file(
    filename='generate_predictions_finish',
)

In [13]:
# Show the column labels of the loaded dataframe.
df_generate_predictions_finish.columns

Index(['day_in_month_sin', 'calendar_week_sin', 'weekday_sin', 'dayofyear_sin',
       'month_sin', 'day_in_month_cos', 'calendar_week_cos', 'weekday_cos',
       'dayofyear_cos', 'month_cos', 'nb_days_since_last_survey', 'CDU_CSU',
       'SPD', 'GRUENE', 'FDP', 'LINKE', 'PIRATEN', 'AfD', 'Linke_PDS', 'PDS',
       'REP_DVU', 'Sonstige', 'CDU_CSU_pred', 'SPD_pred', 'GRUENE_pred',
       'FDP_pred', 'LINKE_pred', 'PIRATEN_pred', 'AfD_pred', 'Linke_PDS_pred',
       'PDS_pred', 'REP_DVU_pred', 'Sonstige_pred'],
      dtype='object')

## Get the algorithms to work

In [4]:
# Toy regression problem (scikit-learn's diabetes dataset) used to check that
# the gradient boosting setup runs end to end.
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Hold out 10% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13)

# Hyperparameters unpacked into the gradient boosting regressor.
# 'loss' is deliberately not set: least squares is the default loss, and its
# explicit name differs between scikit-learn releases ('ls' was renamed to
# 'squared_error' in 1.0 and 'ls' was removed in 1.2). Relying on the default
# keeps this cell working on old and new versions alike.
params = {'n_estimators': 500,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01}


In [7]:
# Display the training targets.
# NOTE(review): this shows the whole array; a slice such as y_train[:10]
# would keep the recorded output short.
y_train

array([172.,  91.,  48., 116., 185., 170., 129., 104.,  85.,  97.,  84.,
       170.,  87., 186., 273., 230., 141., 134.,  96.,  94.,  71., 214.,
       235., 252.,  72., 276., 197.,  91., 277., 145.,  52., 131.,  91.,
        65., 102.,  93., 191.,  83., 201.,  96., 118., 168.,  71., 258.,
        84., 108., 185., 198., 262.,  78., 172.,  72.,  40., 243.,  93.,
       279., 306., 111., 217.,  67., 197., 144.,  42.,  43., 246., 113.,
        59., 128., 225., 156., 219.,  64., 163.,  99.,  52., 103.,  90.,
        96., 275., 129.,  48., 264.,  77., 182., 212., 268.,  81.,  85.,
       242., 121., 195., 293.,  83.,  91., 190., 109., 146., 177., 185.,
       233.,  65., 113., 310.,  74., 139.,  68., 109., 158., 150., 158.,
       139.,  52., 210., 143., 160., 104.,  49., 163.,  90.,  55., 101.,
       109., 200., 310., 288., 192., 263., 164., 220.,  96., 202., 143.,
        55., 216., 173., 131., 245., 110., 109.,  59.,  83.,  72.,  48.,
       341.,  92., 232.,  65., 101.,  88., 122., 14

In [5]:
# Train the gradient boosting regressor on the training split.
# fit() returns the fitted estimator, so construction and fitting are chained.
reg = ensemble.GradientBoostingRegressor(**params).fit(X_train, y_train)

# Score the fitted model on the held-out samples and report the error.
mse = mean_squared_error(y_true=y_test, y_pred=reg.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 3006.5641


## Get date and time handling to work properly

In [6]:
# Current local date and time as a datetime object.
dt.now()

datetime.datetime(2020, 12, 31, 11, 13, 39, 35657)

In [10]:
# Boundaries of the current calendar week; both keep the current time of day.
dt_now = dt.now()
# weekday() is 0 on Monday, so stepping back that many days lands on this
# week's Monday.
start = dt_now - timedelta(days=dt_now.weekday())
# Sunday of the same week, i.e. six days after that Monday.
end = dt_now + timedelta(days=6 - dt_now.weekday())
print(start, end, sep='\n')

2020-12-28 11:14:59.677252
2021-01-03 11:14:59.677252


In [14]:
# Render the week's start as day.month.year, e.g. '28.12.2020'.
start.strftime('%d.%m.%Y')

'28.12.2020'

## Create dataframe with metrics

In [14]:
# Small hand-made sample used to try out the metric functions.
y_true, y_pred = [1, 3, 5, 4, 3, 2], [6, 5, 1, 2, 5, 2]
# Label that is stored next to the metric values.
estimator = 'DecisionTreeRegressor'

In [15]:
# Regression error metrics for the sample values.
mae = met.mean_absolute_error(y_true, y_pred)
mse = met.mean_squared_error(y_true, y_pred)
# RMSE is derived from the MSE instead of calling
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and later removed, whereas the square root of the MSE
# works on every version and avoids computing the MSE a second time.
rmse = np.sqrt(mse)
mape = met.mean_absolute_percentage_error(y_true, y_pred)
r2 = met.r2_score(y_true, y_pred)

# Column order shared by the metric containers below.
metrics_colnames = ['mae', 'mse', 'rmse', 'mape', 'r2', 'estimator']
# One flat record: the metric values followed by the estimator label.
metrics_series = [mae, mse, rmse, mape, r2, estimator]
# The same record wrapped as a one-row table; list() makes it a separate
# object so changes to one container do not leak into the other.
metrics_array = [list(metrics_series)]

In [16]:
# Build a one-row dataframe from the metric record, labelled with the shared
# column names.
df_metrics = pd.DataFrame(data=metrics_array, columns=metrics_colnames)

In [17]:
# Preview the metrics dataframe.
df_metrics.head()

Unnamed: 0,mae,mse,rmse,mape,r2,estimator
0,2.5,8.833333,2.972092,1.272222,-4.3,DecisionTreeRegressor


In [18]:
# Non-null counts per column after dropping every row that contains a NaN ...
print(df_generate_predictions_finish.dropna().count()) # 813
# ... compared with the counts on the unfiltered dataframe.
print(df_generate_predictions_finish.count()) # 814

day_in_month_sin             813
calendar_week_sin            813
weekday_sin                  813
dayofyear_sin                813
month_sin                    813
day_in_month_cos             813
calendar_week_cos            813
weekday_cos                  813
dayofyear_cos                813
month_cos                    813
nb_days_since_last_survey    813
CDU_CSU                      813
SPD                          813
GRUENE                       813
FDP                          813
LINKE                        813
PIRATEN                      813
AfD                          813
Linke_PDS                    813
PDS                          813
REP_DVU                      813
Sonstige                     813
CDU_CSU_pred                 813
SPD_pred                     813
GRUENE_pred                  813
FDP_pred                     813
LINKE_pred                   813
PIRATEN_pred                 813
AfD_pred                     813
Linke_PDS_pred               813
PDS_pred  

In [19]:
# Drop incomplete rows once and take both SPD series from the same filtered
# frame (previously dropna() was evaluated separately for each column).
# Rows are dropped when ANY column is NaN, exactly as before.
df_predictions_complete = df_generate_predictions_finish.dropna()
y_true_spd = df_predictions_complete['SPD']
y_pred_spd = df_predictions_complete['SPD_pred']

In [20]:
# Mean absolute error between the SPD column and its predictions.
# NOTE(review): this rebinds `mae`, which an earlier cell computed for the
# hand-made sample values.
mae = met.mean_absolute_error(y_true=y_true_spd, y_pred=y_pred_spd)
print(mae)

6.204797047970479


In [21]:
# Display the SPD prediction series.
y_pred_spd

Datum_dt
1998-02-05    43.0
1998-02-10    45.0
1998-04-04    45.0
1998-04-07    41.0
1998-04-18    41.0
              ... 
2020-08-20    14.0
2020-09-01    24.0
2020-09-17    26.0
2020-10-15    14.0
2020-11-15    26.0
Name: SPD_pred, Length: 813, dtype: float64

In [22]:
# Collect the same metrics record twice to mimic a table with two rows
# (both entries refer to the one `metrics_series` list).
test = [metrics_series, metrics_series]
print(test)

[[2.5, 8.833333333333334, 2.972092416687835, 1.2722222222222224, -4.3, 'DecisionTreeRegressor'], [2.5, 8.833333333333334, 2.972092416687835, 1.2722222222222224, -4.3, 'DecisionTreeRegressor']]


In [27]:
# Numeric metric values only, without the estimator label.
metrics_series_test = [mae, mse, rmse, mape, r2]
# Despite the variable's name, the values are rounded to four decimal places.
round_to_tenths = list(map(lambda value: round(value, 4), metrics_series_test))
# Show the raw values and the rounded values on separate lines.
print(metrics_series_test, round_to_tenths, sep='\n')

[6.204797047970479, 8.833333333333334, 2.972092416687835, 1.2722222222222224, -4.3]
[6.2048, 8.8333, 2.9721, 1.2722, -4.3]


In [28]:
# Extend the list in place with one more element.
# NOTE(review): this adds the literal text 'estimator', not the value of the
# `estimator` variable, and re-running the cell adds it again. Confirm intended.
metrics_series_test += ['estimator']

In [29]:
# Show the list after the extra element was appended.
print(metrics_series_test)

[6.204797047970479, 8.833333333333334, 2.972092416687835, 1.2722222222222224, -4.3, 'estimator']


In [59]:
# Turn the two-record list into a dataframe and preview it.
df_test = pd.DataFrame(data=test, columns=metrics_colnames)
df_test.head()

Unnamed: 0,mae,mse,rmse,mape,r2,estimator
0,2.5,8.833333,2.972092,1.272222,-4.3,DecisionTreeRegressor
1,2.5,8.833333,2.972092,1.272222,-4.3,DecisionTreeRegressor
