```yaml
titan: v1
service:
  image: scipy
  machine:
    cpu: 2
    memory: 512MB
```

In [44]:
import time as t
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__) # Try to use 0.24
import json

Pandas   1.2.3
Sklearn  0.24.1


In [20]:
# Reading the dataset from a Gitlab repo
url = "https://storage.googleapis.com/tutorial-datasets/weather_data_GER_2016.csv"
weather = pd.read_csv(url)

In [21]:
weather.head()

Unnamed: 0,timestamp,cumulated hours,lat,lon,v1,v2,v_50m,h1,h2,z0,SWTDN,SWGDN,T,rho,p
0,2016-01-01T00:00:00Z,0,47.5,5.625,0.81,1.88,3.36,2,10,0.052526,0.0,0.0,277.350159,1.236413,99282.710938
1,2016-01-01T01:00:00Z,1,47.5,5.625,0.77,1.61,2.63,2,10,0.05251,0.0,0.0,277.025665,1.23939,99300.164062
2,2016-01-01T02:00:00Z,2,47.5,5.625,0.66,1.22,1.89,2,10,0.052495,0.0,0.0,277.223755,1.243861,99310.992188
3,2016-01-01T03:00:00Z,3,47.5,5.625,0.96,1.35,1.62,2,10,0.05248,0.0,0.0,277.13324,1.24739,99314.773438
4,2016-01-01T04:00:00Z,4,47.5,5.625,1.14,1.56,1.83,2,10,0.05248,0.0,0.0,276.867767,1.248869,99324.796875


Next, we read the weather data for Germany in 2016 by reading the full csv file.

The data in the file contains the following:

* wind
  * v1: velocity [m/s] @ height h1 (2 meters above displacement height)
  * v2: velocity [m/s] @ height h2 (10 meters above displacement height)
  * v_50m: velocity [m/s] @ 50 meters above ground
  * h1: height above ground [m] (h1 = displacement height +2m)
  * h2: height above ground [m] (h2 = displacement height +10m)
  * z0: roughness length [m]
* solar parameters:
  * SWTDN: total top-of-the-atmosphere horizontal radiation [W/m²]
  * SWGDN: total ground horizontal radiation [W/m²]
* temperature data
  * T: Temperature [K] @ 2 meters above displacement height (see h1)
* air data
  * Rho: air density [kg/m³] @ surface
  *p: air pressure [Pa] @ surface

In [22]:
# Reading the dataset from a Gitlab repo
url = "https://storage.googleapis.com/tutorial-datasets/time_series_60min_singleindex_filtered.csv"
production = pd.read_csv(url)


In [23]:
production.head()


Unnamed: 0,utc_timestamp,cet_cest_timestamp,DE_wind_generation_actual
0,2015-12-31T23:00:00Z,2016-01-01T00:00:00+0100,8638
1,2016-01-01T00:00:00Z,2016-01-01T01:00:00+0100,8579
2,2016-01-01T01:00:00Z,2016-01-01T02:00:00+0100,8542
3,2016-01-01T02:00:00Z,2016-01-01T03:00:00+0100,8443
4,2016-01-01T03:00:00Z,2016-01-01T04:00:00+0100,8295


In [24]:
# Merge datasets
weather_by_day = weather.groupby(weather.index).mean()
combined = pd.merge(production, weather_by_day, how='left', left_index=True, right_index=True)



In [25]:
combined.head()

Unnamed: 0,utc_timestamp,cet_cest_timestamp,DE_wind_generation_actual,cumulated hours,lat,lon,v1,v2,v_50m,h1,h2,z0,SWTDN,SWGDN,T,rho,p
0,2015-12-31T23:00:00Z,2016-01-01T00:00:00+0100,8638,0,47.5,5.625,0.81,1.88,3.36,2,10,0.052526,0.0,0.0,277.350159,1.236413,99282.710938
1,2016-01-01T00:00:00Z,2016-01-01T01:00:00+0100,8579,1,47.5,5.625,0.77,1.61,2.63,2,10,0.05251,0.0,0.0,277.025665,1.23939,99300.164062
2,2016-01-01T01:00:00Z,2016-01-01T02:00:00+0100,8542,2,47.5,5.625,0.66,1.22,1.89,2,10,0.052495,0.0,0.0,277.223755,1.243861,99310.992188
3,2016-01-01T02:00:00Z,2016-01-01T03:00:00+0100,8443,3,47.5,5.625,0.96,1.35,1.62,2,10,0.05248,0.0,0.0,277.13324,1.24739,99314.773438
4,2016-01-01T03:00:00Z,2016-01-01T04:00:00+0100,8295,4,47.5,5.625,1.14,1.56,1.83,2,10,0.05248,0.0,0.0,276.867767,1.248869,99324.796875


In [26]:
X = combined[['v1', 'v2', 'v_50m', 'z0']]
y = combined['DE_wind_generation_actual']

In [27]:
cat_vars  = []       # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['v1', 'v2', 'v_50m', 'z0']
print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['v1', 'v2', 'v_50m', 'z0']

Categorical features:
 []


In [28]:
# YOUR CODE HERE
num_4_treeModels = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(strategy='mean', add_indicator=False))
])

cat_4_treeModels = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
  ('ordinal', preprocessing.OrdinalEncoder(categories='auto', handle_unknown='use_encoded_value', unknown_value=99999999)) 
  # ('ordinal', preprocessing.OrdinalEncoder(categories='auto', handle_unknown='ignore'))
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
    ('cat', cat_4_treeModels, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro

In [29]:
num_4_multi_models = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(strategy='mean', add_indicator=False)),
  ('scaler',  preprocessing.StandardScaler())
])

cat_4_treeModels = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
  ('onehot', preprocessing.OneHotEncoder(categories='auto', handle_unknown='ignore')) 
]) #handle_unknown='use_encoded_value', unknown_value=99999999 --> what is this ?

multi_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_multi_models, num_vars),
    ('cat', cat_4_treeModels, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

multi_prepro

In [31]:
from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from sklearn.ensemble      import HistGradientBoostingRegressor
from catboost import CatBoostRegressor



In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge



In [47]:
# YOUR CODE HERE
#we dont need scaling/norm for these models
#ordinal encoding (sometimes binary) for categorical 
tree_regressor = {
  "Decision Tree": DecisionTreeRegressor(),
  "Extra Trees":ExtraTreesRegressor(),
  "Random Forest":RandomForestRegressor(),
  "AdaBoost":AdaBoostRegressor(),
  "GBM_regressor":GradientBoostingRegressor(),
  "HGB_regressor":HistGradientBoostingRegressor(),
  "CatBoost":CatBoostRegressor()
}

#we need to do scaling/norm for these models
#one hot encoding for categorical
mult_regressor= {
    "Linear_regression": LinearRegression(),
    "Ridge_regessor": Ridge(),
    "SVM_regressor":SVR(),
    "MLP_regressor":MLPRegressor(),
    "SDG_regressor": SGDRegressor(),
    "KNN_regressor": KNeighborsRegressor(),
    "Br_regressor": BayesianRidge(),
}


tree_models_regressor = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_regressor.items()}
multi_models_regressor = {name: pipeline.make_pipeline(multi_prepro, model) for name, model in mult_regressor.items()}


tree_models_regressor["Decision Tree"]

In [45]:
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(X, y, test_size=0.2, random_state=0)

results = pd.DataFrame({'Model': [], 'MSE': [], 'RMSE': [],'RMSLE': [], 'Time': [],})

for name, pipe in tree_models_regressor.items():

    start_time = t.time()

    preds = pipe.fit(x_train,y_train).predict(x_valid)

    total_time = t.time() - start_time

    results = results.append({"Model": name,
                              "MSE":   metrics.mean_squared_error(y_valid, preds),
                              "RMSE":  metrics.mean_squared_error(y_valid, preds,squared=False),
                              'RMSLE': metrics.mean_squared_log_error(y_valid, abs(preds)),
                              "Time":  total_time},
                              ignore_index=True)


    results_ord = results.sort_values(by=['RMSLE'], ascending=True, ignore_index=True)
    results_ord.index += 1 
    clear_output()
    display(results_ord.style.bar(subset=['MSE', 'RMSE','RMSLE'], vmin=0, color='#5fba7d'))

Unnamed: 0,Model,MSE,RMSE,RMSLE,Time
1,Random Forest,15243706.105171,3904.318904,0.335793,7.584455
2,Extra Trees,19910872.952702,4462.160122,0.428087,4.780494
3,CatBoost,23271027.994184,4824.005389,0.50295,11.278654
4,Decision Tree,26999230.109277,5196.078339,0.523677,0.29838
5,HGB_regressor,24899515.274896,4989.94141,0.532458,4.793483
6,GBM_regressor,29564841.300629,5437.356095,0.610963,2.064678
7,AdaBoost,43132992.370939,6567.571269,0.860794,0.508768


In [48]:
# Apply linear regression
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(X, y, test_size=0.2, random_state=0)

results = pd.DataFrame({'Model': [], 'MSE': [], 'RMSE': [],'RMSLE': [], 'Time': [],})

for name, pipe in multi_models_regressor.items():

    start_time = t.time()

    preds = pipe.fit(x_train,y_train).predict(x_valid)

    total_time = t.time() - start_time

    results = results.append({"Model": name,
                              "MSE":   metrics.mean_squared_error(y_valid, preds),
                              "RMSE":  metrics.mean_squared_error(y_valid, preds,squared=False),
                              'RMSLE': metrics.mean_squared_log_error(y_valid, abs(preds)),
                              "Time":  total_time},
                              ignore_index=True)


    results_ord = results.sort_values(by=['RMSLE'], ascending=True, ignore_index=True)
    results_ord.index += 1 
    clear_output()
    display(results_ord.style.bar(subset=['MSE', 'RMSE','RMSLE'], vmin=0, color='#5fba7d'))

Unnamed: 0,Model,MSE,RMSE,RMSLE,Time
1,SVM_regressor,48894352.651567,6992.449689,0.726269,14.893092
2,KNN_regressor,42910101.663472,6550.580254,0.76909,0.15666
3,Linear_regression,43891883.063369,6625.094947,0.790801,0.061358
4,Ridge_regessor,43891993.790198,6625.103304,0.790861,0.102893
5,Br_regressor,43893593.8181,6625.224058,0.79103,0.171635
6,SDG_regressor,44100376.721704,6640.811451,0.791133,0.213521
7,MLP_regressor,52146700.417981,7221.267231,0.815078,19.572862


In [58]:
X_train, X_test , y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
model = SGDRegressor()
model.fit(X_train, y_train)

In [60]:
# GET /alphas
print(f'alpha = {model.intercept_}')

alpha = [7531.62092172]


In [61]:
# GET /betas
print(f'betas = {model.coef_}')

betas = [-3169.87542948  2584.47097122  -201.83445803   327.7257001 ]


In [62]:
# Mock request object for local API testing
headers = {
'content-type': 'application/json'
}
body = json.dumps({
  "data": [[1.44, 1.77, 2, 0.054]]
})
REQUEST = json.dumps({ 'headers': headers, 'body': body })

In [63]:
# POST /prediction
body = json.loads(REQUEST)['body']
# predict the cluster for new samples. Function to be exposed through Titan
input_params = json.loads(body)['data']
#input_params = [[0.44, 1.77, 2, 0.054]]
print(model.predict(input_params))

[7155.54219409]
