In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

def get_dummy_data(path='./src/data/bakery_sales_dataset_preprocessed.csv'):
    """Load the preprocessed bakery sales CSV and split it into features and targets.

    Args:
        path: Location of the comma-separated file. Its first column is used
            as the row index.

    Returns:
        A ``(features, targets)`` tuple of DataFrames. ``features`` holds the
        ``daytime``, ``weekday``, ``holiday``, ``h_type``, ``weather`` and
        ``temp`` columns; ``targets`` holds every remaining column of the
        file (``date`` appears in neither).
    """
    feature_columns = ['date', 'daytime', 'weekday', 'holiday', 'h_type', 'weather', 'temp']
    sales = pd.read_csv(path, sep=',', index_col=0)

    # Only one year of data is available, so the date (and, for the same
    # reason, the month) is not used as a feature.
    features = sales[feature_columns].drop(columns=['date'])
    targets = sales.drop(columns=feature_columns)
    return features, targets


def get_linear_regression_model(x, y):
    """Cross-validate, then fit, a linear-regression pipeline on ``x`` and ``y``.

    The pipeline one-hot encodes the ``weekday`` column (dropping the first
    category and passing all other columns through unchanged), runs the
    result through a ``StandardScaler`` and fits a ``LinearRegression``.

    Before the final fit, the 5-fold cross-validated mean squared error and
    its standard deviation across folds are printed.

    Args:
        x: Feature table; it must expose a ``weekday`` column, which is
            selected by name.
        y: Regression target(s) passed to scikit-learn as-is.

    Returns:
        The ``Pipeline`` after it has been fitted on all of ``x`` and ``y``.
    """
    # NOTE(review): newer scikit-learn releases rename `sparse` to
    # `sparse_output` — confirm against the pinned version.
    weekday_encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False)
    preprocess = ColumnTransformer(
        [('one_hot', weekday_encoder, ['weekday'])],
        remainder='passthrough',
    )
    model = Pipeline([
        ('column_transform', preprocess),
        ("scaler", StandardScaler()),
        ("clf", LinearRegression(n_jobs=-1)),
    ])

    # The scorer yields negated MSE values, hence the sign flip on the mean.
    fold_scores = cross_val_score(model, x, y, cv=5, scoring='neg_mean_squared_error')
    cv_mse = -fold_scores.mean()
    cv_std = fold_scores.std()
    print("%0.2f mean squared error with a standard deviation of %0.2f" % (cv_mse, cv_std))

    model.fit(x, y)
    return model


def score(y_test, y_pred):
    """Print and return the validation metrics for a set of predictions.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values to compare against ``y_test``.

    Returns:
        A dict with a single ``"mse"`` entry holding the mean squared error.
    """
    mse = mean_squared_error(y_test, y_pred)
    metrics = {"mse": mse}
    print(metrics)
    return metrics

In [63]:
# Load the feature/target frames, then hold out 33% of the rows for testing
# (fixed random_state so the split is reproducible).
# NOTE(review): the path below is absolute and user-specific; it will not
# resolve on another machine.
x, y = get_dummy_data(
    '/Users/moberleitner/fhrepos/savebread/src/data/bakery_sales_dataset_preprocessed.csv'
)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
x

Unnamed: 0,daytime,weekday,holiday,h_type,weather,temp
0,2,3,0,0,2,22.0
1,1,4,0,0,1,23.8
2,1,5,0,0,2,25.1
3,2,5,0,0,2,25.1
4,1,6,0,0,1,26.0
...,...,...,...,...,...,...
412,1,3,1,4,1,16.5
413,2,3,1,4,1,16.5
414,1,4,1,1,1,20.5
415,2,4,1,1,1,20.5


In [26]:
# Display the target frame (every CSV column that is not a feature).
y

Unnamed: 0,angbutter,plain bread,jam,americano,croissant,caffe latte,tiramisu croissant,cacao deep,pain au chocolat,almond croissant,...,gateau chocolat,pandoro,cheese cake,lemon ade,orange pound,wiener,vanila latte,berry ade,tiramisu,merinque cookies
0,2.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
3,4.0,2.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,14.0,0.0,1.0,3.0,0.0,1.0,8.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,5.0,2.0,0.0,1.0,1.0,0.0,4.0,1.0,2.0,0.0,...,1.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0
413,2.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
414,14.0,3.0,0.0,4.0,4.0,1.0,4.0,1.0,3.0,1.0,...,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0
415,1.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# First Pipeline: scale the raw features and fit a plain linear regression
# on the training split, then report the MSE on the held-out split.
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LinearRegression()),
])
pipeline.fit(x_train, y_train)

y_pred = pipeline.predict(x_test)
score(y_test, y_pred)

{'mse': 2.430383577630602}


{'mse': 2.430383577630602}

In [38]:
# Add One Hot Encoding
pipeline = Pipeline([
    ('column_transform', ColumnTransformer([
        ('one_hot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False), ['weekday'])
    ], remainder='passthrough')),
    ("scaler", StandardScaler()),
    ("clf", LinearRegression(n_jobs=-1)),
]).fit(x_train, y_train)

y_pred = pipeline.predict(x_test)
score(y_test, y_pred)

{'mse': 2.405398021028507}


{'mse': 2.405398021028507}

In [52]:
# Cross-validate and fit the model on the full dataset loaded from disk.
# NOTE(review): the traceback recorded under this cell complains about
# 'mean_squared_error', while get_linear_regression_model passes
# 'neg_mean_squared_error' — the stored output looks stale; re-run the cell.
get_linear_regression_model(*get_dummy_data('/Users/moberleitner/fhrepos/savebread/src/data/bakery_sales_dataset_preprocessed.csv'))

ValueError: 'mean_squared_error' is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.

In [60]:
# Try cross validation
# Reuse get_linear_regression_model rather than repeating its body here: it
# builds the same one-hot -> scaler -> linear-regression pipeline, prints the
# 5-fold cross-validated MSE and returns the pipeline fitted on all of x, y.
clf = get_linear_regression_model(x, y)
clf

0.01 mean squared error with a standard deviation of 0.01




Pipeline(steps=[('column_transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('one_hot',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore',
                                                                sparse=False),
                                                  ['weekday'])])),
                ('scaler', StandardScaler()),
                ('clf', LinearRegression(n_jobs=-1))])

In [67]:
# Combine x and y

# `clf` is the pipeline fitted in the cross-validation cell; predictions are
# made on the same rows it was trained on.
y_pred = clf.predict(x)
# Give the prediction frame x's index: concat(axis=1) aligns rows by index
# label, so a default RangeIndex would misalign (and introduce NaNs) whenever
# x's index is not exactly 0..n-1.
df_y = pd.DataFrame(y_pred, columns=y.columns, index=x.index)
pd.concat([x, df_y], axis=1)

Unnamed: 0,daytime,weekday,holiday,h_type,weather,temp,angbutter,plain bread,jam,americano,...,gateau chocolat,pandoro,cheese cake,lemon ade,orange pound,wiener,vanila latte,berry ade,tiramisu,merinque cookies
0,2,3,0,0,2,22.0,3.970949,0.215654,0.268798,0.126693,...,0.573263,0.219915,-0.177107,0.014684,0.395598,0.798998,0.256673,0.078548,0.030275,0.116910
1,1,4,0,0,1,23.8,10.144080,2.831547,0.751645,1.182663,...,0.689915,1.385045,0.111248,0.174372,1.879356,1.767139,0.683841,0.216438,0.044127,0.114321
2,1,5,0,0,2,25.1,12.108533,3.423650,0.934629,1.743538,...,1.055234,1.837140,0.148007,0.210249,1.734961,2.193500,0.707888,0.228599,0.115665,0.183729
3,2,5,0,0,2,25.1,4.073557,0.470067,0.329365,0.440945,...,0.713963,0.630272,-0.058439,0.131111,0.389380,0.981145,0.199439,0.076870,0.092384,0.130944
4,1,6,0,0,1,26.0,13.557374,4.305127,0.912009,2.614127,...,0.964553,1.721422,0.177645,0.118948,2.302704,2.523483,1.105881,0.272053,0.079488,0.205099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,1,3,1,4,1,16.5,9.978449,2.595717,0.513597,1.894042,...,0.737779,1.440245,0.129662,0.002541,1.781241,1.837163,0.814138,0.100090,0.021570,0.094133
413,2,3,1,4,1,16.5,1.943473,-0.357866,-0.091666,0.591448,...,0.396508,0.233377,-0.076784,-0.076597,0.435660,0.624808,0.305689,-0.051639,-0.001711,0.041349
414,1,4,1,1,1,20.5,10.587659,3.203962,0.650906,1.958280,...,0.708002,1.495401,0.233822,0.203798,2.570147,1.578623,1.257787,0.166355,0.002556,0.346835
415,2,4,1,1,1,20.5,2.552682,0.250380,0.045642,0.655687,...,0.366731,0.288533,0.027376,0.124660,1.224566,0.366268,0.749337,0.014626,-0.020724,0.294050


array(['one_hot__weekday_1', 'one_hot__weekday_2', 'one_hot__weekday_3',
       'one_hot__weekday_4', 'one_hot__weekday_5', 'one_hot__weekday_6',
       'remainder__daytime', 'remainder__holiday', 'remainder__h_type',
       'remainder__weather', 'remainder__temp'], dtype=object)