# Procedure for splitting the Data in Train and Validation sets

**Merge all Data**

In [9]:
import pandas as pd

# URLs
base = "https://raw.githubusercontent.com/opencampus-sh/einfuehrung-in-data-science-und-ml/main/"
df_umsatz = pd.read_csv(base + "umsatzdaten_gekuerzt.csv")
df_wetter = pd.read_csv(base + "wetter.csv")
df_kiwo = pd.read_csv(base + "kiwo.csv")

# Datum vereinheitlichen
for df in [df_umsatz, df_wetter, df_kiwo]:
    df["Datum"] = pd.to_datetime(df["Datum"])

# Merge
df_all = df_umsatz.merge(df_wetter, on="Datum", how="outer") \
                  .merge(df_kiwo, on="Datum", how="outer")

df_all.info()
print(df_all.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10119 entries, 0 to 10118
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id                   9334 non-null   float64       
 1   Datum                10119 non-null  datetime64[ns]
 2   Warengruppe          9334 non-null   float64       
 3   Umsatz               9334 non-null   float64       
 4   Bewoelkung           10048 non-null  float64       
 5   Temperatur           10103 non-null  float64       
 6   Windgeschwindigkeit  10103 non-null  float64       
 7   Wettercode           7581 non-null   float64       
 8   KielerWoche          250 non-null    float64       
dtypes: datetime64[ns](1), float64(8)
memory usage: 711.6 KB
Index(['id', 'Datum', 'Warengruppe', 'Umsatz', 'Bewoelkung', 'Temperatur',
       'Windgeschwindigkeit', 'Wettercode', 'KielerWoche'],
      dtype='object')


# Split the Data set by dates

training set 01.07.2013 - 31.07.2017
val set 01.08.2017 - 31.07.2018
test set 01.08.2018 -31.07.2019

In [None]:
df_all = df_all.sort_values(by="Datum")

# look at the first rows
print(df_all.head())

# define data thresshold
train_end_date = "31.07.2017"
validation_end_date = "31.07.2018"


# convert threshold dates to datetime
df_all["Datum"] = pd.to_datetime(df_all["Datum"], format="%Y-%m-%d")
train_end = pd.to_datetime(train_end_date, format="%d.%m.%Y")
validation_end = pd.to_datetime(validation_end_date, format="%d.%m.%Y")

# split data
train = df_all[df_all["Datum"] <= train_end]
validation = df_all[(df_all["Datum"] > train_end) & (df_all["Datum"] <= validation_end)]
test = df_all[df_all["Datum"] > validation_end]

# check dimensions
print(f"Train shape: {train.shape}")
print(f"Validation shape: {validation.shape}")
print(f"Test shape: {test.shape}")



Unnamed: 0,id,Datum,Warengruppe,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche
0,,2012-01-01,,,8.0,9.8250,14.0,58.0,
1,,2012-01-02,,,7.0,7.4375,12.0,,
2,,2012-01-03,,,8.0,5.5375,18.0,63.0,
3,,2012-01-04,,,4.0,5.6875,19.0,80.0,
4,,2012-01-05,,,6.0,5.3000,23.0,80.0,
...,...,...,...,...,...,...,...,...,...
10114,,2019-07-28,,,3.0,23.3500,14.0,5.0,
10115,,2019-07-29,,,6.0,25.2500,7.0,61.0,
10116,,2019-07-30,,,7.0,20.7375,8.0,61.0,
10117,,2019-07-31,,,6.0,20.4500,7.0,61.0,


Train shape: (7917, 9)
Validation shape: (1849, 9)
Test shape: (353, 9)


## Define a simple linear model equation and conduct a linear regression using the training data


In [18]:
import statsmodels.formula.api as smf

mod = smf.ols(formula="Umsatz ~ Warengruppe", data=train)
mod = mod.fit()

# check the summary
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     27.08
Date:                Thu, 22 May 2025   Prob (F-statistic):           2.01e-07
Time:                        15:29:46   Log-Likelihood:                -48051.
No. Observations:                7493   AIC:                         9.611e+04
Df Residuals:                    7491   BIC:                         9.612e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     227.7143      3.921     58.076      

## Use Model or prediction

In [None]:

# predict
pred = mod.predict(test)
print(pred[0])