# Daten einlesen und Pipeline definieren

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score



def r2(actual: np.ndarray, predicted: np.ndarray):
    """Return the coefficient of determination (R²) of `predicted` against `actual`.

    Both arguments are forwarded unchanged to scikit-learn's `r2_score`.
    """
    score = r2_score(actual, predicted)
    return score
def adjr2(actual: np.ndarray, predicted: np.ndarray, rowcount: int, featurecount: int):
    """Return the adjusted R² score.

    Applies the standard correction ``1 - (1 - R²) * (n - 1) / (n - p - 1)``,
    where ``n`` is the number of evaluated rows and ``p`` the number of
    predictors (not counting an intercept).

    Parameters:
    actual (array-like): Observed target values.
    predicted (array-like): Model predictions for the same rows.
    rowcount (int): Number of rows the score is computed on (``n``).
    featurecount (int): Number of predictors used by the model (``p``).

    Returns:
    float: Adjusted R².

    Raises:
    ValueError: If ``rowcount - featurecount - 1`` is not positive, in which
        case the correction is undefined.
    """
    degrees_of_freedom = rowcount - featurecount - 1
    if degrees_of_freedom <= 0:
        raise ValueError(
            "adjusted R² needs rowcount > featurecount + 1 "
            f"(got rowcount={rowcount}, featurecount={featurecount})"
        )
    return 1 - (1 - r2(actual, predicted)) * (rowcount - 1) / degrees_of_freedom

# 1. Read the four CSV files and left-join them onto the first one via 'Datum'
df1 = pd.read_csv('../umsatzdaten_gekuerzt.csv')
df2 = pd.read_csv('../wetter.csv')
df3 = pd.read_csv('../kiwo.csv')
df4 = pd.read_csv('../Feier_Bruecke_Ferien_bis2018.csv')

df = (
    df1
    .merge(df2, on='Datum', how='left')
    .merge(df3, on='Datum', how='left')
    .merge(df4, on='Datum', how='left')
)

# Missing 'KielerWoche' values (e.g. rows without a partner in the left join)
# are treated as False before the column is cast to bool
df['KielerWoche'] = df['KielerWoche'].fillna(False).astype('bool')

# Make sure 'Datum' is a real datetime column
df['Datum'] = pd.to_datetime(df['Datum'])

# 2. Pipeline-Komponenten definieren

# a) Feature Engineering: Erstellen von Datum- und zyklischen Features
def add_features(df):
    """Return a copy of ``df`` extended with calendar and cyclical date features.

    The input frame is not modified, so the function is safe to use both
    directly and inside a ``FunctionTransformer``.

    Parameters:
    df (pandas.DataFrame): Frame with a datetime-typed 'Datum' column (the
        ``.dt`` accessor is used on it).

    Returns:
    pandas.DataFrame: New frame with the additional columns 'Jahr', 'Monat',
        'Wochentag' (0 = Monday), 'Kalenderwoche' (ISO week), 'Tag_im_Jahr',
        'Ist_Wochenende' (1 for weekday 5 or 6, else 0) and sine/cosine
        encodings of 'Tag_im_Jahr' (period 365), 'Monat' (period 12) and
        'Wochentag' (period 7).
    """
    # Work on a copy so the caller's frame is never mutated as a side effect
    df = df.copy()
    datum = df['Datum']

    # Basic calendar attributes
    df['Jahr'] = datum.dt.year
    df['Monat'] = datum.dt.month
    df['Wochentag'] = datum.dt.weekday
    df['Kalenderwoche'] = datum.dt.isocalendar().week
    df['Tag_im_Jahr'] = datum.dt.dayofyear
    df['Ist_Wochenende'] = df['Wochentag'].isin([5, 6]).astype(int)

    # Cyclical encodings: map each periodic value onto the unit circle
    for column, period in (('Tag_im_Jahr', 365), ('Monat', 12), ('Wochentag', 7)):
        angle = 2 * np.pi * df[column] / period
        df[f'{column}_sin'] = np.sin(angle)
        df[f'{column}_cos'] = np.cos(angle)

    return df

# Apply the feature engineering once to the merged frame
df = add_features(df)

# b) The same function wrapped as a scikit-learn transformer
feature_engineering = FunctionTransformer(func=add_features, validate=False)

# c) Feature groups
numeric_features = (
    # columns from the merged source tables
    ['Temperatur', 'Bewoelkung', 'Windgeschwindigkeit']
    # cyclical encodings created by add_features
    + ['Tag_im_Jahr_sin', 'Tag_im_Jahr_cos',
       'Monat_sin', 'Monat_cos',
       'Wochentag_sin', 'Wochentag_cos']
    # event flags
    + ['feiertag', 'KielerWoche', 'brueckentag']
    # remaining columns of the merged tables (two-letter codes)
    + ['BW', 'BY', 'B', 'BB', 'HB', 'HH', 'HE', 'MV',
       'NI', 'NW', 'RP', 'SL', 'SN', 'ST', 'SH', 'TH']
)

categorical_features = ['Warengruppe', 'Wettercode', 'Wochentag']

# d) Column transformer

def normalize_wettercode(codes):
    """Map raw weather codes onto the string labels used as one-hot categories.

    Integer-valued codes in any representation (61, 61.0, '61', '61.0') become
    the plain integer string ('61'); missing values and their string forms
    ('nan', '<NA>', 'None', '') become 'Unbekannt'; any other text is passed
    through unchanged.

    Parameters:
    codes (DataFrame or array-like): Column(s) holding the raw codes.

    Returns:
    pandas.DataFrame: Same shape, object dtype, normalized labels.
    """
    def _label(value):
        if pd.isna(value):
            return 'Unbekannt'
        try:
            return str(int(float(value)))
        except (TypeError, ValueError, OverflowError):
            text = str(value)
            if text.strip().lower() in ('', 'nan', '<na>', 'none'):
                return 'Unbekannt'
            return text

    frame = pd.DataFrame(codes)
    return frame.apply(lambda column: column.astype(object).map(_label))


# Keep 'Wettercode' as strings in the frame itself (unchanged behavior).
# NOTE(review): if the column was read as float (presumably the case when it
# contains gaps), this cast yields labels such as '61.0' and 'nan', which do
# not appear in `wettercode_categories`; with handle_unknown='ignore' the
# encoder would then emit only zeros. The normalization step inside the
# transformer below repairs that for every frame passed to the preprocessor.
df['Wettercode'] = df['Wettercode'].astype(str)

# Fixed category list for 'Wettercode': '0'..'99' plus a label for missing codes
wettercode_categories = [str(i) for i in range(0, 100)] + ['Unbekannt']

preprocessor = ColumnTransformer(
    transformers=[
        # numeric columns: mean imputation followed by standardization
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_features),

        # weather code: normalize labels, then one-hot over the fixed list
        ('wettercode', Pipeline(steps=[
            ('normalize', FunctionTransformer(normalize_wettercode, validate=False)),
            ('onehot', OneHotEncoder(categories=[wettercode_categories], handle_unknown='ignore'))
        ]), ['Wettercode']),

        ('warengruppe', OneHotEncoder(handle_unknown='ignore'), ['Warengruppe']),

        ('wochentag', OneHotEncoder(handle_unknown='ignore'), ['Wochentag'])
    ],
    # every column not listed above is dropped
    remainder='drop'
)

# Date thresholds for the chronological split
train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'

# Rows up to and including the threshold train the model, later rows test it
train_data = df.loc[df['Datum'] <= train_end_date]
test_data = df.loc[df['Datum'] > train_end_date]

# The full frame loses 'Datum' and 'id'; train_data/test_data were taken
# before this drop and therefore still contain both columns
df = df.drop(columns=['Datum', 'id'])

X = df.drop(columns=['Umsatz'])
y = df['Umsatz']

# 'Datum' stays in X_train / X_test for the pipeline
X_train = train_data.drop(columns=['Umsatz'])
y_train = train_data['Umsatz']

X_test = test_data.drop(columns=['Umsatz'])
y_test = test_data['Umsatz']


In [None]:
# Number of columns in the merged frame
len(df.columns)

In [None]:
# Show the first rows; without the call parentheses only the bound-method
# repr would be displayed
df.head()

# Neuronales Netz nutzen

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam


# Fit the preprocessor on the training rows only, then apply it to both splits
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Width of the transformed feature matrix = size of the network input
input_shape = X_train_preprocessed.shape[1]

# Feed-forward regressor: 72 -> 36 -> 18 -> 1
model = Sequential()
model.add(InputLayer(shape=(input_shape, )))
model.add(BatchNormalization())
model.add(Dense(72, activation='relu'))
model.add(Dropout(0.2))  # regularization
model.add(Dense(36, activation='relu'))
model.add(Dropout(0.2))  # regularization
model.add(Dense(18, activation='relu'))
model.add(Dense(1, activation='linear'))

model.summary()

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae'],
)

# Training
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are not an independent estimate.
history = model.fit(
    X_train_preprocessed,
    y_train,
    validation_data=(X_test_preprocessed, y_test),
    epochs=50,
    batch_size=32,
    verbose=1
)

y_pred = model.predict(X_test_preprocessed)
print("R² on test:", r2_score(y_test, y_pred))
# Adjusted R² must use the size of the evaluated sample and the number of
# features the model actually receives, not the full dataset / raw columns
print("adjusted R² on test:", adjr2(y_test, y_pred, len(y_test), input_shape))

# Performance evaluieren

In [None]:
import matplotlib.pyplot as plt

# Flatten the prediction array to one dimension
y_pred_flat = y_pred.flatten()

# Actual vs. predicted; the dashed red diagonal marks a perfect prediction
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_flat, alpha=0.5)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
ax.set_xlabel("Tatsächliche Werte")
ax.set_ylabel("Vorhergesagte Werte")
ax.set_title("Tatsächliche Werte vs. Vorhergesagte Werte")
plt.show()

residuals = y_test - y_pred_flat

# Residuals against predictions; the dashed red line marks zero error
fig, ax = plt.subplots()
ax.scatter(y_pred_flat, residuals, alpha=0.5, color='g')
ax.axhline(0, color='r', linestyle='--')
ax.set_xlabel("Vorhergesagte Werte")
ax.set_ylabel("Residuen")
ax.set_title("Residualplot")
plt.show()

import seaborn as sns

# Distribution of the residuals with a KDE overlay and a marker at zero
fig, ax = plt.subplots()
sns.histplot(residuals, kde=True, color='y', ax=ax)
ax.axvline(0, color='r', linestyle='--')
ax.set_xlabel("Residuen")
ax.set_title("Verteilung der Residuen")
plt.show()

# Actual and predicted values plotted in row order
fig, ax = plt.subplots()
ax.plot(y_test.values, label="Tatsächliche Werte")
ax.plot(y_pred_flat, label="Vorhergesagte Werte")
ax.legend()
ax.set_xlabel("Index")
ax.set_ylabel("Umsatz")
ax.set_title("Vergleich der tatsächlichen und vorhergesagten Werte")
plt.show()

# Modell auf Submission-Datensatz laufen lassen

In [None]:
## Submission vorbereiten


df_sub = pd.read_csv('../sample_submission.csv')

# Split the id into its parts.
# NOTE(review): the layout YYMMDD + one product-group digit is assumed here,
# matching the parsing in the later "Submission v2" cell — confirm against
# sample_submission.csv. Years are assumed to be 20xx.
df_sub['id'] = df_sub['id'].astype('string')
df_sub['Tag'] = df_sub['id'].str[4:6]
df_sub['Monat'] = df_sub['id'].str[2:4]
df_sub['Jahr'] = '20' + df_sub['id'].str[0:2]

# Seventh character of the id is the product group
df_sub['Warengruppe'] = df_sub['id'].str[6]
df_sub['Warengruppe'] = df_sub['Warengruppe'].astype(int)

# Build a date from day, month and year (pd.to_datetime expects these names)
df_sub.rename(columns={'Jahr':'year', 'Monat':'month', 'Tag':'day'}, inplace=True)
df_sub['Datum'] = pd.to_datetime(df_sub[['year','month','day']])

# The join key must be datetime on both sides
df2['Datum'] = pd.to_datetime(df2['Datum'])
df3['Datum'] = pd.to_datetime(df3['Datum'])
df4['Datum'] = pd.to_datetime(df4['Datum'])
df_sub = df_sub.merge(df4, on='Datum', how='left')
df_sub = df_sub.merge(df3, on='Datum', how='left')
df_sub = df_sub.merge(df2, on='Datum', how='left')

# Derive the calendar and cyclical features with the same function that is
# used for the training data, instead of repeating the formulas here
df_sub = add_features(df_sub)

# Check the data types before correcting them
print('### Initial datatypes')
print(df_sub.dtypes)

# Set the intended types
df_sub['Wettercode'] = df_sub['Wettercode'].astype('category')
df_sub['Warengruppe'] = df_sub['Warengruppe'].astype('category')

# Replace NaN with False for 'KielerWoche'
df_sub['KielerWoche'] = df_sub['KielerWoche'].fillna(False)
df_sub['KielerWoche'] = df_sub['KielerWoche'].astype('bool')
print('### Corrected datatypes')
print(df_sub.dtypes)

# Mean imputation of the weather measurements. The result is assigned back
# explicitly: fillna(..., inplace=True) on a column selection is chained
# assignment and does not reliably modify the frame.
for column in ['Temperatur', 'Bewoelkung', 'Windgeschwindigkeit']:
    df_sub[column] = df_sub[column].fillna(df_sub[column].mean())

# Missing weather codes get their own category
df_sub['Wettercode'] = df_sub['Wettercode'].cat.add_categories(['Unbekannt'])
df_sub['Wettercode'] = df_sub['Wettercode'].fillna('Unbekannt')


# Sicherstellen, dass 'Datum' als datetime konvertiert ist
#df['Datum'] = pd.to_datetime(df['Datum'])



# 5. Predict
# The network was trained on the output of `preprocessor`, so the submission
# frame has to go through the same transformation before prediction
# (columns the preprocessor does not use are dropped by it).
df_sub_preprocessed = preprocessor.transform(df_sub)
y_pred_sub = model.predict(df_sub_preprocessed)

# One prediction per row
df_sub['Umsatz'] = y_pred_sub.flatten()

# Write the final submission file
df_sub[['id','Umsatz']].to_csv('../nr_submission.csv', index=False)

# MAPE

In [None]:


def mean_absolute_percentage_error(y_true, y_pred):
    """
    Calculate the Mean Absolute Percentage Error (MAPE)

    Both inputs are flattened first, so a column vector of predictions
    (shape (n, 1)) can be compared with a 1-D target without being broadcast
    into an n x n matrix.

    Parameters:
    y_true (array-like): Actual values
    y_pred (array-like): Predicted values

    Returns:
    float: MAPE value in percent. Rows whose actual value is 0 make the
        result inf or nan, because each error is divided by the actual value.

    Raises:
    ValueError: If the inputs do not hold the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements "
            f"(got {y_true.size} and {y_pred.size})"
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Flatten the predictions so they line up element-wise with y_test
y_pred = model.predict(X_test_preprocessed).flatten()

mape = mean_absolute_percentage_error(y_test, y_pred)
print("MAPE: {}%".format(mape))

# Submission v2

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


def r2(actual: np.ndarray, predicted: np.ndarray):
    """Return the coefficient of determination (R²) of `predicted` against `actual`.

    Both arguments are forwarded unchanged to scikit-learn's `r2_score`.
    """
    score = r2_score(actual, predicted)
    return score

def adjr2(actual: np.ndarray, predicted: np.ndarray, rowcount: int, featurecount: int):
    """Return the adjusted R² score.

    Applies the standard correction ``1 - (1 - R²) * (n - 1) / (n - p - 1)``,
    where ``n`` is the number of evaluated rows and ``p`` the number of
    predictors (not counting an intercept).

    Parameters:
    actual (array-like): Observed target values.
    predicted (array-like): Model predictions for the same rows.
    rowcount (int): Number of rows the score is computed on (``n``).
    featurecount (int): Number of predictors used by the model (``p``).

    Returns:
    float: Adjusted R².

    Raises:
    ValueError: If ``rowcount - featurecount - 1`` is not positive, in which
        case the correction is undefined.
    """
    degrees_of_freedom = rowcount - featurecount - 1
    if degrees_of_freedom <= 0:
        raise ValueError(
            "adjusted R² needs rowcount > featurecount + 1 "
            f"(got rowcount={rowcount}, featurecount={featurecount})"
        )
    return 1 - (1 - r2(actual, predicted)) * (rowcount - 1) / degrees_of_freedom

def mean_absolute_percentage_error(y_true, y_pred):
    """
    Calculate the Mean Absolute Percentage Error (MAPE)

    Both inputs are flattened first, so a column vector of predictions
    (shape (n, 1)) can be compared with a 1-D target without being broadcast
    into an n x n matrix.

    Parameters:
    y_true (array-like): Actual values
    y_pred (array-like): Predicted values

    Returns:
    float: MAPE value in percent. Rows whose actual value is 0 make the
        result inf or nan, because each error is divided by the actual value.

    Raises:
    ValueError: If the inputs do not hold the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements "
            f"(got {y_true.size} and {y_pred.size})"
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# 1. Read the four CSV files and left-join them onto the first one via 'Datum'
df1 = pd.read_csv('../umsatzdaten_gekuerzt.csv')
df2 = pd.read_csv('../wetter.csv')
df3 = pd.read_csv('../kiwo.csv')
df4 = pd.read_csv('../Feier_Bruecke_Ferien_bis2018.csv')

df = (
    df1
    .merge(df2, on='Datum', how='left')
    .merge(df3, on='Datum', how='left')
    .merge(df4, on='Datum', how='left')
)

# Missing 'KielerWoche' values (e.g. rows without a partner in the left join)
# are treated as False before the column is cast to bool
df['KielerWoche'] = df['KielerWoche'].fillna(False).astype('bool')

# Make sure 'Datum' is a real datetime column
df['Datum'] = pd.to_datetime(df['Datum'])

# a) Feature Engineering: Erstellen von Datum- und zyklischen Features
def add_features(df):
    """Return a copy of ``df`` extended with calendar and cyclical date features.

    The input frame is not modified, so the function is safe to use both
    directly and inside a ``FunctionTransformer``.

    Parameters:
    df (pandas.DataFrame): Frame with a datetime-typed 'Datum' column (the
        ``.dt`` accessor is used on it).

    Returns:
    pandas.DataFrame: New frame with the additional columns 'Jahr', 'Monat',
        'Wochentag' (0 = Monday), 'Kalenderwoche' (ISO week), 'Tag_im_Jahr',
        'Ist_Wochenende' (1 for weekday 5 or 6, else 0) and sine/cosine
        encodings of 'Tag_im_Jahr' (period 365), 'Monat' (period 12) and
        'Wochentag' (period 7).
    """
    # Work on a copy so the caller's frame is never mutated as a side effect
    df = df.copy()
    datum = df['Datum']

    # Basic calendar attributes
    df['Jahr'] = datum.dt.year
    df['Monat'] = datum.dt.month
    df['Wochentag'] = datum.dt.weekday
    df['Kalenderwoche'] = datum.dt.isocalendar().week
    df['Tag_im_Jahr'] = datum.dt.dayofyear
    df['Ist_Wochenende'] = df['Wochentag'].isin([5, 6]).astype(int)

    # Cyclical encodings: map each periodic value onto the unit circle
    for column, period in (('Tag_im_Jahr', 365), ('Monat', 12), ('Wochentag', 7)):
        angle = 2 * np.pi * df[column] / period
        df[f'{column}_sin'] = np.sin(angle)
        df[f'{column}_cos'] = np.cos(angle)

    return df

# Apply the feature engineering once to the merged frame
df = add_features(df)

# b) The same function wrapped as a scikit-learn transformer
feature_engineering = FunctionTransformer(func=add_features, validate=False)

# c) Feature groups
numeric_features = (
    # columns from the merged source tables
    ['Temperatur', 'Bewoelkung', 'Windgeschwindigkeit']
    # cyclical encodings created by add_features
    + ['Tag_im_Jahr_sin', 'Tag_im_Jahr_cos',
       'Monat_sin', 'Monat_cos',
       'Wochentag_sin', 'Wochentag_cos']
    # event flags
    + ['feiertag', 'KielerWoche', 'brueckentag']
    # remaining columns of the merged tables (two-letter codes)
    + ['BW', 'BY', 'B', 'BB', 'HB', 'HH', 'HE', 'MV',
       'NI', 'NW', 'RP', 'SL', 'SN', 'ST', 'SH', 'TH']
)

categorical_features = ['Warengruppe', 'Wettercode', 'Wochentag']

# d) Column transformer

def normalize_wettercode(codes):
    """Map raw weather codes onto the string labels used as one-hot categories.

    Integer-valued codes in any representation (61, 61.0, '61', '61.0') become
    the plain integer string ('61'); missing values and their string forms
    ('nan', '<NA>', 'None', '') become 'Unbekannt'; any other text is passed
    through unchanged.

    Parameters:
    codes (DataFrame or array-like): Column(s) holding the raw codes.

    Returns:
    pandas.DataFrame: Same shape, object dtype, normalized labels.
    """
    def _label(value):
        if pd.isna(value):
            return 'Unbekannt'
        try:
            return str(int(float(value)))
        except (TypeError, ValueError, OverflowError):
            text = str(value)
            if text.strip().lower() in ('', 'nan', '<na>', 'none'):
                return 'Unbekannt'
            return text

    frame = pd.DataFrame(codes)
    return frame.apply(lambda column: column.astype(object).map(_label))


# Keep 'Wettercode' as strings in the frame itself (unchanged behavior).
# NOTE(review): if the column was read as float (presumably the case when it
# contains gaps), this cast yields labels such as '61.0' and 'nan', which do
# not appear in `wettercode_categories`; with handle_unknown='ignore' the
# encoder would then emit only zeros. The normalization step inside the
# transformer below repairs that for every frame passed to the preprocessor,
# including the categorical column of the submission frame.
df['Wettercode'] = df['Wettercode'].astype(str)

# Fixed category list for 'Wettercode': '0'..'99' plus a label for missing codes
wettercode_categories = [str(i) for i in range(0, 100)] + ['Unbekannt']

preprocessor = ColumnTransformer(
    transformers=[
        # numeric columns: mean imputation followed by standardization
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_features),

        # weather code: normalize labels, then one-hot over the fixed list
        ('wettercode', Pipeline(steps=[
            ('normalize', FunctionTransformer(normalize_wettercode, validate=False)),
            ('onehot', OneHotEncoder(categories=[wettercode_categories], handle_unknown='ignore'))
        ]), ['Wettercode']),

        ('warengruppe', OneHotEncoder(handle_unknown='ignore'), ['Warengruppe']),

        ('wochentag', OneHotEncoder(handle_unknown='ignore'), ['Wochentag'])
    ],
    # every column not listed above is dropped
    remainder='drop'
)

# Date thresholds for the chronological split
train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'

# Rows up to and including the threshold train the model, later rows test it
train_data = df.loc[df['Datum'] <= train_end_date]
test_data = df.loc[df['Datum'] > train_end_date]

# The full frame loses 'Datum' and 'id'; train_data/test_data were taken
# before this drop and therefore still contain both columns
df = df.drop(columns=['Datum', 'id'])

X = df.drop(columns=['Umsatz'])
y = df['Umsatz']

# 'Datum' stays in X_train / X_test for the pipeline
X_train = train_data.drop(columns=['Umsatz'])
y_train = train_data['Umsatz']
# Cap the training target at its 99.5 % quantile; the test target is not capped
upper_limit = y_train.quantile(0.995)
y_train = np.clip(y_train, a_min=None, a_max=upper_limit)

X_test = test_data.drop(columns=['Umsatz'])
y_test = test_data['Umsatz']

# Fit the preprocessor on the training rows only, then apply it to both splits
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Width of the transformed feature matrix = size of the network input
input_shape = X_train_preprocessed.shape[1]

# Feed-forward regressor: 128 -> 64 -> 32 -> 1, with batch normalization and
# dropout after each of the first two hidden layers
model = Sequential()
model.add(InputLayer(shape=(input_shape, )))
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='huber',
    metrics=['mae'],
)

# Callbacks: stop when val_loss stalls for 10 epochs (restoring the best
# weights) and halve the learning rate after 5 stalled epochs
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
]

# Training
# NOTE(review): the test split doubles as validation data and drives early
# stopping, so the metrics printed below are not an independent estimate.
history = model.fit(
    X_train_preprocessed,
    y_train,
    validation_data=(X_test_preprocessed, y_test),
    epochs=100,
    batch_size=32,
    verbose=1,
    callbacks=callbacks
)

# Flatten the predictions so element-wise metrics compare row with row
# instead of broadcasting (n,) against (n, 1)
y_pred = model.predict(X_test_preprocessed).flatten()
print("R² on test:", r2_score(y_test, y_pred))
# Adjusted R² must use the size of the evaluated sample and the number of
# features the model actually receives, not the full dataset / raw columns
print("adjusted R² on test:", adjr2(y_test, y_pred, len(y_test), input_shape))
print(f"MAPE: {mean_absolute_percentage_error(y_test, y_pred)}%")

# Prepare the submission frame
df_sub = pd.read_csv('../sample_submission.csv')

# Split the id by character position: [0:2] year (prefixed with '20'),
# [2:4] month, [4:6] day, [6] product group
df_sub['id'] = df_sub['id'].astype('string')
id_text = df_sub['id'].str
df_sub['Tag'] = id_text[4:6]
df_sub['Monat'] = id_text[2:4]
df_sub['Jahr'] = '20' + id_text[0:2]

df_sub['Warengruppe'] = id_text[6].astype(int)

# Build a date from day, month and year (pd.to_datetime expects these names)
df_sub.rename(columns={'Jahr':'year', 'Monat':'month', 'Tag':'day'}, inplace=True)
df_sub['Datum'] = pd.to_datetime(df_sub[['year','month','day']])

# The join key must be datetime on both sides
df2['Datum'] = pd.to_datetime(df2['Datum'])
df3['Datum'] = pd.to_datetime(df3['Datum'])
df4['Datum'] = pd.to_datetime(df4['Datum'])
df_sub = (
    df_sub
    .merge(df4, on='Datum', how='left')
    .merge(df3, on='Datum', how='left')
    .merge(df2, on='Datum', how='left')
)

# Derive the calendar and cyclical features with the same function that is
# used for the training data, instead of repeating the formulas here
df_sub = add_features(df_sub)

# head must be called; printing the bare attribute shows the method repr
print(df_sub.head())
# Check the data types before correcting them
print('### Initial datatypes')
print(df_sub.dtypes)

# Set the intended types
df_sub['Wettercode'] = df_sub['Wettercode'].astype('category')
df_sub['Warengruppe'] = df_sub['Warengruppe'].astype('category')

# Replace NaN with False for 'KielerWoche'
df_sub['KielerWoche'] = df_sub['KielerWoche'].fillna(False)
df_sub['KielerWoche'] = df_sub['KielerWoche'].astype('bool')
print('### Corrected datatypes')
print(df_sub.dtypes)

# Mean imputation of the weather measurements. The result is assigned back
# explicitly: fillna(..., inplace=True) on a column selection is chained
# assignment and does not reliably modify the frame.
for column in ['Temperatur', 'Bewoelkung', 'Windgeschwindigkeit']:
    df_sub[column] = df_sub[column].fillna(df_sub[column].mean())

# Missing weather codes get their own category
df_sub['Wettercode'] = df_sub['Wettercode'].cat.add_categories(['Unbekannt'])
df_sub['Wettercode'] = df_sub['Wettercode'].fillna('Unbekannt')

# Sicherstellen, dass 'Datum' als datetime konvertiert ist
#df['Datum'] = pd.to_datetime(df['Datum'])

# Run the submission rows through the fitted preprocessor
submission_features = df_sub.drop(columns=['id', 'Datum'])
df_sub_preprocessed = preprocessor.transform(submission_features)

# Predict and attach the result as the 'Umsatz' column
y_pred_sub = model.predict(df_sub_preprocessed)
df_sub['Umsatz'] = y_pred_sub

# Write the final submission file (id and prediction only)
df_sub.loc[:, ['id','Umsatz']].to_csv('../nr_submission.csv', index=False)

In [None]:
# Correlation of every numeric training column with the target.
# X_train still carries non-numeric columns ('Datum' is datetime, 'Wettercode'
# was cast to str), so the computation is restricted to numeric columns.
correlation_matrix = pd.concat([X_train, y_train], axis=1).corr(numeric_only=True)
print(correlation_matrix['Umsatz'].sort_values(ascending=False))

In [None]:
import optuna
import tensorflow as tf

# Make sure the preprocessed feature matrices have float dtype
X_train_preprocessed = X_train_preprocessed.astype(float)
X_test_preprocessed = X_test_preprocessed.astype(float)

def objective(trial):
    """Optuna objective: train a fresh copy of `model` with a sampled batch size.

    Each trial clones the architecture of the global `model` (new, untrained
    weights) so that trials do not continue training one shared network and
    therefore stay comparable.

    Parameters:
    trial (optuna.trial.Trial): Supplies the 'batch_size' suggestion (16..128).

    Returns:
    float: Lowest validation loss reached within the 10 training epochs.
    """
    batch_size = trial.suggest_int('batch_size', 16, 128)

    trial_model = tf.keras.models.clone_model(model)
    # Same compile settings as the model built in the "Submission v2" cell
    trial_model.compile(optimizer=Adam(learning_rate=0.001), loss='huber', metrics=['mae'])

    history = trial_model.fit(X_train_preprocessed, y_train,
                              validation_data=(X_test_preprocessed, y_test),
                              epochs=10, batch_size=batch_size, verbose=0)
    return min(history.history['val_loss'])

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("Beste Parameter:", study.best_params)
# The objective returns the validation loss, not the MAE
print("Bestes Ergebnis (val_loss):", study.best_value)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Remove the datetime column (if it exists) before fitting the forest
X_train_rf = X_train.drop(columns=['Datum'], errors='ignore')
X_test_rf = X_test.drop(columns=['Datum'], errors='ignore')

# Fixed seed so the importances are reproducible
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_rf, y_train)

# Importance per input column, largest first
feature_importances = pd.Series(rf.feature_importances_, index=X_train_rf.columns)
print(feature_importances.sort_values(ascending=False))

In [None]:
# Optimiertes Modell


selected_features = [
    'Warengruppe', 'Temperatur', 'Tag_im_Jahr', 'Tag_im_Jahr_cos',
    'id', 'Wochentag', 'Wochentag_sin', 'Ist_Wochenende', 'MV', 'BB'
]

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# The network does no imputation or scaling of its own: a single NaN in the
# inputs turns the loss into NaN, and columns on very different scales slow
# down training. Impute and standardize with statistics from the training
# rows only.
# NOTE(review): presumably some of these columns contain gaps after the left
# joins (e.g. 'Temperatur') — verify.
selected_scaler = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
X_train_selected_scaled = selected_scaler.fit_transform(X_train_selected)
X_test_selected_scaled = selected_scaler.transform(X_test_selected)

# Explicit InputLayer, consistent with the other models in this notebook
model = Sequential([
    InputLayer(shape=(X_train_selected_scaled.shape[1], )),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')
])

model.compile(optimizer='adam', loss='mae', metrics=['mae'])
history = model.fit(
    X_train_selected_scaled, y_train,
    validation_data=(X_test_selected_scaled, y_test),
    epochs=50, batch_size=51, verbose=1
)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Random forest on the same selected columns, scored by MAE on the test split
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_selected, y_train)

y_pred_rf = rf.predict(X_test_selected)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print("Random Forest MAE: {}".format(mae_rf))

In [None]:
import matplotlib.pyplot as plt
# Box plot of the training target
fig, ax = plt.subplots()
ax.boxplot(y_train)
ax.set_title('Umsatz Boxplot')
plt.show()

In [None]:
# Display the submission frame as the cell output
df_sub