# Data Characteristics
Load the processed data frames and explore them to see which characteristics our data has.

In [None]:
# Load the processed CSV ../data/processed/df_merged_extended_weather.csv
import pandas as pd

merged_weather_csv = "../data/processed/df_merged_extended_weather.csv"
df_merged_extended_weather = pd.read_csv(merged_weather_csv)

# Quick sanity check: (rows, columns) first, then the last rows of the frame.
display(df_merged_extended_weather.shape)
df_merged_extended_weather.tail()

In [None]:
import scipy.stats as stats
import seaborn as sns

# Columns used for the correlation check: the sales column plus the candidate
# explanatory columns. An earlier variant of this list also contained
# 'umsatz_rolling7', and a seaborn pairplot over these columns is left disabled.
reg_cols = [
    'Umsatz_umsatz',
    'KielerWoche_kiwo',
    'Bewoelkung_weather',
    'rain_sum',
    'sunshine_hours',
    'Temperatur_weather',
    'Windgeschwindigkeit_weather',
    'precipitation_hours',
    'day_of_week',
]

# Pairwise correlation matrix; as the last expression it is rendered as the cell output.
df_merged_extended_weather.loc[:, reg_cols].corr()

In [None]:
# Correlation of every selected column with the sales column, strongest positive first.
umsatz_corr = df_merged_extended_weather[reg_cols].corr()['Umsatz_umsatz']
correlations = umsatz_corr.sort_values(ascending=False)
print(correlations)

Check on which days of the week sales are highest.

In [None]:
# Distribution of sales for each value of day_of_week.
sns.boxplot(data=df_merged_extended_weather, x='day_of_week', y='Umsatz_umsatz')

# Mean sales per day_of_week, highest average first.
sales_per_weekday = df_merged_extended_weather.groupby('day_of_week')['Umsatz_umsatz']
avg_sales_by_day = sales_per_weekday.mean().sort_values(ascending=False)
print(avg_sales_by_day)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Derive the calendar month from the 'Datum' column (adjust the name if the
# date column is called differently).
df_merged_extended_weather['month'] = pd.to_datetime(df_merged_extended_weather['Datum']).dt.month

# Mean sales per month, highest average first.
monthly_sales = df_merged_extended_weather.groupby('month')['Umsatz_umsatz']
avg_sales_by_month = monthly_sales.mean().sort_values(ascending=False)
print(avg_sales_by_month)

# One boxplot-per-month panel for each (column, title) pair, laid out on a
# 2x2 grid that is filled row by row.
panels = [
    ('Umsatz_umsatz', 'Sales by Month'),
    ('sunshine_hours', 'Sunshine Duration by Month'),
    ('Bewoelkung_weather', 'Cloud Cover by Month'),
    ('Temperatur_weather', 'Temperature by Month'),
]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, (column, title) in zip(axes.flat, panels):
    sns.boxplot(ax=ax, x='month', y=column, data=df_merged_extended_weather)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Calculate total sales per Warengruppe_umsatz
import matplotlib.pyplot as plt

total_sales_by_group = (
    df_merged_extended_weather
    .groupby('Warengruppe_umsatz')['Umsatz_umsatz']
    .sum()
    .sort_index()
)
print(total_sales_by_group)

# English display names, applied positionally to the groups in ascending
# index order.
# NOTE(review): assumes the sorted group codes correspond to exactly these six
# product names in this order -- TODO confirm against the data dictionary.
labels = ['Bread', 'Rolls', 'Croissant', 'Confectionery', 'Cake', 'SeasonalBread']
if len(labels) != len(total_sales_by_group):
    # plt.xticks raises when ticks and labels differ in length; rather than
    # crash or mislabel bars, fall back to the raw group codes.
    print(
        f"Expected {len(labels)} product groups but found {len(total_sales_by_group)}; "
        "labelling bars with the raw group codes instead."
    )
    labels = [str(code) for code in total_sales_by_group.index]

# Visualize total sales per Warengruppe_umsatz
plt.figure(figsize=(10, 6))
sns.barplot(x=total_sales_by_group.index, y=total_sales_by_group.values)
plt.title('Total Sales per Warengruppe_umsatz')
plt.xlabel('Warengruppe_umsatz')
plt.ylabel('Total Umsatz_umsatz')
plt.xticks(ticks=range(len(total_sales_by_group)), labels=labels, rotation=45)
plt.show()

# Check with Holidays

In [None]:
# Load the processed CSV ../data/processed/df_extended_weather_holidays.csv.
# This rebinds df_merged_extended_weather, so every cell below works on the
# holiday-extended frame.
holidays_csv = "../data/processed/df_extended_weather_holidays.csv"
df_merged_extended_weather = pd.read_csv(holidays_csv)

# Quick sanity check: (rows, columns) first, then the last rows of the frame.
display(df_merged_extended_weather.shape)
df_merged_extended_weather.tail()

In [None]:
# Flag rows whose *next calendar day* is a public holiday.
#
# A plain row-wise shift(-1) only means "next day" when the frame has exactly
# one row per date, is sorted by date and has no gaps. With several rows per
# date it mostly copies the same day's flag. Looking the flag up by date + 1 day
# is correct in both layouts and gives the same result in the one-row-per-day case.
# NOTE(review): if the following calendar day has no rows in the data, the flag
# is NaN (unknown) for that day -- confirm this is acceptable.
row_dates = pd.to_datetime(df_merged_extended_weather['Datum']).dt.normalize()

# One holiday value per calendar date (max over that date's rows), as float so
# the mapped column stays numeric even when it contains NaN.
holiday_by_date = (
    df_merged_extended_weather['public_holiday']
    .astype(float)
    .groupby(row_dates)
    .max()
)
df_merged_extended_weather['next_day_holiday'] = (
    (row_dates + pd.Timedelta(days=1)).map(holiday_by_date)
)

# check correlation of umsatz with all other columns except Datum and day
correlations = (
    df_merged_extended_weather
    .drop(columns=['Datum', 'day'])
    .corr()['Umsatz_umsatz']
    .sort_values(ascending=False)
)
print(correlations)

In [None]:
# Remove every column whose correlation with Umsatz_umsatz is below zero.
# Columns with a NaN correlation compare False and are therefore kept.
# NOTE(review): filtering by sign also removes features with a strong negative
# relationship to sales; filtering by absolute correlation may be what is
# intended -- confirm.
cols_to_drop = [column for column, corr_value in correlations.items() if corr_value < 0]
df_final = df_merged_extended_weather.drop(columns=cols_to_drop)

print(f"Dropped columns: {cols_to_drop}")
display(df_final.shape)
df_final.sample(5)

In [None]:
# check if there was effect of public holidays and school holidays on sales
import matplotlib.pyplot as plt
import seaborn as sns

# (column name, label used in titles and messages) for each holiday flag.
holiday_flags = [
    ('public_holiday', 'Public'),
    ('school_holiday', 'School'),
    ('next_day_holiday', 'Next Day'),
]

# df_final has had every column with a negative sales correlation removed,
# which can include the very flags analysed here. Read them from the full
# frame instead: it has the same rows, so results are identical whenever the
# flag survived the column filter.
holiday_frame = df_merged_extended_weather


def plot_sales_by_flag(data, flag, label):
    """Show a boxplot of Umsatz_umsatz grouped by the values of column `flag`.

    Parameters
    ----------
    data : DataFrame containing 'Umsatz_umsatz' and `flag`.
    flag : name of the holiday indicator column.
    label : text inserted into the plot title and x-axis label.
    """
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=flag, y='Umsatz_umsatz', data=data)
    plt.title(f'Sales on {label} Holidays vs Non-{label} Holidays')
    plt.xlabel(f'Is {label} Holiday')
    plt.ylabel('Umsatz_umsatz')
    plt.show()


def welch_ttest_by_flag(data, flag, label):
    """Run Welch's t-test on sales where `flag` == 1 versus `flag` == 0.

    Rows with missing sales are excluded first, because ttest_ind otherwise
    propagates NaN into the statistic. Prints the result and returns
    (t_stat, p_value), or None when a group has fewer than two observations.
    """
    sales = data.loc[data['Umsatz_umsatz'].notna(), ['Umsatz_umsatz', flag]]
    flagged = sales.loc[sales[flag] == 1, 'Umsatz_umsatz']
    unflagged = sales.loc[sales[flag] == 0, 'Umsatz_umsatz']
    if len(flagged) < 2 or len(unflagged) < 2:
        print(
            f"T-test for {label} Holidays: skipped, need at least 2 observations "
            f"per group (got {len(flagged)} and {len(unflagged)})"
        )
        return None
    t_stat, p_value = stats.ttest_ind(flagged, unflagged, equal_var=False)
    print(f"T-test for {label} Holidays: t-statistic = {t_stat}, p-value = {p_value}")
    return t_stat, p_value


# Boxplots first, then the significance tests, in the same order for each flag.
for flag_column, flag_label in holiday_flags:
    plot_sales_by_flag(holiday_frame, flag_column, flag_label)

# check their significance using t-test
for flag_column, flag_label in holiday_flags:
    welch_ttest_by_flag(holiday_frame, flag_column, flag_label)


