In [1]:
import pandas as pd
# Column labels for the tab-separated sales export. Because `names` is passed
# without a `header` argument, pandas reads the first line of the file as data.
names = [
    'date',
    'time',
    'product_name',
    'product_category',
    'amount',
    'line_total',
    'payment_method',
    'store_id',
    'store_name',
    'order_id',
]
df = pd.read_csv('data/sales_data_1.csv', sep='\t', names=names)

# Preview the first ten raw rows.
df.head(10)


Unnamed: 0,date,time,product_name,product_category,amount,line_total,payment_method,store_id,store_name,order_id
0,02.01.2025,12:58,Hjemmelavede forårsruller (#12),Snacks,1.0,7900,Mealo,0,Arhaan Thai,15064
1,02.01.2025,12:58,Panang Karry (#33),Karry,1.0,12500,Mealo,0,Arhaan Thai,15064
2,02.01.2025,12:58,Kylling,/MSG Mad,1.0,0,Mealo,0,Arhaan Thai,15064
3,02.01.2025,12:58,Pad Thai (#41),Nudler,1.0,11900,Mealo,0,Arhaan Thai,15064
4,02.01.2025,12:58,Kylling,/MSG Mad,1.0,0,Mealo,0,Arhaan Thai,15064
5,02.01.2025,12:58,Takeaway - Pose,Ekstra,1.0,400,Mealo,0,Arhaan Thai,15064
6,02.01.2025,15:06,Wok med cashewnødder (#54),Wok,1.0,13500,Wolt,0,Arhaan Thai,15065
7,02.01.2025,15:06,Kylling,/MSG Mad,1.0,0,Wolt,0,Arhaan Thai,15065
8,02.01.2025,15:06,Ekstra oksekød,/MSG Mad,1.0,2000,Wolt,0,Arhaan Thai,15065
9,02.01.2025,15:06,Hjemmelavede forårsruller (#12),Snacks,1.0,7900,Wolt,0,Arhaan Thai,15065


In [2]:
# Collapse the detailed menu categories into the coarse groups used below.
category_groups = {
    'Karry': 'Maindish',
    'Wok': 'Maindish',
    'Nudler': 'Maindish',
    'Specialiteter': 'Maindish',
    'Børnemenu': 'Maindish',
    'Drikkevarer': 'Drinks',
    'Øl og drinks': 'Drinks',
    'Vin': 'Drinks',
}
df['product_category'] = df['product_category'].replace(category_groups)

# Fold the individual payment labels into register / table buckets.
payment_groups = {
    'Splitbetaling': 'In-House_register',
    'Cash': 'In-House_register',
    'Card': 'In-House_register',
    'SoftPay': 'In-House_table',
}
df['payment_method'] = df['payment_method'].replace(payment_groups)

# Rows whose product_name is exactly "Takeaway - Pose" get the "Takeaway" category.
is_takeaway_bag = df['product_name'].eq('Takeaway - Pose')
df.loc[is_takeaway_bag, 'product_category'] = 'Takeaway'

# Drop rows whose product_category starts with '/MSG', and rows whose
# payment_method starts with 'Gavekort' or 'Kredit'. Missing values are kept.
is_msg_row = df['product_category'].str.startswith('/MSG', na=False)
is_excluded_payment = df['payment_method'].str.startswith(('Gavekort', 'Kredit'), na=False)
df = df[~(is_msg_row | is_excluded_payment)]

# Numeric columns: anything that cannot be parsed becomes 0.
df['amount'] = pd.to_numeric(df['amount'], errors='coerce').fillna(0)

# line_total may use a comma as decimal separator; swap it for a dot before parsing.
line_total_text = df['line_total'].astype(str).str.replace(',', '.', regex=False)
df['line_total'] = pd.to_numeric(line_total_text, errors='coerce').fillna(0.0)

df.head(10)

Unnamed: 0,date,time,product_name,product_category,amount,line_total,payment_method,store_id,store_name,order_id
0,02.01.2025,12:58,Hjemmelavede forårsruller (#12),Snacks,1.0,79.0,Mealo,0,Arhaan Thai,15064
1,02.01.2025,12:58,Panang Karry (#33),Maindish,1.0,125.0,Mealo,0,Arhaan Thai,15064
3,02.01.2025,12:58,Pad Thai (#41),Maindish,1.0,119.0,Mealo,0,Arhaan Thai,15064
5,02.01.2025,12:58,Takeaway - Pose,Takeaway,1.0,4.0,Mealo,0,Arhaan Thai,15064
6,02.01.2025,15:06,Wok med cashewnødder (#54),Maindish,1.0,135.0,Wolt,0,Arhaan Thai,15065
9,02.01.2025,15:06,Hjemmelavede forårsruller (#12),Snacks,1.0,79.0,Wolt,0,Arhaan Thai,15065
10,02.01.2025,15:06,Satay (#11),Snacks,1.0,69.0,Wolt,0,Arhaan Thai,15065
11,02.01.2025,15:06,Takeaway - Pose,Takeaway,1.0,4.0,Wolt,0,Arhaan Thai,15065
12,02.01.2025,15:25,Tom Yam,Suppe,1.0,89.0,In-House_register,0,Arhaan Thai,6119
14,02.01.2025,15:25,Wontons (#16),Snacks,1.0,59.0,In-House_register,0,Arhaan Thai,6119


In [3]:
# Per-order item counts: one row per order_id, one column per product_category,
# holding the summed `amount` (0 where the order has no item of that category).
category_counts = df.groupby(['order_id', 'product_category'])['amount'].sum().unstack(fill_value=0)

# Output column -> product_category label it is counted from.
# The counts are mapped onto the rows by order_id instead of merging the whole
# pivot table in, so no temporary per-category columns are created and nothing
# has to be dropped by name afterwards (a hard-coded drop list raises KeyError
# as soon as one of the listed categories is absent from the data).
count_columns = {
    'number_of_maindishes': 'Maindish',
    'number_of_snacks': 'Snacks',
    'number_of_drinks': 'Drinks',
    'number_of_soups': 'Suppe',
    'number_of_extras': 'Ekstra',
    'number_of_takeaways': 'Takeaway',
}
for count_column, category in count_columns.items():
    if category in category_counts.columns:
        df[count_column] = df['order_id'].map(category_counts[category])
    else:
        # Category never occurs in this data set.
        df[count_column] = 0

# Add order_total column: sum of line_total over all rows of the same order.
order_totals = df.groupby('order_id')['line_total'].sum()
df['order_total'] = df['order_id'].map(order_totals)

# Parse date (expects 'dd.mm.YYYY'); unparsable values become NaT.
# Skipped when the column is already datetime so that re-running this cell does
# not push ISO-formatted strings through the dd.mm.YYYY format and turn every
# date into NaT.
if not pd.api.types.is_datetime64_any_dtype(df['date']):
    df['date'] = pd.to_datetime(df['date'].astype(str).str.strip(), format='%d.%m.%Y', dayfirst=True, errors='coerce')

# Trim time and create the combined datetime; unparsable combinations become NaT.
df['time'] = df['time'].astype(str).str.strip()
df['datetime'] = pd.to_datetime(df['date'].dt.strftime('%Y-%m-%d').fillna('') + ' ' + df['time'].replace('nan',''), errors='coerce')

# Inspect any rows that failed to parse.
if df['date'].isna().any() or df['datetime'].isna().any():
    print("Date parse failures (sample):")
    print(df.loc[df['date'].isna(), ['date', 'time', 'product_name']].head())
    print(df.loc[df['datetime'].isna(), ['date', 'time', 'product_name']].head())

# Derive day_of_week as the English weekday name of the combined datetime.
df['day_of_week'] = df['datetime'].dt.day_name()

# Drop the store columns, which are not used further on; errors='ignore' keeps
# the cell re-runnable after they have already been removed.
df = df.drop(columns=['store_id', 'store_name'], errors='ignore')



In [4]:
# Weekday name -> number, Monday = 1 through Sunday = 7.
day_mapping = {
    day: number
    for number, day in enumerate(
        ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'),
        start=1,
    )
}
df['day_of_week'] = df['day_of_week'].map(day_mapping)

# Payment channel -> numeric code (1-4, in the order listed).
# Labels that are not keys of the mapping come out of `.map` as NaN.
payment_mapping = {
    channel: code
    for code, channel in enumerate(
        ('Mealo', 'Wolt', 'In-House_table', 'In-House_register'),
        start=1,
    )
}
df['payment_method'] = df['payment_method'].map(payment_mapping)

In [5]:
# Collapse the line items to one row per order.
#
# The grouping key is order_id, the same key used for order_total and the
# number_of_* counts. Grouping by datetime instead would merge distinct orders
# that share a timestamp (keeping only the first one) and would repeat an order
# whose lines carry different timestamps.
#
# Every other column is taken from the first non-null value within the order;
# order_total and the number_of_* columns are constant within an order.
per_order_columns = [
    'datetime',
    'day_of_week',
    'order_total',
    'payment_method',
    'number_of_maindishes',
    'number_of_snacks',
    'number_of_drinks',
    'number_of_soups',
    'number_of_extras',
    'number_of_takeaways',
]
orders_by_id = df.groupby('order_id', as_index=False)[per_order_columns].first()

# Keep the column layout (datetime first, order_id third) and chronological row
# order; orders without a parsable timestamp are kept and sort last.
output_columns = ['datetime', 'day_of_week', 'order_id'] + per_order_columns[2:]
df_single_row = (
    orders_by_id[output_columns]
    .sort_values('datetime', kind='stable')
    .reset_index(drop=True)
)

# Display the resulting DataFrame
df_single_row.head(10)

Unnamed: 0,datetime,day_of_week,order_id,order_total,payment_method,number_of_maindishes,number_of_snacks,number_of_drinks,number_of_soups,number_of_extras,number_of_takeaways
0,2025-01-02 12:58:00,4,15064,327.0,1,2.0,1.0,0.0,0.0,0.0,1.0
1,2025-01-02 15:06:00,4,15065,287.0,2,1.0,2.0,0.0,0.0,0.0,1.0
2,2025-01-02 15:25:00,4,6119,685.0,4,2.0,1.0,1.0,1.0,0.0,0.0
3,2025-01-02 16:19:00,4,6117,109.0,4,0.0,0.0,0.0,1.0,1.0,0.0
4,2025-01-02 16:37:00,4,15067,129.0,1,1.0,0.0,0.0,0.0,0.0,1.0
5,2025-01-02 16:41:00,4,6118,500.0,4,1.0,0.0,0.0,0.0,0.0,0.0
6,2025-01-02 16:46:00,4,15070,321.0,1,2.0,2.0,0.0,0.0,0.0,1.0
7,2025-01-02 16:50:00,4,15071,202.0,1,1.0,1.0,0.0,0.0,0.0,1.0
8,2025-01-02 17:03:00,4,6120,272.7,4,2.0,1.0,0.0,0.0,0.0,0.0
9,2025-01-02 17:16:00,4,15074,182.0,2,2.0,0.0,0.0,0.0,0.0,1.0
