In [1]:
import pandas as pd
# Column labels for the tab-separated sales export; they are handed to
# read_csv explicitly through `names`.
names = [
    'date',
    'time',
    'product_name',
    'product_category',
    'amount',
    'line_total',
    'payment_method',
    'store_id',
    'store_name',
    'order_id',
]

# Load the raw line items, then show a five-row preview followed by the
# dtype / non-null summary.
df = pd.read_csv('data/sales_data_1.csv', sep='\t', names=names)
print(df.head())
df.info()

         date   time                     product_name product_category  \
0  02.01.2025  12:58  Hjemmelavede forårsruller (#12)           Snacks   
1  02.01.2025  12:58               Panang Karry (#33)            Karry   
2  02.01.2025  12:58                          Kylling         /MSG Mad   
3  02.01.2025  12:58                   Pad Thai (#41)           Nudler   
4  02.01.2025  12:58                          Kylling         /MSG Mad   

   amount line_total payment_method  store_id   store_name  order_id  
0     1.0      79,00          Mealo         0  Arhaan Thai     15064  
1     1.0     125,00          Mealo         0  Arhaan Thai     15064  
2     1.0       0,00          Mealo         0  Arhaan Thai     15064  
3     1.0     119,00          Mealo         0  Arhaan Thai     15064  
4     1.0       0,00          Mealo         0  Arhaan Thai     15064  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61873 entries, 0 to 61872
Data columns (total 10 columns):
 #   Column         

In [2]:
# Collapse the fine-grained menu categories into two broader buckets.
# A single mapping is equivalent to two sequential replace() calls here
# because no replacement value is itself a key.
category_map = {
    'Karry': 'Maindish',
    'Wok': 'Maindish',
    'Nudler': 'Maindish',
    'Drikkevarer': 'Drinks',
    'Øl og drinks': 'Drinks',
    'Vin': 'Drinks',
}
df['product_category'] = df['product_category'].replace(category_map)

# Remove rows whose product_category starts with '/MSG' (the filter is on the
# category column, not product_name). Rows with a missing category are kept
# because na=False makes startswith() return False for them.
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the original (avoids SettingWithCopyWarning).
df = df[~df['product_category'].str.startswith('/MSG', na=False)].copy()

# Ensure numeric types; values that cannot be parsed become 0.
df['amount'] = pd.to_numeric(df['amount'], errors='coerce').fillna(0)

# line_total uses a comma as decimal separator ('79,00'): swap it for a dot
# before converting. Values that still cannot be parsed become 0.0.
# NOTE(review): a value with a thousands separator such as '1.250,00' would
# turn into '1.250.00' and be coerced to 0.0 - confirm the export has none.
df['line_total'] = pd.to_numeric(
    df['line_total'].astype(str).str.replace(',', '.', regex=False),
    errors='coerce',
).fillna(0.0)

df.head(10)

Unnamed: 0,date,time,product_name,product_category,amount,line_total,payment_method,store_id,store_name,order_id
0,02.01.2025,12:58,Hjemmelavede forårsruller (#12),Snacks,1.0,79.0,Mealo,0,Arhaan Thai,15064
1,02.01.2025,12:58,Panang Karry (#33),Maindish,1.0,125.0,Mealo,0,Arhaan Thai,15064
3,02.01.2025,12:58,Pad Thai (#41),Maindish,1.0,119.0,Mealo,0,Arhaan Thai,15064
5,02.01.2025,12:58,Takeaway - Pose,Ekstra,1.0,4.0,Mealo,0,Arhaan Thai,15064
6,02.01.2025,15:06,Wok med cashewnødder (#54),Maindish,1.0,135.0,Wolt,0,Arhaan Thai,15065
9,02.01.2025,15:06,Hjemmelavede forårsruller (#12),Snacks,1.0,79.0,Wolt,0,Arhaan Thai,15065
10,02.01.2025,15:06,Satay (#11),Snacks,1.0,69.0,Wolt,0,Arhaan Thai,15065
11,02.01.2025,15:06,Takeaway - Pose,Ekstra,1.0,4.0,Wolt,0,Arhaan Thai,15065
12,02.01.2025,15:25,Tom Yam,Suppe,1.0,89.0,Card,0,Arhaan Thai,6119
14,02.01.2025,15:25,Wontons (#16),Snacks,1.0,59.0,Card,0,Arhaan Thai,6119


In [None]:
# Add columns for number of items per category per order.
# category_counts: one row per order_id, one column per product_category,
# holding the summed `amount` (0 where the order has no item of that category).
# NOTE(review): orders are keyed by order_id alone here, while the aggregation
# cell further down also groups by date/time/payment_method - confirm that
# order_id is unique across dates, stores and payment channels.
category_counts = (
    df.groupby(['order_id', 'product_category'])['amount']
    .sum()
    .unstack(fill_value=0)
)

# Make the cell safe to re-run: if it already ran, df still carries the
# category columns from the previous merge. Merging again would first create
# '*_count' duplicates and, on the run after that, make the suffixing produce
# duplicate column names (rejected with MergeError by recent pandas).
# Dropping the stale columns first keeps every run equivalent to the first.
# Assumes no category shares its name with one of the raw input columns.
if 'number_of_maindishes' in df.columns:
    df = df.drop(columns=[col for col in category_counts.columns if col in df.columns])

df = df.merge(category_counts, left_on='order_id', right_index=True, how='left', suffixes=('', '_count'))

# Friendly per-order count columns; a category that never occurs in the data
# yields a constant 0 column instead of a KeyError.
count_columns = {
    'number_of_maindishes': 'Maindish',
    'number_of_snacks': 'Snacks',
    'number_of_drinks': 'Drinks',
    'number_of_soups': 'Suppe',
    'number_of_extras': 'Ekstra',
}
for target, category in count_columns.items():
    df[target] = df[category] if category in df.columns else 0

# Add order_total column: sum of line_total over all rows of the same order_id.
order_totals = df.groupby('order_id')['line_total'].sum()
df['order_total'] = df['order_id'].map(order_totals)

# Add day_of_week column: dates are 'DD.MM.YYYY' strings; strip stray
# whitespace before parsing, then take the English weekday name.
df['day_of_week'] = pd.to_datetime(df['date'].astype(str).str.strip(), format='%d.%m.%Y').dt.day_name()


Unnamed: 0,date,time,product_name,product_category,amount,line_total,payment_method,store_id,store_name,order_id,...,Snacks,Specialiteter,Suppe,number_of_maindishes,number_of_snacks,number_of_drinks,number_of_soups,number_of_extras,order_total,day_of_week
0,02.01.2025,12:58,Hjemmelavede forårsruller (#12),Snacks,1.0,79.0,Mealo,0,Arhaan Thai,15064,...,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,327.0,Thursday
1,02.01.2025,12:58,Panang Karry (#33),Maindish,1.0,125.0,Mealo,0,Arhaan Thai,15064,...,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,327.0,Thursday
3,02.01.2025,12:58,Pad Thai (#41),Maindish,1.0,119.0,Mealo,0,Arhaan Thai,15064,...,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,327.0,Thursday
5,02.01.2025,12:58,Takeaway - Pose,Ekstra,1.0,4.0,Mealo,0,Arhaan Thai,15064,...,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,327.0,Thursday
6,02.01.2025,15:06,Wok med cashewnødder (#54),Maindish,1.0,135.0,Wolt,0,Arhaan Thai,15065,...,2.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,287.0,Thursday
9,02.01.2025,15:06,Hjemmelavede forårsruller (#12),Snacks,1.0,79.0,Wolt,0,Arhaan Thai,15065,...,2.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,287.0,Thursday
10,02.01.2025,15:06,Satay (#11),Snacks,1.0,69.0,Wolt,0,Arhaan Thai,15065,...,2.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,287.0,Thursday
11,02.01.2025,15:06,Takeaway - Pose,Ekstra,1.0,4.0,Wolt,0,Arhaan Thai,15065,...,2.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,287.0,Thursday
12,02.01.2025,15:25,Tom Yam,Suppe,1.0,89.0,Card,0,Arhaan Thai,6119,...,1.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,685.0,Thursday
14,02.01.2025,15:25,Wontons (#16),Snacks,1.0,59.0,Card,0,Arhaan Thai,6119,...,1.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,685.0,Thursday


In [5]:
# Display the first 17 rows of df.
# NOTE(review): the stored output below lists only 10 rows, so it appears to
# predate this call - re-run the cell to refresh it.
df.head(17)

Unnamed: 0,date,time,product_name,product_category,amount,line_total,payment_method,store_id,store_name,order_id,...,Snacks,Specialiteter,Suppe,number_of_maindishes,number_of_snacks,number_of_drinks,number_of_soups,number_of_extras,order_total,day_of_week
0,02.01.2025,12:58,Hjemmelavede forårsruller (#12),Snacks,1.0,79.0,Mealo,0,Arhaan Thai,15064,...,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,327.0,Thursday
1,02.01.2025,12:58,Panang Karry (#33),Maindish,1.0,125.0,Mealo,0,Arhaan Thai,15064,...,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,327.0,Thursday
3,02.01.2025,12:58,Pad Thai (#41),Maindish,1.0,119.0,Mealo,0,Arhaan Thai,15064,...,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,327.0,Thursday
5,02.01.2025,12:58,Takeaway - Pose,Ekstra,1.0,4.0,Mealo,0,Arhaan Thai,15064,...,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,327.0,Thursday
6,02.01.2025,15:06,Wok med cashewnødder (#54),Maindish,1.0,135.0,Wolt,0,Arhaan Thai,15065,...,2.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,287.0,Thursday
9,02.01.2025,15:06,Hjemmelavede forårsruller (#12),Snacks,1.0,79.0,Wolt,0,Arhaan Thai,15065,...,2.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,287.0,Thursday
10,02.01.2025,15:06,Satay (#11),Snacks,1.0,69.0,Wolt,0,Arhaan Thai,15065,...,2.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,287.0,Thursday
11,02.01.2025,15:06,Takeaway - Pose,Ekstra,1.0,4.0,Wolt,0,Arhaan Thai,15065,...,2.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,287.0,Thursday
12,02.01.2025,15:25,Tom Yam,Suppe,1.0,89.0,Card,0,Arhaan Thai,6119,...,1.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,685.0,Thursday
14,02.01.2025,15:25,Wontons (#16),Snacks,1.0,59.0,Card,0,Arhaan Thai,6119,...,1.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,685.0,Thursday


In [4]:

# Aggregate line items to one row per (order, category, date, time, payment
# method), summing both the item count and the revenue.
# The key list is defined once and both value columns are summed in a single
# groupby pass, instead of two identical groupbys joined back together.
order_keys = ['order_id', 'product_category', 'date', 'time', 'payment_method']
combined = df.groupby(order_keys)[['amount', 'line_total']].sum().reset_index()

# Kept so that any other cell still referring to the per-measure frames keeps
# working; they hold the same rows and columns as before.
groupedA = combined[order_keys + ['amount']]
groupedB = combined[order_keys + ['line_total']]

print(combined.head(10))

   order_id product_category        date   time payment_method  amount  \
0       913           Drinks  03.01.2025  18:57        SoftPay     2.0   
1       913         Maindish  03.01.2025  18:57        SoftPay     2.0   
2       914           Drinks  03.01.2025  19:18        SoftPay     2.0   
3       914         Maindish  03.01.2025  19:18        SoftPay     2.0   
4       914           Snacks  03.01.2025  19:18        SoftPay     1.0   
5       915           Drinks  03.01.2025  20:51        SoftPay     2.0   
6       915           Snacks  03.01.2025  20:51        SoftPay     1.0   
7       915            Suppe  03.01.2025  20:51        SoftPay     1.0   
8       916           Drinks  03.01.2025  20:52        SoftPay     2.0   
9       916           Snacks  03.01.2025  20:52        SoftPay     2.0   

   line_total  
0       119.0  
1       238.0  
2        99.0  
3       244.0  
4        69.0  
5       120.0  
6        59.0  
7        89.0  
8       148.0  
9        89.0  
