In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw invoice export and preview its first five rows
df = pd.read_csv(filepath_or_buffer='/content/invoices.csv')
display(df.head(n=5))

Unnamed: 0,first_name,last_name,email,product_id,qty,amount,invoice_date,address,city,stock_code,job
0,Carmen Nixon,Todd Anderson,marvinjackson@example.com,133,9,14.57,10/09/1982,283 Wendy Common,West Alexander,36239634,Logistics and distribution manager
1,Mrs. Heather Miller,Julia Moore,jeffrey84@example.net,155,5,65.48,03/10/2012,13567 Patricia Circles Apt. 751,Andreamouth,2820163,Osteopath
2,Crystal May,Philip Moody,ugoodman@example.com,151,9,24.66,23/03/1976,6389 Debbie Island Suite 470,Coxbury,27006726,Economist
3,Bobby Weber,Mark Scott,ssanchez@example.com,143,4,21.34,17/08/1986,6362 Ashley Plaza Apt. 994,Ninaland,83036521,Sports administrator
4,Kristen Welch,David David,cynthia66@example.net,168,2,83.9,11/06/1996,463 Steven Cliffs Suite 757,Isaiahview,80142652,Chief Marketing Officer


In [None]:
# Step 2: Compute unit price for each transaction
# A zero quantity would make the division yield inf (or NaN for 0/0), and a
# single inf would poison the per-product mean computed from this column.
# Mask zero quantities to NaN so those rows get a missing unit_price instead.
nonzero_qty = df["qty"].where(df["qty"] != 0)
df["unit_price"] = df["amount"] / nonzero_qty

In [None]:
# Step 3: Compute avg price for each product_id
# Mean unit price per product, as a two-column frame (product_id, avg_price)
avg_price_per_product = (
    df.groupby("product_id")["unit_price"]
    .mean()
    .rename("avg_price")
    .reset_index()
)

In [None]:
# Step 4: Merge avg_price back to main DataFrame
# Drop any avg_price left over from a previous run of this cell first; otherwise
# re-running it would create avg_price_x / avg_price_y and break the later steps.
# validate="many_to_one" makes the merge fail loudly if the lookup table ever
# contains duplicate product_id keys, which would silently duplicate invoice rows.
df = df.drop(columns="avg_price", errors="ignore").merge(
    avg_price_per_product,
    on="product_id",
    how="left",
    validate="many_to_one",
)

In [None]:
# Step 5: Derive fixed cost & list prices
# Cost is 75% of the product's average price, list price is 125% of it
df["cost_price"] = df["avg_price"].mul(0.75)
df["list_price"] = df["avg_price"].mul(1.25)

In [None]:
# Step 6: Derive discount % and margin %
# Discount: how far the unit price sits below list price, as a % of list price
df["discount_pct"] = (
    df["list_price"].sub(df["unit_price"]).div(df["list_price"]).mul(100)
)
# Margin: how far the unit price sits above cost price, as a % of cost price
df["margin_pct"] = (
    df["unit_price"].sub(df["cost_price"]).div(df["cost_price"]).mul(100)
)

In [None]:
#  Step 7: Ensure discount_pct is never negative
# A negative value means the item sold above list price, i.e. no discount was
# given. Taking the absolute value would report that premium as a discount
# (possibly above 100%), so floor the column at 0 instead.
df["discount_pct"] = df["discount_pct"].clip(lower=0)

In [None]:
# Step 8: Optional rounding
# Round every derived price / percentage column to two decimals
cols = [
    "unit_price",
    "avg_price",
    "cost_price",
    "list_price",
    "discount_pct",
    "margin_pct",
]
df[cols] = df[cols].round(decimals=2)

In [None]:
# Step 11: Add 20 random salespeople
# Zero-padded labels Salesperson_01 through Salesperson_20
salespeople = list(f"Salesperson_{i:02d}" for i in range(1, 21))

In [None]:
# Random assignment
# Seed the global NumPy RNG so the draws below are reproducible
np.random.seed(seed=42)
# Draw one salesperson label (with replacement) for every row
df["salesperson"] = np.random.choice(salespeople, size=df.shape[0])

In [None]:
# Step 13: Add invoice_delay column
# Re-seed the global NumPy RNG so the delay draws are reproducible
np.random.seed(seed=42)

In [None]:
# Generate one uniform random draw in [0, 1) per row
rand_probs = np.random.rand(df.shape[0])

In [None]:
# Initialize column: every row starts with zero delay
df["invoice_delay"] = np.zeros(len(df), dtype="int64")

In [None]:
# Rows whose draw exceeds 0.4 (roughly 60%) get a random integer delay from 1 to 10;
# the remaining rows (roughly 40%) keep a delay of 0
delayed = rand_probs > 0.4
df.loc[delayed, "invoice_delay"] = np.random.randint(1, 11, size=delayed.sum())

In [None]:
# Preview the first 20 rows of the enriched frame
df.head(n=20)

Unnamed: 0,first_name,last_name,email,product_id,qty,amount,invoice_date,address,city,stock_code,job,unit_price,avg_price,cost_price,list_price,discount_pct,margin_pct,salesperson,invoice_delay
0,Carmen Nixon,Todd Anderson,marvinjackson@example.com,133,9,14.57,10/09/1982,283 Wendy Common,West Alexander,36239634,Logistics and distribution manager,1.62,17.16,12.87,21.45,92.45,-87.42,Salesperson_07,0
1,Mrs. Heather Miller,Julia Moore,jeffrey84@example.net,155,5,65.48,03/10/2012,13567 Patricia Circles Apt. 751,Andreamouth,2820163,Osteopath,13.1,15.41,11.55,19.26,31.99,13.34,Salesperson_20,7
2,Crystal May,Philip Moody,ugoodman@example.com,151,9,24.66,23/03/1976,6389 Debbie Island Suite 470,Coxbury,27006726,Economist,2.74,13.51,10.13,16.89,83.77,-72.96,Salesperson_15,4
3,Bobby Weber,Mark Scott,ssanchez@example.com,143,4,21.34,17/08/1986,6362 Ashley Plaza Apt. 994,Ninaland,83036521,Sports administrator,5.34,15.28,11.46,19.1,72.07,-53.44,Salesperson_11,2
4,Kristen Welch,David David,cynthia66@example.net,168,2,83.9,11/06/1996,463 Steven Cliffs Suite 757,Isaiahview,80142652,Chief Marketing Officer,41.95,15.87,11.9,19.84,111.49,252.48,Salesperson_08,0
5,Jamie Garcia,Jonathan Tate,luis57@example.net,192,6,55.67,01/09/2016,54443 Kyle Haven Apt. 230,Millsside,53234087,"Therapist, sports",9.28,12.37,9.28,15.47,40.01,-0.01,Salesperson_07,0
6,Amanda Hicks,Stephanie Huff,hendersontracey@example.org,153,4,77.57,17/04/1993,5600 Hannah Vista,Port Susan,30928602,Special educational needs teacher,19.39,13.85,10.39,17.32,11.98,86.64,Salesperson_19,0
7,James Parker,Dustin Hanson Jr.,ffowler@example.net,107,3,86.5,27/09/1986,9076 Garrett Well,New Heatherfort,45734816,Textile designer,28.83,14.9,11.17,18.62,54.86,158.1,Salesperson_11,5
8,Susan Smith,Michael Campbell,thomaspatricia@example.net,149,1,53.73,01/11/1997,1684 John Circles,South Kim,13834685,Psychiatrist,53.73,16.71,12.53,20.88,157.27,328.78,Salesperson_11,3
9,Kylie Campos,Samantha Mendez,danny61@example.com,108,5,91.71,23/03/1994,15898 Jeanne Inlet Suite 973,West Patricia,92548268,Radio broadcast assistant,18.34,15.47,11.6,19.34,5.16,58.07,Salesperson_04,9


In [None]:
# Step 14: Save final version
# Write the enriched frame to CSV without the DataFrame index
df.to_csv(path_or_buf="invoices_enriched_final.csv", index=False)