In [1]:
import pandas as pd
import numpy as np
from faker import Faker

In [3]:
# Seed every random source used by this notebook so that a fresh
# "Restart Kernel -> Run All" regenerates exactly the same dataset.
SEED = 42

fake = Faker()
# Faker draws from its own random generator, independent of NumPy's; without
# this call the fake.city() / fake.lexify() values differ on every run.
Faker.seed(SEED)
np.random.seed(SEED)

In [9]:
# Build a synthetic sales dataset with one row per consecutive calendar day.
N_ROWS = 1000  # number of rows (= days) to generate; every column below uses it

data = {
    'Date': pd.date_range(start='2022-04-14', periods=N_ROWS, freq='D'),
    # "P" followed by a random integer in [100, 200)
    'ProductID': [f"P{np.random.randint(100,200)}" for _ in range(N_ROWS)],
    'Region': [fake.city() for _ in range(N_ROWS)],
    # random integers in [10, 100)
    'Sales': np.random.randint(10, 100, N_ROWS),
    'Price': np.random.uniform(5, 100, N_ROWS).round(2),
    'Cost': np.random.uniform(3, 80, N_ROWS).round(2),
    # A row gets a "PROMO_xxx" code when the uniform draw exceeds 0.7; otherwise None.
    'PromotionCode': [fake.lexify('PROMO_???') if np.random.rand()>0.7 else None for _ in range(N_ROWS)]
}
df = pd.DataFrame(data)

# Profit = Sales * Price - Cost, rounded to 2 decimals.
# NOTE(review): Cost is subtracted once per row, not once per unit of Sales. If Cost
# is meant to be a per-unit cost this should be Sales * (Price - Cost) -- confirm intent.
df['Profit'] = ((df['Sales']*df['Price'])-df['Cost']).round(2)
#df.to_csv('sales_data.csv', index=False)  # uncomment to persist the generated dataset
df

Unnamed: 0,Date,ProductID,Region,Sales,Price,Cost,PromotionCode,Profit
0,2022-04-14,P182,North Lucasport,96,93.00,12.25,,8915.75
1,2022-04-15,P108,Fredport,85,10.08,22.05,PROMO_vcr,834.75
2,2022-04-16,P126,East Richardfurt,81,23.70,64.46,,1855.24
3,2022-04-17,P148,North Paula,62,72.05,13.21,,4453.89
4,2022-04-18,P178,East Sarahland,94,59.87,79.55,,5548.23
...,...,...,...,...,...,...,...,...
995,2025-01-03,P140,Hendersonfort,61,74.30,50.83,,4481.47
996,2025-01-04,P156,West Brandi,32,5.78,4.49,,180.47
997,2025-01-05,P141,North Benjamin,15,74.75,30.40,,1090.85
998,2025-01-06,P152,South Perry,45,17.46,10.71,,774.99


In [18]:
## 1. Sales trend by date
# Total Sales per Date, listed from the highest total to the lowest.
(
    df.groupby('Date')['Sales']
      .sum()
      .sort_values(ascending=False)
      .to_frame()
)

Unnamed: 0_level_0,Sales
Date,Unnamed: 1_level_1
2024-01-16,99
2022-08-18,99
2022-09-25,99
2024-05-22,99
2024-08-19,99
...,...
2023-06-11,10
2023-11-09,10
2022-05-21,10
2024-08-28,10


In [19]:
## 2. Sales trend by Region
# Sum Sales within each Region, order regions from the highest total to the
# lowest, then move Region out of the index into an ordinary column.
(
    df.groupby('Region')['Sales']
      .sum()
      .sort_values(ascending=False)
      .reset_index()
)

Unnamed: 0,Region,Sales
0,North Matthew,203
1,Lake Deborahshire,178
2,Port Michael,174
3,Lake Michael,168
4,South Thomas,160
...,...,...
961,Port Kristopher,10
962,Port Patriciahaven,10
963,Mossview,10
964,Timothybury,10


In [47]:
## 3. Most profitable region using pivot_table()
# Region x ProductID matrix holding the summed Profit of each pair;
# pairs that never occur in df are filled with 0.
pivotProfit = pd.pivot_table(
    df,
    index="Region",
    columns="ProductID",
    values="Profit",
    aggfunc="sum",
    fill_value=0,
)

In [48]:
# Show the Region x ProductID profit matrix (pandas abbreviates the view with "...").
pivotProfit

ProductID,P100,P101,P102,P103,P104,P105,P106,P107,P108,P109,...,P190,P191,P192,P193,P194,P195,P196,P197,P198,P199
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaronfurt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
Aaronport,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
Aaronville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
Adamsstad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
Adkinsstad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wilsonville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4876.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
Wongburgh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
Woodsfurt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
Wrightstad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,4105.32,0.0,0.0,0.0


In [53]:
# Five regions with the largest total Profit, as a one-column frame.
mostProfitableRegion = (
    pivotProfit
    .sum(axis="columns")            # add across all ProductID columns -> one total per Region
    .sort_values(ascending=False)
    .head(5)
    .to_frame()
)
mostProfitableRegion

Unnamed: 0_level_0,0
Region,Unnamed: 1_level_1
West Jamesmouth,12615.15
North Matthew,11086.1
Port Timothy,10575.0
Lake Sarah,10286.21
Anthonyborough,9720.85


In [54]:
# Five products with the largest total Profit (kept as a Series, shown as a frame).
mostProfitableProduct = (
    pivotProfit
    .sum(axis="index")              # add down all Region rows -> one total per ProductID
    .sort_values(ascending=False)
    .head(5)
)
mostProfitableProduct.to_frame()

Unnamed: 0_level_0,0
ProductID,Unnamed: 1_level_1
P141,60510.82
P172,55879.98
P113,55296.16
P170,52990.22
P140,52593.4


In [60]:
## 4. Filter based on certain conditions
# Keep only the rows whose Sales exceed 97 and that carry a promotion code.
highSalesPromo = df.loc[df['Sales'].gt(97) & df['PromotionCode'].notna()]
highSalesPromo

Unnamed: 0,Date,ProductID,Region,Sales,Price,Cost,PromotionCode,Profit
123,2022-08-15,P155,Port Jodistad,99,8.9,33.4,PROMO_rlh,847.7
247,2022-12-17,P179,Stephanieside,98,66.46,5.15,PROMO_aMo,6507.93
346,2023-03-26,P198,Riosmouth,98,74.06,31.41,PROMO_bjN,7226.47
425,2023-06-13,P157,West Amanda,98,70.79,69.51,PROMO_fVh,6867.91
450,2023-07-08,P199,New Melissa,98,77.82,47.6,PROMO_SCU,7578.76
557,2023-10-23,P140,Jasonbury,99,7.2,6.92,PROMO_cbw,705.88
636,2024-01-10,P189,West Keith,99,22.45,33.74,PROMO_BJJ,2188.81
950,2024-11-19,P120,Port Joseville,98,54.78,7.55,PROMO_Fdz,5360.89


In [67]:
## 5. Discounted Price if promo exist
# Rows without a promotion code keep Price unchanged; rows with one get
# 25% of Price subtracted. The result is rounded to 2 decimals.
df['Discounted Price'] = np.where(
    df['PromotionCode'].isna(),
    df['Price'],
    df['Price']-df['Price']*0.25,
).round(2)

In [68]:
# Re-display the frame to check the newly added 'Discounted Price' column.
df

Unnamed: 0,Date,ProductID,Region,Sales,Price,Cost,PromotionCode,Profit,Discounted Price
0,2022-04-14,P182,North Lucasport,96,93.00,12.25,,8915.75,93.00
1,2022-04-15,P108,Fredport,85,10.08,22.05,PROMO_vcr,834.75,7.56
2,2022-04-16,P126,East Richardfurt,81,23.70,64.46,,1855.24,23.70
3,2022-04-17,P148,North Paula,62,72.05,13.21,,4453.89,72.05
4,2022-04-18,P178,East Sarahland,94,59.87,79.55,,5548.23,59.87
...,...,...,...,...,...,...,...,...,...
995,2025-01-03,P140,Hendersonfort,61,74.30,50.83,,4481.47,74.30
996,2025-01-04,P156,West Brandi,32,5.78,4.49,,180.47,5.78
997,2025-01-05,P141,North Benjamin,15,74.75,30.40,,1090.85,74.75
998,2025-01-06,P152,South Perry,45,17.46,10.71,,774.99,17.46
