# PROMOTION BUMP ASSIGNMENT
**Date:** 2024-09-19  
**Data Scientist:** Botan Fırat BULUT

## IMPORTS AND SETUP

In [1]:
# Imports:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignoner harmless warnings:
warnings.filterwarnings('ignore')

# Plot rcParams:
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['figure.dpi'] = 144
plt.rcParams['figure.titlesize'] = 22
plt.rcParams['figure.titleweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['axes.titleweight'] = 'bold'
plt.style.use('ggplot')

### Data = assignment4.1a.csv
The data contains daily sales of sample of items in several stores on a  
specific time frame. Negative sale quantities represents returns. Each row  
represents a sale (or return) activity for an item in a store at a specific day.  
If a store-item combination has no observation in a certain day you can  
assume there is no sales for that item at that store at that day.

In [2]:
# Read assignment4.1a.csv:
df1a = pd.read_csv('./assignment4.1a.csv')
df1a.head()

Unnamed: 0,Date,StoreCode,ProductCode,SalesQuantity
0,2015-01-01,8,9,-1
1,2015-01-01,131,9,1
2,2015-01-01,144,9,2
3,2015-01-01,203,9,2
4,2015-01-01,256,9,0


In [3]:
# df1a information:
df1a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1873618 entries, 0 to 1873617
Data columns (total 4 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   Date           object
 1   StoreCode      int64 
 2   ProductCode    int64 
 3   SalesQuantity  int64 
dtypes: int64(3), object(1)
memory usage: 57.2+ MB


In [4]:
# df1a null values:
df1a.isnull().sum()

Date             0
StoreCode        0
ProductCode      0
SalesQuantity    0
dtype: int64

The assignment4.1a.csv contains 3 columns with 1,873,618 records. It does  
not contain any missing values.

### Data=Promotiondates.csv
The data contains beginning and the end dates of 6 promotions that took
place in 2015.  

In [5]:
df_promotion = pd.read_csv('./PromotionDates.csv')
df_promotion

Unnamed: 0,Period,StartDate,EndDate
0,Promo1,2/10/2015,2/17/2015
1,Promo2,3/15/2015,3/22/2015
2,Promo3,5/24/2015,6/1/2015
3,Promo4,6/21/2015,6/28/2015
4,Promo5,1/9/2015,6/9/2015
5,Promo6,20/11/2015,27/11/2015


### TASK A

Your goal is to model the effect of promotion on products and stores. At this  
stage only use the data in the file Assignment4.1a.csv and base your model  
using the first 4 promotions.

In [6]:
# convert dataframe Date column from object to datetime:
df1a['Date'] = pd.to_datetime(df1a['Date'],
	format='%Y-%m-%d')

# Promotion data's last 2 rows contain day first datetime format.
# We fix this manually since there are only 4 wrong entries:
df_promotion.at[4, 'StartDate'] = '9/1/2015'
df_promotion.at[4, 'EndDate'] = '9/6/2015'
df_promotion.at[5, 'StartDate'] = '11/20/2015'
df_promotion.at[5, 'EndDate'] = '11/27/2015'

In [7]:
# df1a date column datetype:
df1a['Date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1873618 entries, 0 to 1873617
Series name: Date
Non-Null Count    Dtype         
--------------    -----         
1873618 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 14.3 MB


In [8]:
# Check the update promotion data:
df_promotion

Unnamed: 0,Period,StartDate,EndDate
0,Promo1,2/10/2015,2/17/2015
1,Promo2,3/15/2015,3/22/2015
2,Promo3,5/24/2015,6/1/2015
3,Promo4,6/21/2015,6/28/2015
4,Promo5,9/1/2015,9/6/2015
5,Promo6,11/20/2015,11/27/2015


In [9]:
# We create ISO calender week column for groupping:
df1a['WeekNumber'] = df1a['Date'].dt.isocalendar().week
df1a['WeekNumber'].value_counts()

WeekNumber
12    75285
10    69697
8     68661
18    68488
7     67939
11    67328
14    66865
17    66494
9     66165
15    65999
16    65900
20    65844
19    64510
5     64229
21    62328
23    61392
6     61231
13    60921
26    59211
2     59069
24    58247
3     57745
22    56793
28    56045
4     55768
25    55147
30    55010
29    54824
27    49482
31    42576
1     24425
Name: count, dtype: Int64

In [31]:
# Convert promotion data columns to datetype:
df_promotion['StartDate'] = pd.to_datetime(df_promotion['StartDate'],
                                           format='%m/%d/%Y')

df_promotion['EndDate'] = pd.to_datetime(df_promotion['EndDate'],
                                         format='%m/%d/%Y')

# Check the updated promotion data information:
df_promotion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Period           6 non-null      object        
 1   StartDate        6 non-null      datetime64[ns]
 2   EndDate          6 non-null      datetime64[ns]
 3   StartWeekNumber  6 non-null      UInt32        
 4   EndWeekNumber    6 non-null      UInt32        
dtypes: UInt32(2), datetime64[ns](2), object(1)
memory usage: 332.0+ bytes


In [32]:
# Obtain week numbers of promotion start and end dates:
df_promotion['StartWeekNumber'] = (df_promotion['StartDate']
                                   .dt.isocalendar().week)

df_promotion['EndWeekNumber'] = df_promotion['EndDate']\
	.dt.isocalendar().week

df_promotion

Unnamed: 0,Period,StartDate,EndDate,StartWeekNumber,EndWeekNumber
0,Promo1,2015-02-10,2015-02-17,7,8
1,Promo2,2015-03-15,2015-03-22,11,12
2,Promo3,2015-05-24,2015-06-01,21,23
3,Promo4,2015-06-21,2015-06-28,25,26
4,Promo5,2015-09-01,2015-09-06,36,36
5,Promo6,2015-11-20,2015-11-27,47,48


In [12]:
# Describe Date:
df1a['Date'].describe().T

count                          1873618
mean     2015-04-14 10:09:35.587126528
min                2015-01-01 00:00:00
25%                2015-02-23 00:00:00
50%                2015-04-13 00:00:00
75%                2015-06-03 00:00:00
max                2015-07-31 00:00:00
Name: Date, dtype: object

In [13]:
# Describe SalesQuantity:
pd.options.display.float_format = '{:.4f}'.format
df1a['SalesQuantity']\
	.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9]).T

count   1873618.0000
mean          2.2466
std           5.0290
min         -60.0000
10%           0.0000
25%           0.0000
50%           1.0000
75%           2.0000
90%           4.0000
max         912.0000
Name: SalesQuantity, dtype: float64

- There are 340 unique store codes. Store 331 has  the most records with  
value of 17470.
- There are 317 unique product codes. Prdouct 149 has the most records with  
value of 54060
- Data date range is from 2015-01-01 to 2015-07-31.  
- Sales Quantity has mean value of 2.2466 with std 5.290. Most of the values  
are in the range 0 to 4. We observe a extereme value of 912.
- Assignment4.1a.csv data does not contain any values  regarding promo5 and  
promo6 date range.
- Most of the records observed in Promo2 weeks.  

Divide product and stores  into 3 clusters each. Product with higher  
average weekly sale per store during non-promotion periods will be called  
“Fast items” and items with lower weekly average sale per store will be  
labeled as “Slow items”, items in between will be called “Medium items”.   
Grouping parameters selection is left to you. Apply similar approach to  
Stores as well.

In [14]:
def fill_missing_date(target_df: pd.DataFrame,
					  start: str, end: str) -> pd.DataFrame:
    """
	Fills the missing date sales information
	and returns the updated dataframe.
	"""
    
    # Create a complete date range:
    date_range = pd.date_range(start=start, end=end)

    # Ensure the 'Date' column is in datetime format:
    target_df['Date'] = pd.to_datetime(target_df['Date'], format='%Y-%m-%d')

    # Group by 'StoreCode' and 'ProductCode',
	# and reindex with the complete date range
    filled_df = (target_df
                 .set_index(['StoreCode', 'ProductCode', 'Date'])
                 .reindex(pd.MultiIndex.from_product(
                     [target_df['StoreCode'].unique(), 
                      target_df['ProductCode'].unique(), 
                      date_range],
                     names=['StoreCode', 'ProductCode', 'Date']
                 ))
                 .reset_index())

    # Fill missing values
    filled_df['SalesQuantity'] = filled_df['SalesQuantity'].fillna(0)

    return filled_df

In [15]:
df1a = fill_missing_date(target_df=df1a,
						start='2015-01-01',
 						end='2015-07-31')

df1a['WeekNumber'] = df1a['Date'].dt.isocalendar().week

In [16]:
# Merge promotion information with sales information:
def promo_and_sales_merger(target_df: pd.DataFrame,
                           promo_df: pd.DataFrame) -> None:
    """
    Add flags to target_df using promotion information.
    """

    # Loop through each promotion date:
    for _, promo_row in promo_df.iterrows():

        promo = promo_row['Period']
        start_week = promo_row['StartWeekNumber']
        end_week = promo_row['EndWeekNumber']

        # Obtain target weeks:
        target_weeks = [start_week, end_week]

        # Add a new column for the promotion flag:
        target_df[f'{promo}_yes'] = 0

        # Flag promo weeks using vectorized boolean indexing:
        target_df.loc[target_df['WeekNumber']\
            .isin(target_weeks), f'{promo}_yes'] = 1

    return None

In [17]:
# Run function:
promo_and_sales_merger(target_df=df1a,
					  promo_df=df_promotion)

Unnamed: 0,Promo1_yes,Promo2_yes,Promo3_yes,Promo4_yes
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
22849355,0,0,0,0
22849356,0,0,0,0
22849357,0,0,0,0
22849358,0,0,0,0


In [19]:
# Obtain promotion periods from df1a: 
promo_indices = []

# Update promo cols:
promo_cols = [f'Promo{x}_yes' for x in range(1, 5)]
for promo in promo_cols:
	promo_indices\
		.extend(df1a[df1a[promo] == True].index.tolist())

# Generate no promotion data:
df1a_no_promo = df1a.drop(index=promo_indices)

In [20]:
# Validate no promotion:
df1a_no_promo[promo_cols].sum()

Promo1_yes    0
Promo2_yes    0
Promo3_yes    0
Promo4_yes    0
dtype: int64

In [21]:
weekly_sales_by_product = df1a_no_promo[
	['WeekNumber', 'ProductCode', 'SalesQuantity']
	].groupby(['WeekNumber', 'ProductCode'])\
		.sum().reset_index()

In [22]:
def pct_change_per_product(weekly_product_sales_df:\
						   pd.DataFrame) -> pd.DataFrame:
	"""
	Calculate percentage change for each product over defined date range
	"""

	# Define dictionary for storing percentage change information:
	pct_change_dict = dict()

	# Loop all unique product codes:
	for product_code in weekly_product_sales_df.index:

		# Select single product.
		temp = weekly_product_sales_df.loc[product_code]

		# Calculate the percentage chage:
		temp = temp.pct_change().dropna()

		# Update the dict:
		pct_change_dict[product_code] = temp.to_list()

	
	# Return percentage change df:
	return pct_change_dict

In [23]:
temp = df1a_no_promo[
    ['WeekNumber', 'ProductCode', 'StoreCode', 'SalesQuantity']
    ].groupby(['ProductCode', 'StoreCode', 'WeekNumber']).mean()

In [24]:
temp.reset_index()

Unnamed: 0,ProductCode,StoreCode,WeekNumber,SalesQuantity
0,1,1,1,0.0000
1,1,1,2,0.5714
2,1,1,3,0.0000
3,1,1,4,0.1429
4,1,1,5,0.0000
...,...,...,...,...
2478935,317,340,27,0.0000
2478936,317,340,28,0.0000
2478937,317,340,29,0.0000
2478938,317,340,30,0.0000


In [25]:
a = pd.pivot_table(data=df1a_no_promo,
			  columns='WeekNumber',
			  index='ProductCode',
			  values='SalesQuantity',
			  aggfunc='mean').fillna(0)

In [26]:
a

WeekNumber,1,2,3,4,5,6,9,10,13,14,...,18,19,20,22,24,27,28,29,30,31
ProductCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.1493,0.1441,0.1660,0.1029,0.1647,0.1727,0.1706,0.1626,0.1143,0.1269,...,0.1975,0.1857,0.1458,0.1265,0.1214,0.1235,0.1504,0.1777,0.1542,0.1376
2,0.0860,0.1042,0.1252,0.0866,0.1277,0.1340,0.1227,0.1395,0.0983,0.1374,...,0.1218,0.1000,0.1592,0.1647,0.0731,0.1345,0.1164,0.0895,0.1168,0.1018
3,0.0765,0.0954,0.0693,0.0845,0.0937,0.0576,0.1063,0.1122,0.0693,0.0782,...,0.0836,0.0765,0.1013,0.1097,0.0735,0.0916,0.1013,0.0857,0.1122,0.0776
4,0.1022,0.1172,0.1420,0.0786,0.0882,0.1181,0.1055,0.1282,0.0975,0.0752,...,0.0908,0.1076,0.0786,0.1067,0.1029,0.0634,0.1122,0.1466,0.1004,0.1365
5,0.1199,0.1555,0.1168,0.0979,0.1660,0.1319,0.1328,0.1269,0.1122,0.1466,...,0.1643,0.1034,0.1349,0.1160,0.1134,0.1244,0.1366,0.1113,0.1294,0.0747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0076,...,0.1475,0.1122,0.1706,0.2345,0.1891,0.1744,0.1962,0.2803,0.2731,0.1894
314,0.0015,0.0034,0.0055,0.0046,0.0042,0.0076,0.0071,0.0071,0.0101,0.0071,...,0.0134,0.0126,0.0155,0.0189,0.0197,0.0097,0.0231,0.0122,0.0130,0.0159
315,0.0088,0.0092,0.0046,0.0055,0.0088,0.0055,0.0105,0.0059,0.0084,0.0088,...,0.0071,0.0105,0.0097,0.0101,0.0059,0.0071,0.0046,0.0050,0.0029,0.0100
316,0.0279,0.0265,0.0282,0.0181,0.0244,0.0193,0.0113,0.0122,0.0181,0.0143,...,0.0181,0.0164,0.0172,0.0164,0.0122,0.0210,0.0176,0.0088,0.0134,0.0106


In [27]:
df1a.query()

Unnamed: 0,StoreCode,ProductCode,Date,SalesQuantity,WeekNumber,Promo1_yes,Promo2_yes,Promo3_yes,Promo4_yes,Promo5_yes,Promo6_yes
0,8,9,2015-01-01,-1.0000,1,0,0,0,0,0,0
1,8,9,2015-01-02,0.0000,1,0,0,0,0,0,0
2,8,9,2015-01-03,0.0000,1,0,0,0,0,0,0
3,8,9,2015-01-04,0.0000,1,0,0,0,0,0,0
4,8,9,2015-01-05,0.0000,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
22849355,169,315,2015-07-27,0.0000,31,0,0,0,0,0,0
22849356,169,315,2015-07-28,0.0000,31,0,0,0,0,0,0
22849357,169,315,2015-07-29,0.0000,31,0,0,0,0,0,0
22849358,169,315,2015-07-30,0.0000,31,0,0,0,0,0,0
