# PROMOTION BUMP ASSIGNMENT
date: 2024-09-19  
Analist: Botan Fırat Bulut

## IMPORTS AND SETUP

In [1]:
# Imports:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignoner harmless warnings:
warnings.filterwarnings('ignore')

# Plot rcParams:
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['figure.dpi'] = 144
plt.rcParams['figure.titlesize'] = 22
plt.rcParams['figure.titleweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['axes.titleweight'] = 'bold'
plt.style.use('ggplot')

### Data = assignment4.1a.csv
The data contains daily sales of sample of items in several stores on a  
specific time frame. Negative sale quantities represents returns. Each row  
represents a sale (or return) activity for an item in a store at a specific day.  
If a store-item combination has no observation in a certain day you can  
assume there is no sales for that item at that store at that day.

In [2]:
# Read assignment4.1a.csv:
df1a = pd.read_csv('./assignment4.1a.csv')
df1a.head()

Unnamed: 0,Date,StoreCode,ProductCode,SalesQuantity
0,2015-01-01,8,9,-1
1,2015-01-01,131,9,1
2,2015-01-01,144,9,2
3,2015-01-01,203,9,2
4,2015-01-01,256,9,0


In [3]:
# df1a information:
df1a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1873618 entries, 0 to 1873617
Data columns (total 4 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   Date           object
 1   StoreCode      int64 
 2   ProductCode    int64 
 3   SalesQuantity  int64 
dtypes: int64(3), object(1)
memory usage: 57.2+ MB


In [4]:
# df1a null values:
df1a.isnull().sum()

Date             0
StoreCode        0
ProductCode      0
SalesQuantity    0
dtype: int64

The assignment4.1a.csv contains 3 columns with 1,873,618 records. It does  
not contain any missing values.

### Data=Promotiondates.csv
The data contains beginning and the end dates of 6 promotions that took
place in 2015.  

In [5]:
df_promotion = pd.read_csv('./PromotionDates.csv')
df_promotion

Unnamed: 0,Period,StartDate,EndDate
0,Promo1,2/10/2015,2/17/2015
1,Promo2,3/15/2015,3/22/2015
2,Promo3,5/24/2015,6/1/2015
3,Promo4,6/21/2015,6/28/2015
4,Promo5,1/9/2015,6/9/2015
5,Promo6,20/11/2015,27/11/2015


The data contains beginning and the end dates of 6 promotions  
that took place in 2015.

### TASK A

Your goal is to model the effect of promotion on products and stores. At this  
stage only use the data in the file Assignment4.1a.csv and base your model  
using the first 4 promotions.

In [6]:
# convert dataframe Date column from object to datetime:
df1a['Date'] = pd.to_datetime(df1a['Date'],
							  format='%Y-%m-%d')

# Promotion data's last 2 rows contain day first datetime format.
# We fix this manually since there are only 4 wrong entries:
df_promotion.at[4, 'StartDate'] = '9/1/2015'
df_promotion.at[4, 'EndDate'] = '9/6/2015'
df_promotion.at[5, 'StartDate'] = '11/20/2015'
df_promotion.at[5, 'EndDate'] = '11/27/2015'

In [7]:
# df1a date column datetype:
df1a['Date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1873618 entries, 0 to 1873617
Series name: Date
Non-Null Count    Dtype         
--------------    -----         
1873618 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 14.3 MB


In [8]:
# Check the update promotion data:
df_promotion

Unnamed: 0,Period,StartDate,EndDate
0,Promo1,2/10/2015,2/17/2015
1,Promo2,3/15/2015,3/22/2015
2,Promo3,5/24/2015,6/1/2015
3,Promo4,6/21/2015,6/28/2015
4,Promo5,9/1/2015,9/6/2015
5,Promo6,11/20/2015,11/27/2015


In [9]:
# We create ISO calender week column for groupping:
df1a['WeekNumber'] = df1a['Date'].dt.isocalendar().week
df1a['WeekNumber'].value_counts()

WeekNumber
12    75285
10    69697
8     68661
18    68488
7     67939
11    67328
14    66865
17    66494
9     66165
15    65999
16    65900
20    65844
19    64510
5     64229
21    62328
23    61392
6     61231
13    60921
26    59211
2     59069
24    58247
3     57745
22    56793
28    56045
4     55768
25    55147
30    55010
29    54824
27    49482
31    42576
1     24425
Name: count, dtype: Int64

In [10]:
# Convert promotion data columns to datetype:
df_promotion['StartDate'] = pd.to_datetime(df_promotion['StartDate'],
										  format='%m/%d/%Y')

df_promotion['EndDate'] = pd.to_datetime(df_promotion['EndDate'],
										  format='%m/%d/%Y')

# Check the updated promotion data information:
df_promotion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Period     6 non-null      object        
 1   StartDate  6 non-null      datetime64[ns]
 2   EndDate    6 non-null      datetime64[ns]
dtypes: datetime64[ns](2), object(1)
memory usage: 272.0+ bytes


In [11]:
# Obtain week numbers of promotion start and end dates:
df_promotion['StartWeekNumber'] = df_promotion['StartDate']\
	.dt.isocalendar().week

df_promotion['EndWeekNumber'] = df_promotion['EndDate']\
	.dt.isocalendar().week

df_promotion

Unnamed: 0,Period,StartDate,EndDate,StartWeekNumber,EndWeekNumber
0,Promo1,2015-02-10,2015-02-17,7,8
1,Promo2,2015-03-15,2015-03-22,11,12
2,Promo3,2015-05-24,2015-06-01,21,23
3,Promo4,2015-06-21,2015-06-28,25,26
4,Promo5,2015-09-01,2015-09-06,36,36
5,Promo6,2015-11-20,2015-11-27,47,48


In [12]:
# Merge promotion information with sales information:
def promo_and_sales_merger(target_df: pd.DataFrame,
						   promo_df: pd.DataFrame) -> None:
	"""
	Add flags to target df using promotion information.
	"""

	target_promotions = promo_df['Period'].values

	# Loop each promotion date information:
	for promo in target_promotions:

		# Get the start and end weeks:
		start_week = promo_df\
			.query('Period == @promo')['StartWeekNumber'].values[0]
		
		end_week = promo_df\
		.query('Period == @promo')['EndWeekNumber'].values[0]

		# Obtain target weeks:
		target_weeks = [start_week, end_week]
		
		# Add relevant flag to the data :
		target_df[f'{promo}_yes'] = False
		update_index = target_df\
			.query('WeekNumber in @target_weeks').index

		# Flag promo weeks:
		for idx in update_index:
			target_df.at[idx, f'{promo}_yes'] = True
		
	return None

In [13]:
# Run function:
promo_and_sales_merger(target_df=df1a,
					  promo_df=df_promotion)

In [14]:
# Investigate StoreCode column:
df1a['StoreCode'].value_counts()

StoreCode
331    17470
276    15414
307    13846
99     13438
2      12636
       ...  
130      545
19       324
227       44
152        1
169        1
Name: count, Length: 340, dtype: int64

In [15]:
# Investigate ProductCode column:
df1a['ProductCode'].value_counts()

ProductCode
149    54060
218    36766
221    32043
168    26545
205    24817
       ...  
261        1
165        1
182        1
227        1
310        1
Name: count, Length: 317, dtype: int64

In [16]:
# Describe Date:
df1a['Date'].describe().T

count                          1873618
mean     2015-04-14 10:09:35.587126528
min                2015-01-01 00:00:00
25%                2015-02-23 00:00:00
50%                2015-04-13 00:00:00
75%                2015-06-03 00:00:00
max                2015-07-31 00:00:00
Name: Date, dtype: object

In [17]:
# Describe SalesQuantity:
pd.options.display.float_format = '{:.4f}'.format
df1a['SalesQuantity']\
	.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9]).T

count   1873618.0000
mean          2.2466
std           5.0290
min         -60.0000
10%           0.0000
25%           0.0000
50%           1.0000
75%           2.0000
90%           4.0000
max         912.0000
Name: SalesQuantity, dtype: float64

In [18]:
# Get promo column information:
promo_cols = [f'Promo{x}_yes' for x in range(1, 7)]
df1a[promo_cols].sum()

Promo1_yes    136600
Promo2_yes    142613
Promo3_yes    123720
Promo4_yes    114358
Promo5_yes         0
Promo6_yes         0
dtype: int64

In [None]:
df1a[' ']

- There are 340 unique store codes. Store 331 has  the most records with  
value of 17470.
- There are 317 unique product codes. Prdouct 149 has the most records with  
value of 54060
- Data date range is from 2015-01-01 to 2015-07-31.  
- Sales Quantity has mean value of 2.2466 with std 5.290. Most of the values  
are in the range 0 to 4. We observe a extereme value of 912.
- Assignment4.1a.csv data does not contain any values  regarding promo5 and  
promo6 date range.
- Most of the sales observed in Promo2 weeks.  
-  

Divide product and stores  into 3 clusters each. Product with higher  
average weekly sale per store during non-promotion periods will be called  
“Fast items” and items with lower weekly average sale per store will be  
labeled as “Slow items”, items in between will be called “Medium items”.   
Grouping parameters selection is left to you. Apply similar approach to  
Stores as well.

In [19]:
# Obtain promotion periods from df1a: 
promo_indices = []
for promo in promo_cols:
	promo_indices\
		.extend(df1a[df1a[promo] == True].index.tolist())

# Generate no promotion data:
df1a_no_promo = df1a.drop(index=promo_indices)

In [20]:
# Validate no promotion:
df1a_no_promo[promo_cols].sum()

Promo1_yes    0
Promo2_yes    0
Promo3_yes    0
Promo4_yes    0
Promo5_yes    0
Promo6_yes    0
dtype: int64

In [21]:
# Obtain product sales average per week (all stores):
df1a_no_promo['WeeklyProductMeanSales'] = \
	df1a_no_promo[
		['WeekNumber', 'ProductCode', 'SalesQuantity']
		].groupby(by=['WeekNumber', 'ProductCode']).transform('mean')

In [22]:
df1a_no_promo

Unnamed: 0,Date,StoreCode,ProductCode,SalesQuantity,WeekNumber,Promo1_yes,Promo2_yes,Promo3_yes,Promo4_yes,Promo5_yes,Promo6_yes,WeeklyProductMeanSales
0,2015-01-01,8,9,-1,1,False,False,False,False,False,False,0.6471
1,2015-01-01,131,9,1,1,False,False,False,False,False,False,0.6471
2,2015-01-01,144,9,2,1,False,False,False,False,False,False,0.6471
3,2015-01-01,203,9,2,1,False,False,False,False,False,False,0.6471
4,2015-01-01,256,9,0,1,False,False,False,False,False,False,0.6471
...,...,...,...,...,...,...,...,...,...,...,...,...
1873613,2015-07-30,292,315,0,31,False,False,False,False,False,False,0.4359
1873614,2015-07-31,12,315,1,31,False,False,False,False,False,False,0.4359
1873615,2015-07-31,104,315,1,31,False,False,False,False,False,False,0.4359
1873616,2015-07-31,261,315,1,31,False,False,False,False,False,False,0.4359


In [23]:
# Obtain product sales average per week (all stores):
df1a_no_promo['WeeklyProductMeanSales'] = \
	df1a_no_promo[
		['WeekNumber', 'ProductCode', 'SalesQuantity']
		].groupby(by=['WeekNumber', 'ProductCode']).transform('mean')

In [24]:
# Obtain product sales average per store (all weeks):
df1a_no_promo['ProductMeanSalesPerStore'] = \
	df1a_no_promo[
		['StoreCode', 'ProductCode', 'SalesQuantity']
		].groupby(by=['StoreCode', 'ProductCode']).transform('mean')

In [26]:
df1a_no_promo

Unnamed: 0,Date,StoreCode,ProductCode,SalesQuantity,WeekNumber,Promo1_yes,Promo2_yes,Promo3_yes,Promo4_yes,Promo5_yes,Promo6_yes,WeeklyProductMeanSales,ProductMeanSalesPerStore
0,2015-01-01,8,9,-1,1,False,False,False,False,False,False,0.6471,0.1111
1,2015-01-01,131,9,1,1,False,False,False,False,False,False,0.6471,0.1429
2,2015-01-01,144,9,2,1,False,False,False,False,False,False,0.6471,0.4286
3,2015-01-01,203,9,2,1,False,False,False,False,False,False,0.6471,0.7273
4,2015-01-01,256,9,0,1,False,False,False,False,False,False,0.6471,0.5000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1873613,2015-07-30,292,315,0,31,False,False,False,False,False,False,0.4359,0.2500
1873614,2015-07-31,12,315,1,31,False,False,False,False,False,False,0.4359,1.0000
1873615,2015-07-31,104,315,1,31,False,False,False,False,False,False,0.4359,0.5000
1873616,2015-07-31,261,315,1,31,False,False,False,False,False,False,0.4359,1.0000


In [25]:
# Pivot tables
# pct change fucntions