# MOQ Price Analysis for Wholesale B2B Businesses

## importing libraries

In [82]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

## extracting the dataset

In [83]:
# Read the raw transaction export, then list the dtype pandas inferred for each column.
df = pd.read_csv(filepath_or_buffer='online_retail_II.csv')
print(df.dtypes)

Invoice         object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
Price          float64
Customer ID    float64
Country         object
dtype: object


## analysing data quality

In [84]:
# finding missing data
# Count the nulls per column once and reuse the result for both report columns.
null_counts = df.isnull().sum()
row_total = len(df)

missing_data = pd.DataFrame({
    'columns': df.columns,
    'null count': null_counts,
    'null percentage': (null_counts / row_total * 100).round(2),
})

print(missing_data)

                 columns  null count  null percentage
Invoice          Invoice           0             0.00
StockCode      StockCode           0             0.00
Description  Description        4382             0.41
Quantity        Quantity           0             0.00
InvoiceDate  InvoiceDate           0             0.00
Price              Price           0             0.00
Customer ID  Customer ID      243007            22.77
Country          Country           0             0.00


In [85]:
# finding unique data
# DataFrame.nunique() returns the number of distinct non-null values per column,
# in column order, so one pass replaces the per-column lookups.
for col, distinct in df.nunique().items():
    print(f'{col}: {distinct}')

Invoice: 53628
StockCode: 5305
Description: 5698
Quantity: 1057
InvoiceDate: 47635
Price: 2807
Customer ID: 5942
Country: 43


In [86]:
# Preview the first ten rows of the raw data (rendered as the cell output).
df.head(n=10)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
5,489434,22064,PINK DOUGHNUT TRINKET POT,24,2009-12-01 07:45:00,1.65,13085.0,United Kingdom
6,489434,21871,SAVE THE PLANET MUG,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
7,489434,21523,FANCY FONT HOME SWEET HOME DOORMAT,10,2009-12-01 07:45:00,5.95,13085.0,United Kingdom
8,489435,22350,CAT BOWL,12,2009-12-01 07:46:00,2.55,13085.0,United Kingdom
9,489435,22349,"DOG BOWL , CHASING BALL DESIGN",12,2009-12-01 07:46:00,3.75,13085.0,United Kingdom


In [87]:
# descriptive statistics from the dataset
# Summary statistics for the numeric columns of the raw data; the recorded output
# below covers Quantity, Price and Customer ID and shows negative minima for
# Quantity and Price.
df.describe()

Unnamed: 0,Quantity,Price,Customer ID
count,1067371.0,1067371.0,824364.0
mean,9.938898,4.649388,15324.638504
std,172.7058,123.5531,1697.46445
min,-80995.0,-53594.36,12346.0
25%,1.0,1.25,13975.0
50%,3.0,2.1,15255.0
75%,10.0,4.15,16797.0
max,80995.0,38970.0,18287.0


## identifying and documenting problems

In [88]:
# negative numbers in quantity
# Build the boolean mask first, then preview the first ten matching rows.
is_negative = df['Quantity'] < 0
negative_qty = df[is_negative]
print(negative_qty.head(10))

     Invoice StockCode                        Description  Quantity  \
178  C489449     22087           PAPER BUNTING WHITE LACE       -12   
179  C489449    85206A       CREAM FELT EASTER EGG BASKET        -6   
180  C489449     21895      POTTING SHED SOW 'N' GROW SET        -4   
181  C489449     21896                 POTTING SHED TWINE        -6   
182  C489449     22083         PAPER CHAIN KIT RETRO SPOT       -12   
183  C489449     21871                SAVE THE PLANET MUG       -12   
184  C489449     84946    ANTIQUE SILVER TEA GLASS ETCHED       -12   
185  C489449    84970S  HANGING HEART ZINC T-LIGHT HOLDER       -24   
186  C489449     22090          PAPER BUNTING RETRO SPOTS       -12   
196  C489459    90200A         PURPLE SWEETHEART BRACELET        -3   

             InvoiceDate  Price  Customer ID         Country  
178  2009-12-01 10:33:00   2.95      16321.0       Australia  
179  2009-12-01 10:33:00   1.65      16321.0       Australia  
180  2009-12-01 10:33:00   4.

In [89]:
# zero or negative prices in price
# Rows whose unit price is not strictly positive; preview the first ten.
not_positive_price = df['Price'] <= 0
zero_price = df[not_positive_price]
print(zero_price.head(10))

     Invoice StockCode   Description  Quantity          InvoiceDate  Price  \
263   489464     21733  85123a mixed       -96  2009-12-01 10:52:00    0.0   
283   489463     71477         short      -240  2009-12-01 10:52:00    0.0   
284   489467    85123A   21733 mixed      -192  2009-12-01 10:53:00    0.0   
470   489521     21646           NaN       -50  2009-12-01 11:44:00    0.0   
3114  489655     20683           NaN       -44  2009-12-01 17:26:00    0.0   
3161  489659     21350           NaN       230  2009-12-01 17:39:00    0.0   
3162  489660     35956          lost     -1043  2009-12-01 17:43:00    0.0   
3168  489663    35605A       damages      -117  2009-12-01 18:02:00    0.0   
3731  489781     84292           NaN        17  2009-12-02 11:45:00    0.0   
4296  489806     18010           NaN      -770  2009-12-02 12:42:00    0.0   

      Customer ID         Country  
263           NaN  United Kingdom  
283           NaN  United Kingdom  
284           NaN  United Kingdom

In [90]:
# special codes (non-products)
# Select stock codes that consist only of capital letters; a missing code counts
# as a non-match (na=False). Then show the ten most frequent of them.
letters_only = df['StockCode'].str.contains('^[A-Z]+$', na=False, regex=True)
special_codes = df[letters_only]
code_counts = special_codes['StockCode'].value_counts()
print(code_counts.head(10))

StockCode
POST         2122
DOT          1446
M            1421
D             177
S             104
ADJUST         67
AMAZONFEE      43
DCGSSGIRL      25
DCGSSBOY       23
PADS           19
Name: count, dtype: int64


In [91]:
# missing customer id
# Rows with a null 'Customer ID'; preview the first ten.
customer_missing = df['Customer ID'].isnull()
no_customer = df[customer_missing]
print(no_customer.head(10))

     Invoice StockCode                   Description  Quantity  \
263   489464     21733                  85123a mixed       -96   
283   489463     71477                         short      -240   
284   489467    85123A                   21733 mixed      -192   
470   489521     21646                           NaN       -50   
577   489525    85226C     BLUE PULL BACK RACING CAR         1   
578   489525     85227   SET/6 3D KIT CARDS FOR KIDS         1   
1055  489548     22271          FELTCRAFT DOLL ROSIE         1   
1056  489548     22254          FELT TOADSTOOL LARGE        12   
1057  489548     22273          FELTCRAFT DOLL MOLLY         3   
1058  489548     22195  LARGE HEART MEASURING SPOONS         1   

              InvoiceDate  Price  Customer ID         Country  
263   2009-12-01 10:52:00   0.00          NaN  United Kingdom  
283   2009-12-01 10:52:00   0.00          NaN  United Kingdom  
284   2009-12-01 10:53:00   0.00          NaN  United Kingdom  
470   2009-12-01 

In [92]:
# missing descriptions
# Rows with a null 'Description'; preview the first ten.
description_missing = df['Description'].isnull()
no_desc = df[description_missing]
print(no_desc.head(10))

     Invoice StockCode Description  Quantity          InvoiceDate  Price  \
470   489521     21646         NaN       -50  2009-12-01 11:44:00    0.0   
3114  489655     20683         NaN       -44  2009-12-01 17:26:00    0.0   
3161  489659     21350         NaN       230  2009-12-01 17:39:00    0.0   
3731  489781     84292         NaN        17  2009-12-02 11:45:00    0.0   
4296  489806     18010         NaN      -770  2009-12-02 12:42:00    0.0   
4566  489821    85049G         NaN      -240  2009-12-02 13:25:00    0.0   
6378  489882    35751C         NaN        12  2009-12-02 16:22:00    0.0   
6555  489898    79323G         NaN       954  2009-12-03 09:40:00    0.0   
6576  489901     21098         NaN      -200  2009-12-03 09:47:00    0.0   
6581  489903     21166         NaN        48  2009-12-03 09:57:00    0.0   

      Customer ID         Country  
470           NaN  United Kingdom  
3114          NaN  United Kingdom  
3161          NaN  United Kingdom  
3731          NaN  

In [93]:
# finding outliers in quantity
# Pull the column out once; every statistic below is computed on the raw `df`,
# before any cleaning has been applied.
qty = df['Quantity']
print(f"minimun quantity: {qty.min()}")
print(f"maximun quantity: {qty.max()}")
print(f"mean quantity: {qty.mean()}")
print(f"99 percentile quantity: {qty.quantile(0.99)}")
print(f"99.9 percentile quantity: {qty.quantile(0.999)}")

minimun quantity: -80995
maximun quantity: 80995
mean quantity: 9.9388984711033
99 percentile quantity: 100.0
99.9 percentile quantity: 500.0


## cleaning the dataset

In [94]:
# Work on a separate deep copy so the raw `df` stays untouched by the cleaning steps.
df_clean = df.copy(deep=True)
print(f"number of initial records: {len(df_clean)}")

number of initial records: 1067371


In [95]:
# cleaning report
# Column-oriented log of the cleaning pipeline: one list per field, and each
# call to add_cleaning_step() appends one entry to every list.
cleaning_report = {
    field: []
    for field in ('step', 'description', 'removed count', 'remaining count')
}


def add_cleaning_step(step_name, description, removed_count, remaining_count):
    """Append one cleaning step to the module-level ``cleaning_report``.

    Args:
        step_name: Short identifier of the step (stored under 'step').
        description: Human-readable summary of what the step did.
        removed_count: Number of records the step removed.
        remaining_count: Number of records left after the step.

    Returns:
        None. ``cleaning_report`` is mutated in place.
    """
    entry = {
        'step': step_name,
        'description': description,
        'removed count': removed_count,
        'remaining count': remaining_count,
    }
    for field, value in entry.items():
        cleaning_report[field].append(value)

### step 1: removing the returns

In [96]:
# Row count before the filter, used to work out how many rows this step drops.
before_q = len(df_clean)

# Keep only the rows whose quantity is strictly positive.
has_positive_qty = df_clean['Quantity'] > 0
df_clean = df_clean[has_positive_qty]

# Row count after the filter and the number of rows dropped.
after_q = len(df_clean)
removed_q = before_q - after_q

print(f"removed: {removed_q} records", f"remaining: {after_q} records", sep="\n")

# Log this step in the cleaning report.
add_cleaning_step('step1', 'removed negative quantities', removed_q, after_q)

removed: 22950 records
remaining: 1044421 records


### step 2: removing invalid prices

In [97]:
# Row count before the filter.
before_p = len(df_clean)

# Keep only the rows with a strictly positive price (drops zero and negative prices).
has_positive_price = df_clean['Price'] > 0
df_clean = df_clean[has_positive_price]

# Row count after the filter and the number of rows dropped.
after_p = len(df_clean)
removed_p = before_p - after_p

print(f"removed: {removed_p} records", f"remaining: {after_p} records", sep="\n")

# Log this step in the cleaning report.
add_cleaning_step('step2', 'removed negative and zero prices', removed_p, after_p)

removed: 2750 records
remaining: 1041671 records


### step 3: removing non-product stock codes

In [98]:
# Row count before the filter.
before_s = len(df_clean)

# Stock codes to drop by exact match.
special_patterns = ['POST', 'D', 'DOT', 'M', 'BANK CHARGES', 'PADS', 'C2', 'CRUK', 'AMAZONFEE']

# A row is dropped when its stock code is in the list above OR is a single
# capital letter ('D' and 'M' satisfy both rules). A missing code counts as a
# non-match for the regex (na=False).
stock_codes = df_clean['StockCode']
in_special_list = stock_codes.isin(special_patterns)
is_single_letter = stock_codes.str.match('^[A-Z]$', na=False)
df_clean = df_clean[~(in_special_list | is_single_letter)]

# Row count after the filter and the number of rows dropped.
after_s = len(df_clean)
removed_s = before_s - after_s

print(f"removed: {removed_s} records", f"remaining: {after_s} records", sep="\n")

# Log this step in the cleaning report.
add_cleaning_step('step3', 'removed non-product stock codes', removed_s, after_s)

removed: 4536 records
remaining: 1037135 records


### step 4: removing missing descriptions

In [99]:
# Row count before the filter.
before_d = len(df_clean)

# Keep only the rows that have a description.
has_description = df_clean['Description'].notna()
df_clean = df_clean[has_description]

# Row count after the filter and the number of rows dropped.
after_d = len(df_clean)
removed_d = before_d - after_d

print(f"removed: {removed_d} records", f"remaining: {after_d} records", sep="\n")

# Log this step in the cleaning report.
add_cleaning_step('step4', 'removed missing descriptions', removed_d, after_d)

removed: 0 records
remaining: 1037135 records


### step 5: removing records without a customer id

In [100]:
# One mask drives both selections: the rows about to be dropped are kept in
# `no_customer` (not used again in this cell), the rest stay in `df_clean`.
customer_missing = df_clean['Customer ID'].isna()
no_customer = df_clean[customer_missing]

# Row count before the filter.
before_c = len(df_clean)

# Drop the records without a customer id for the moq strategy.
df_clean = df_clean[~customer_missing]

# Row count after the filter and the number of rows dropped.
after_c = len(df_clean)
removed_c = before_c - after_c

print(f"removed: {removed_c} records", f"remaining: {after_c} records", sep="\n")

# Log this step in the cleaning report.
add_cleaning_step('step5', 'removed records with no customer id', removed_c, after_c)

removed: 234456 records
remaining: 802679 records


### step 6: handling quantity outliers with the IQR method

In [106]:
# Quartiles and interquartile range of the cleaned quantities.
clean_qty = df_clean['Quantity']
Q1 = clean_qty.quantile(0.25)
Q3 = clean_qty.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 3 * IQR beyond the quartiles, so only extreme values fall outside.
lower_bound = Q1 - 3 * IQR
upper_bound = Q3 + 3 * IQR

# One print call, one line per statistic.
print(
    'quantity stats',
    f'Q1: {Q1}',
    f'Q3: {Q3}',
    f'IQR: {IQR}',
    f'accepted bounds are from {lower_bound} to {upper_bound}',
    sep='\n',
)

quantity stats
Q1: 2.0
Q3: 12.0
IQR: 10.0
accepted bounds are from -28.0 to 42.0


In [None]:
# identifying outliers
# The original line (`df_clean[]`) had an empty subscript, which is a SyntaxError.
# Select the rows whose quantity lies strictly outside the accepted
# [lower_bound, upper_bound] range computed in the previous cell.
outliers = df_clean[
    (df_clean['Quantity'] < lower_bound) | (df_clean['Quantity'] > upper_bound)
]