### Importing Libraries

In [430]:
import numpy as np
import pandas as pd

### Importing Data and Analysing it 

In [431]:
cafe = pd.read_csv('C:/Users/Bibek/OneDrive/Desktop/DA Training/Cafe Sales Dirty/dirty_cafe_sales.csv')

In [432]:
cafe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    10000 non-null  object
 1   Item              9667 non-null   object
 2   Quantity          9862 non-null   object
 3   Price Per Unit    9821 non-null   object
 4   Total Spent       9827 non-null   object
 5   Payment Method    7421 non-null   object
 6   Location          6735 non-null   object
 7   Transaction Date  9841 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB


In [433]:
cafe.shape

(10000, 8)

In [434]:
cafe.columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date'],
      dtype='object')

In [435]:
cafe.isna().sum()

Transaction ID         0
Item                 333
Quantity             138
Price Per Unit       179
Total Spent          173
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [436]:
cafe.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


### Let's try and fix the item column first, by checking its relation with price per unit, to compare item and price

In [437]:
cafe.groupby(['Price Per Unit','Item']).agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Transaction ID
Price Per Unit,Item,Unnamed: 2_level_1
1.0,Cookie,1026
1.0,ERROR,34
1.0,UNKNOWN,45
1.5,ERROR,37
1.5,Tea,1023
1.5,UNKNOWN,40
2.0,Coffee,1108
2.0,ERROR,31
2.0,UNKNOWN,49
3.0,Cake,1085


In [438]:
# Let's fix the ones that are the only items for the price first

mask = (cafe['Item'].isin(['ERROR','UNKNOWN'])) & (cafe['Price Per Unit'] == '1.0')
cafe.loc[mask, 'Item'] = 'Cookie'


In [439]:
cafe.groupby(['Price Per Unit','Item']).agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Transaction ID
Price Per Unit,Item,Unnamed: 2_level_1
1.0,Cookie,1105
1.5,ERROR,37
1.5,Tea,1023
1.5,UNKNOWN,40
2.0,Coffee,1108
2.0,ERROR,31
2.0,UNKNOWN,49
3.0,Cake,1085
3.0,ERROR,77
3.0,Juice,1110


In [440]:
mask15 = (cafe['Item'].isin(['ERROR','UNKNOWN'])) & (cafe['Price Per Unit'] == '1.5')
cafe.loc[mask15, 'Item'] = 'Tea'

In [441]:
mask2 = (cafe['Item'].isin(['ERROR','UNKNOWN'])) & (cafe['Price Per Unit'] == '2.0')
cafe.loc[mask2, 'Item'] = 'Coffee'

In [442]:
mask5 = (cafe['Item'].isin(['ERROR','UNKNOWN'])) & (cafe['Price Per Unit'] == '5.0')
cafe.loc[mask5, 'Item'] = 'Salad'

In [443]:
cafe.groupby(['Price Per Unit','Item']).agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Transaction ID
Price Per Unit,Item,Unnamed: 2_level_1
1.0,Cookie,1105
1.5,Tea,1100
2.0,Coffee,1188
3.0,Cake,1085
3.0,ERROR,77
3.0,Juice,1110
3.0,UNKNOWN,77
4.0,ERROR,61
4.0,Sandwich,1082
4.0,Smoothie,1036


In [444]:
#in the case of prices 3.0 & 4.00, we can see that the no of orders per items are similar (do not differ much)
# and orders with error and unknown items are also distributed somewhat evenly. 
# So, in this case, I will replace "unknown" with one item and "Error" with another

# had the numbers differed significantly, we would have used mean, median or mode depending on the variation


In [445]:
mask3a = (cafe['Item']=="UNKNOWN") & (cafe['Price Per Unit']=='3.0')
cafe.loc[mask3a,'Item'] = 'Juice'


In [446]:
mask3b = (cafe['Item']=="ERROR") & (cafe['Price Per Unit']=='3.0')
cafe.loc[mask3b,'Item'] = 'Cake'

In [447]:
mask4a = (cafe['Item']=="UNKNOWN") & (cafe['Price Per Unit']=='4.0')
cafe.loc[mask4a,'Item'] = 'Smoothie'

mask4b = (cafe['Item']=="ERROR") & (cafe['Price Per Unit']=='4.0')
cafe.loc[mask4b,'Item'] = 'Sandwich'

In [448]:
cafe.groupby(['Price Per Unit','Item']).agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Transaction ID
Price Per Unit,Item,Unnamed: 2_level_1
1.0,Cookie,1105
1.5,Tea,1100
2.0,Coffee,1188
3.0,Cake,1162
3.0,Juice,1187
4.0,Sandwich,1143
4.0,Smoothie,1106
5.0,Salad,1166
ERROR,Cake,19
ERROR,Coffee,18


### Lets fix the price per unit of the items we know the price of


In [449]:
# create a price dictionary first

item_price = {
    'Cookie':'1.0',
    'Tea':'1.5',
    'Coffee':'2.0',
    'Cake':'3.0',
    'Juice':'3.0',
    'Sandwich':'4.0',
    'Smoothie':'4.0',
    'Salad':'5.0'
}

In [450]:
mask = cafe['Price Per Unit'].isin(['UNKNOWN', 'ERROR', pd.NA]) & ~cafe['Item'].isin(['UNKNOWN', 'ERROR'])

cafe.loc[mask, 'Price Per Unit'] = cafe.loc[mask, 'Item'].apply(lambda x: item_price.get(x))

In [451]:
cafe.groupby(['Price Per Unit','Item']).agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Transaction ID
Price Per Unit,Item,Unnamed: 2_level_1
1.0,Cookie,1147
1.5,Tea,1146
2.0,Coffee,1226
3.0,Cake,1195
3.0,Juice,1231
4.0,Sandwich,1175
4.0,Smoothie,1142
5.0,Salad,1216
ERROR,ERROR,3
ERROR,UNKNOWN,4


In [452]:
cafe.groupby(['Item','Price Per Unit']).agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Transaction ID
Item,Price Per Unit,Unnamed: 2_level_1
Cake,3.0,1195
Coffee,2.0,1226
Cookie,1.0,1147
ERROR,ERROR,3
ERROR,UNKNOWN,3
Juice,3.0,1231
Salad,5.0,1216
Sandwich,4.0,1175
Smoothie,4.0,1142
Tea,1.5,1146


### Let's check whether we can recover UNKNOWN & ERROR values through the help of other columns or not


In [453]:
cafe[(cafe['Item'].isin(['UNKNOWN', 'ERROR'])) & (cafe['Price Per Unit'].isin(['UNKNOWN', 'ERROR']))]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
750,TXN_5787508,ERROR,3,UNKNOWN,9.0,Credit Card,Takeaway,2023-07-23
1337,TXN_5031214,ERROR,5,UNKNOWN,5.0,,Takeaway,2023-07-29
1786,TXN_1923349,ERROR,4,ERROR,6.0,,,2023-07-06
2227,TXN_3200203,ERROR,2,UNKNOWN,8.0,,UNKNOWN,2023-12-04
2289,TXN_7524977,UNKNOWN,4,UNKNOWN,,ERROR,,2023-12-09
3434,TXN_6457997,UNKNOWN,1,ERROR,4.0,Credit Card,,2023-12-12
3666,TXN_8616276,UNKNOWN,2,UNKNOWN,3.0,Digital Wallet,Takeaway,2023-07-22
3779,TXN_7376255,UNKNOWN,UNKNOWN,UNKNOWN,25.0,,In-store,2023-05-27
4092,TXN_1840897,UNKNOWN,1,UNKNOWN,5.0,ERROR,,2023-06-03
4152,TXN_9646000,ERROR,2,ERROR,UNKNOWN,,In-store,2023-12-14


In [454]:
# Let's try using the Total Spent and Quantity rows (where the values are known) to replace the price per unit values

mask1 = cafe['Price Per Unit'].isin(['UNKNOWN', 'ERROR']) & ~ cafe['Quantity'].isin(['UNKNOWN', 'ERROR',pd.NA]) & ~cafe['Total Spent'].isin(['UNKNOWN', 'ERROR',pd.NA])

In [455]:
cafe[mask1]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
750,TXN_5787508,ERROR,3,UNKNOWN,9.0,Credit Card,Takeaway,2023-07-23
1337,TXN_5031214,ERROR,5,UNKNOWN,5.0,,Takeaway,2023-07-29
1786,TXN_1923349,ERROR,4,ERROR,6.0,,,2023-07-06
2227,TXN_3200203,ERROR,2,UNKNOWN,8.0,,UNKNOWN,2023-12-04
2289,TXN_7524977,UNKNOWN,4,UNKNOWN,,ERROR,,2023-12-09
3434,TXN_6457997,UNKNOWN,1,ERROR,4.0,Credit Card,,2023-12-12
3666,TXN_8616276,UNKNOWN,2,UNKNOWN,3.0,Digital Wallet,Takeaway,2023-07-22
4092,TXN_1840897,UNKNOWN,1,UNKNOWN,5.0,ERROR,,2023-06-03
5991,TXN_2913107,UNKNOWN,4,ERROR,8.0,,In-store,2023-05-20
6177,TXN_3232279,UNKNOWN,4,ERROR,16.0,UNKNOWN,Takeaway,2023-05-30


In [456]:
# let's change the quantity and total spent to float values 

#firstly, we have to replace unknown and error values by N/A

cafe = cafe.replace(['UNKNOWN', 'ERROR'], np.nan)

In [457]:
cafe['Quantity'] = pd.to_numeric(cafe['Quantity'], errors='coerce')
cafe['Total Spent'] = pd.to_numeric(cafe['Total Spent'], errors='coerce')
cafe['Price Per Unit'] = pd.to_numeric(cafe['Price Per Unit'],errors='coerce')

In [458]:
cafe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction ID    10000 non-null  object 
 1   Item              9636 non-null   object 
 2   Quantity          9521 non-null   float64
 3   Price Per Unit    9788 non-null   float64
 4   Total Spent       9498 non-null   float64
 5   Payment Method    6822 non-null   object 
 6   Location          6039 non-null   object 
 7   Transaction Date  9540 non-null   object 
dtypes: float64(3), object(5)
memory usage: 625.1+ KB


In [459]:
# now that the ERROR and UNKNOWN values are replaced, we should modify the mask1


mask1 = cafe['Price Per Unit'].isna() & ~ cafe['Quantity'].isna() & ~cafe['Total Spent'].isna()

In [460]:
cafe[mask1]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
56,TXN_3578141,Cake,5.0,,15.0,,Takeaway,2023-06-27
85,TXN_8035512,Tea,3.0,,4.5,Cash,,2023-10-29
104,TXN_7447872,Juice,2.0,,6.0,,,
118,TXN_4633784,,5.0,,15.0,,In-store,2023-02-06
151,TXN_4031509,,4.0,,16.0,Credit Card,Takeaway,2023-01-04
...,...,...,...,...,...,...,...,...
9764,TXN_1688292,,3.0,,9.0,Credit Card,In-store,
9820,TXN_8751702,,5.0,,15.0,Cash,,2023-02-13
9924,TXN_5981429,Juice,2.0,,6.0,Digital Wallet,,2023-12-24
9996,TXN_9659401,,3.0,,3.0,Digital Wallet,,2023-06-02


In [461]:
cafe.loc[mask1, 'Price Per Unit'] = (cafe.loc[mask1, 'Total Spent'] / cafe.loc[mask1, 'Quantity'])

In [462]:
cafe[mask1]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
56,TXN_3578141,Cake,5.0,3.0,15.0,,Takeaway,2023-06-27
85,TXN_8035512,Tea,3.0,1.5,4.5,Cash,,2023-10-29
104,TXN_7447872,Juice,2.0,3.0,6.0,,,
118,TXN_4633784,,5.0,3.0,15.0,,In-store,2023-02-06
151,TXN_4031509,,4.0,4.0,16.0,Credit Card,Takeaway,2023-01-04
...,...,...,...,...,...,...,...,...
9764,TXN_1688292,,3.0,3.0,9.0,Credit Card,In-store,
9820,TXN_8751702,,5.0,3.0,15.0,Cash,,2023-02-13
9924,TXN_5981429,Juice,2.0,3.0,6.0,Digital Wallet,,2023-12-24
9996,TXN_9659401,,3.0,1.0,3.0,Digital Wallet,,2023-06-02


In [463]:
cafe.groupby(['Price Per Unit','Item']).agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Transaction ID
Price Per Unit,Item,Unnamed: 2_level_1
1.0,Cookie,1170
1.5,Tea,1164
2.0,Coffee,1245
3.0,Cake,1214
3.0,Juice,1247
4.0,Sandwich,1190
4.0,Smoothie,1165
5.0,Salad,1231


In [464]:
mask2 = cafe['Item'].isin(['UNKNOWN', 'ERROR']) & ~cafe['Price Per Unit'].isna()

cafe[mask2]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [465]:
cafe.isna().sum()

Transaction ID         0
Item                 364
Quantity             479
Price Per Unit        16
Total Spent          502
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

In [466]:
# let's fix item names from new prices now

price_item = {
    '1.0':'Cookie',
    '1.5':'Tea',
    '2.0':'Coffee',
    '3.0':'Cake',
    '4.0':'Sandwich',
    '5.0':'Salad'
}

mask2 = cafe['Item'].isin(['UNKNOWN', 'ERROR']) & ~cafe['Price Per Unit'].isin(['UNKNOWN', 'ERROR',pd.NA])

cafe.loc[mask2, 'Item'] = cafe.loc[mask2, 'Price Per Unit'].apply(lambda x: price_item.get(x))

In [467]:
cafe.groupby(['Price Per Unit','Item']).agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Transaction ID
Price Per Unit,Item,Unnamed: 2_level_1
1.0,Cookie,1170
1.5,Tea,1164
2.0,Coffee,1245
3.0,Cake,1214
3.0,Juice,1247
4.0,Sandwich,1190
4.0,Smoothie,1165
5.0,Salad,1231


In [468]:
cafe.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4.0,1.0,,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2.0,5.0,10.0,,,2023-04-27
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [469]:
cafe.groupby(['Price Per Unit','Item']).agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Transaction ID
Price Per Unit,Item,Unnamed: 2_level_1
1.0,Cookie,1170
1.5,Tea,1164
2.0,Coffee,1245
3.0,Cake,1214
3.0,Juice,1247
4.0,Sandwich,1190
4.0,Smoothie,1165
5.0,Salad,1231


In [470]:
cafe['Item'].value_counts()

Item
Juice       1248
Coffee      1245
Salad       1232
Cake        1216
Sandwich    1192
Cookie      1171
Smoothie    1166
Tea         1166
Name: count, dtype: int64

In [471]:
cafe.isna().sum()

Transaction ID         0
Item                 364
Quantity             479
Price Per Unit        16
Total Spent          502
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

In [472]:
# now let's fix the null item names using the Price Per Unit column

In [473]:
cafe[cafe['Item'].isna()].groupby('Price Per Unit').agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Transaction ID
Price Per Unit,Unnamed: 1_level_1
1.0,42
1.5,41
2.0,46
3.0,93
4.0,96
5.0,40


In [474]:
price_to_item = {
    1.0:'Cookie',
    1.5:'Tea',
    2.0:'Coffee',
    5.0:'Salad'
}


In [475]:
cafe['Item'] = cafe.apply(lambda row: price_to_item.get(row['Price Per Unit'],row['Item']),axis=1)

In [476]:
# in case of items with values 3.0 and 4.0, there are 2 items, so we will divide them equally

price_to_items = {
    3.0: ['Cake', 'Juice'],
    4.0: ['Sandwich', 'Smoothie']
}

In [477]:
cafe[cafe['Item'].isna()].groupby('Price Per Unit').agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Transaction ID
Price Per Unit,Unnamed: 1_level_1
3.0,93
4.0,96


In [478]:
for price, items in price_to_items.items():
    mask = (
        (cafe['Price Per Unit'] == price) &
        (cafe['Item'].isin(['UNKNOWN', 'ERROR', pd.NA, None, np.nan]))
    )
    cafe.loc[mask, 'Item'] = np.random.choice(items, mask.sum())

In [479]:
cafe['Item'].isna().sum()

np.int64(6)

In [480]:
cafe[cafe['Item'].isna()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
1761,TXN_3611851,,4.0,,,Credit Card,,2023-02-09
2289,TXN_7524977,,4.0,,,,,2023-12-09
3779,TXN_7376255,,,,25.0,,In-store,2023-05-27
4152,TXN_9646000,,2.0,,,,In-store,2023-12-14
7597,TXN_1082717,,,,9.0,Digital Wallet,In-store,2023-12-13
9819,TXN_1208561,,,,20.0,Credit Card,,2023-08-19


In [481]:
cafe.isna().sum()

Transaction ID         0
Item                   6
Quantity             479
Price Per Unit        16
Total Spent          502
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

In [482]:
# let's fix quantity now

mask_qty = cafe['Quantity'].isna() & cafe['Price Per Unit'].notna() & cafe['Total Spent'].notna()
cafe[mask_qty]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
20,TXN_3522028,Smoothie,,4.0,20.0,Cash,In-store,2023-04-04
55,TXN_5522862,Cookie,,1.0,2.0,Credit Card,Takeaway,2023-03-19
57,TXN_2080895,Cake,,3.0,3.0,Digital Wallet,In-store,2023-04-19
66,TXN_8501819,Juice,,3.0,6.0,Cash,,2023-03-30
117,TXN_2148617,Juice,,3.0,9.0,Digital Wallet,,2023-01-10
...,...,...,...,...,...,...,...,...
9932,TXN_8502079,Tea,,1.5,3.0,Cash,,2023-04-20
9935,TXN_9778251,Tea,,1.5,6.0,,Takeaway,2023-11-09
9944,TXN_7495283,Cake,,3.0,15.0,Credit Card,Takeaway,2023-04-14
9957,TXN_6487003,Coffee,,2.0,8.0,Credit Card,Takeaway,2023-11-15


In [483]:
cafe.loc[mask_qty, 'Quantity'] = (cafe.loc[mask_qty, 'Total Spent'] / cafe.loc[mask_qty, 'Price Per Unit'])

In [484]:
cafe.isna().sum()

Transaction ID         0
Item                   6
Quantity              28
Price Per Unit        16
Total Spent          502
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

In [485]:
# let's try same approach for price per unit

mask_price = cafe['Price Per Unit'].isna() & cafe['Quantity'].notna() & cafe['Total Spent'].notna()
cafe[mask_price]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [486]:
# let's try same approach for total spent

mask_total = cafe['Total Spent'].isna() & cafe['Quantity'].notna() & cafe['Price Per Unit'].notna()
cafe[mask_total]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
2,TXN_4271903,Cookie,4.0,1.0,,Credit Card,In-store,2023-07-19
25,TXN_7958992,Smoothie,3.0,4.0,,,,2023-12-13
31,TXN_8927252,Cookie,2.0,1.0,,Credit Card,,2023-11-06
42,TXN_6650263,Tea,2.0,1.5,,,Takeaway,2023-01-10
94,TXN_6289610,Juice,3.0,3.0,,Cash,Takeaway,2023-08-07
...,...,...,...,...,...,...,...,...
9890,TXN_2749289,Smoothie,2.0,4.0,,Digital Wallet,Takeaway,2023-05-05
9954,TXN_1191659,Coffee,4.0,2.0,,Credit Card,In-store,2023-11-21
9977,TXN_5548914,Juice,2.0,3.0,,Digital Wallet,In-store,2023-11-04
9988,TXN_9594133,Cake,5.0,3.0,,,,


In [487]:
cafe.loc[mask_total, 'Total Spent'] = (cafe.loc[mask_total, 'Quantity'] * cafe.loc[mask_total, 'Price Per Unit'])

In [488]:
cafe.isna().sum()

Transaction ID         0
Item                   6
Quantity              28
Price Per Unit        16
Total Spent           28
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

In [489]:
# Now look at the rows where item, quantity, price per unit are null

cafe[cafe['Item'].isna()|cafe['Quantity'].isna()| cafe['Price Per Unit'].isna()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
65,TXN_4987129,Sandwich,3.0,,,,In-store,2023-10-20
236,TXN_8562645,Salad,,5.0,,,In-store,2023-05-18
278,TXN_3229409,Juice,,3.0,,Cash,Takeaway,2023-04-15
629,TXN_9289174,Cake,,,12.0,Digital Wallet,In-store,2023-12-30
641,TXN_2962976,Juice,,3.0,,,,2023-03-17
738,TXN_8696094,Sandwich,,4.0,,,Takeaway,2023-05-14
912,TXN_1575608,Sandwich,,,20.0,,Takeaway,2023-01-05
1008,TXN_7225428,Tea,,,3.0,Credit Card,Takeaway,2023-03-07
1482,TXN_3593060,Smoothie,,,16.0,Cash,,2023-03-05
1674,TXN_9367492,Tea,2.0,,,Cash,In-store,2023-06-19


In [490]:
# look at the data where name is known and price is n/a or vice versa

cafe[cafe['Item'].isna() & ~cafe['Price Per Unit'].isna()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [491]:
cafe[~cafe['Item'].isna() & cafe['Price Per Unit'].isna()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
65,TXN_4987129,Sandwich,3.0,,,,In-store,2023-10-20
629,TXN_9289174,Cake,,,12.0,Digital Wallet,In-store,2023-12-30
912,TXN_1575608,Sandwich,,,20.0,,Takeaway,2023-01-05
1008,TXN_7225428,Tea,,,3.0,Credit Card,Takeaway,2023-03-07
1482,TXN_3593060,Smoothie,,,16.0,Cash,,2023-03-05
1674,TXN_9367492,Tea,2.0,,,Cash,In-store,2023-06-19
3162,TXN_3577949,Cake,3.0,,,,Takeaway,2023-04-25
6225,TXN_6859249,Cookie,,,2.0,,,
7035,TXN_8872984,Salad,5.0,,,Credit Card,In-store,2023-08-23
9893,TXN_3809533,Juice,2.0,,,Digital Wallet,Takeaway,2023-02-02


In [492]:
# use the method as done earlier to add the price 

item_price = {
    'Cookie':1.0,
    'Tea':1.5,
    'Coffee':2.0,
    'Cake':3.0,
    'Juice':3.0,
    'Sandwich':4.0,
    'Smoothie':4.0,
    'Salad':5.0
}

In [493]:
cafe['Price Per Unit'] = cafe.apply(
    lambda row: item_price.get(row['Item'],row['Price Per Unit']),axis=1
)

In [494]:
cafe.isna().sum()

Transaction ID         0
Item                   6
Quantity              28
Price Per Unit         6
Total Spent           28
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

In [495]:
cafe[cafe['Item'].isna()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
1761,TXN_3611851,,4.0,,,Credit Card,,2023-02-09
2289,TXN_7524977,,4.0,,,,,2023-12-09
3779,TXN_7376255,,,,25.0,,In-store,2023-05-27
4152,TXN_9646000,,2.0,,,,In-store,2023-12-14
7597,TXN_1082717,,,,9.0,Digital Wallet,In-store,2023-12-13
9819,TXN_1208561,,,,20.0,Credit Card,,2023-08-19


In [496]:
cafe[cafe['Quantity'].isna()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
236,TXN_8562645,Salad,,5.0,,,In-store,2023-05-18
278,TXN_3229409,Juice,,3.0,,Cash,Takeaway,2023-04-15
629,TXN_9289174,Cake,,3.0,12.0,Digital Wallet,In-store,2023-12-30
641,TXN_2962976,Juice,,3.0,,,,2023-03-17
738,TXN_8696094,Sandwich,,4.0,,,Takeaway,2023-05-14
912,TXN_1575608,Sandwich,,4.0,20.0,,Takeaway,2023-01-05
1008,TXN_7225428,Tea,,1.5,3.0,Credit Card,Takeaway,2023-03-07
1482,TXN_3593060,Smoothie,,4.0,16.0,Cash,,2023-03-05
2796,TXN_9188692,Cake,,3.0,,Credit Card,,2023-12-01
3203,TXN_4565754,Smoothie,,4.0,,Digital Wallet,Takeaway,2023-10-06


In [497]:
# lets reuse the previous approach to update missing numbers

mask_total = cafe['Total Spent'].isna() & cafe['Quantity'].notna() & cafe['Price Per Unit'].notna()
cafe[mask_total]


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
65,TXN_4987129,Sandwich,3.0,4.0,,,In-store,2023-10-20
1674,TXN_9367492,Tea,2.0,1.5,,Cash,In-store,2023-06-19
3162,TXN_3577949,Cake,3.0,3.0,,,Takeaway,2023-04-25
7035,TXN_8872984,Salad,5.0,5.0,,Credit Card,In-store,2023-08-23
9893,TXN_3809533,Juice,2.0,3.0,,Digital Wallet,Takeaway,2023-02-02


In [498]:
cafe.loc[mask_total, 'Total Spent'] = (cafe.loc[mask_total, 'Quantity'] * cafe.loc[mask_total, 'Price Per Unit'])


In [499]:
mask_price = cafe['Price Per Unit'].isna() & cafe['Quantity'].notna() & cafe['Total Spent'].notna()
cafe[mask_price]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [500]:
# cafe.loc[mask_price, 'Price Per Unit'] = (cafe.loc[mask_price, 'Total Spent'] / cafe.loc[mask_price, 'Quantity'])


In [501]:
mask_qty = cafe['Quantity'].isna() & cafe['Price Per Unit'].notna() & cafe['Total Spent'].notna()
cafe[mask_qty]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
629,TXN_9289174,Cake,,3.0,12.0,Digital Wallet,In-store,2023-12-30
912,TXN_1575608,Sandwich,,4.0,20.0,,Takeaway,2023-01-05
1008,TXN_7225428,Tea,,1.5,3.0,Credit Card,Takeaway,2023-03-07
1482,TXN_3593060,Smoothie,,4.0,16.0,Cash,,2023-03-05
6225,TXN_6859249,Cookie,,1.0,2.0,,,


In [502]:
cafe.loc[mask_qty, 'Quantity'] = (cafe.loc[mask_qty, 'Total Spent'] / cafe.loc[mask_qty, 'Price Per Unit'])

In [503]:
cafe[mask_qty]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
629,TXN_9289174,Cake,4.0,3.0,12.0,Digital Wallet,In-store,2023-12-30
912,TXN_1575608,Sandwich,5.0,4.0,20.0,,Takeaway,2023-01-05
1008,TXN_7225428,Tea,2.0,1.5,3.0,Credit Card,Takeaway,2023-03-07
1482,TXN_3593060,Smoothie,4.0,4.0,16.0,Cash,,2023-03-05
6225,TXN_6859249,Cookie,2.0,1.0,2.0,,,


In [504]:
cafe.isna().sum()


Transaction ID         0
Item                   6
Quantity              23
Price Per Unit         6
Total Spent           23
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

In [505]:
# let's now use the fill out null values in quantity column by the mean value for each item

cafe['Quantity'] = cafe.groupby('Item')['Quantity'].transform(
    lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else np.nan)
)

In [506]:
cafe.isna().sum()


Transaction ID         0
Item                   6
Quantity               6
Price Per Unit         6
Total Spent           23
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

In [507]:
# fix Total spent now
mask_total = cafe['Total Spent'].isna() & cafe['Quantity'].notna() & cafe['Price Per Unit'].notna()
cafe.loc[mask_total, 'Total Spent'] = (cafe.loc[mask_total, 'Quantity'] * cafe.loc[mask_total, 'Price Per Unit'])


In [508]:
cafe.isna().sum()

Transaction ID         0
Item                   6
Quantity               6
Price Per Unit         6
Total Spent            3
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

In [509]:
cafe[cafe['Item'].isna()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
1761,TXN_3611851,,,,,Credit Card,,2023-02-09
2289,TXN_7524977,,,,,,,2023-12-09
3779,TXN_7376255,,,,25.0,,In-store,2023-05-27
4152,TXN_9646000,,,,,,In-store,2023-12-14
7597,TXN_1082717,,,,9.0,Digital Wallet,In-store,2023-12-13
9819,TXN_1208561,,,,20.0,Credit Card,,2023-08-19


In [510]:
# let's work on transaction date; we should change it to datetime format

cafe['Transaction Date'] = pd.to_datetime(cafe['Transaction Date'])

In [511]:
cafe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Transaction ID    10000 non-null  object        
 1   Item              9994 non-null   object        
 2   Quantity          9994 non-null   float64       
 3   Price Per Unit    9994 non-null   float64       
 4   Total Spent       9997 non-null   float64       
 5   Payment Method    6822 non-null   object        
 6   Location          6039 non-null   object        
 7   Transaction Date  9540 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 625.1+ KB


In [512]:
cafe['Transaction Date'].isna().sum()

np.int64(460)

In [513]:
cafe[cafe['Transaction Date'].isna()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
11,TXN_3051279,Sandwich,2.0,4.0,8.0,Credit Card,Takeaway,NaT
29,TXN_7640952,Cake,4.0,3.0,12.0,Digital Wallet,Takeaway,NaT
33,TXN_7710508,Cookie,5.0,1.0,5.0,Cash,,NaT
77,TXN_2091733,Salad,1.0,5.0,5.0,,In-store,NaT
103,TXN_7028009,Cake,4.0,3.0,12.0,,Takeaway,NaT
...,...,...,...,...,...,...,...,...
9933,TXN_9460419,Cake,1.0,3.0,3.0,,Takeaway,NaT
9937,TXN_8253472,Cake,1.0,3.0,3.0,,,NaT
9949,TXN_3130865,Juice,3.0,3.0,9.0,,In-store,NaT
9983,TXN_9226047,Smoothie,3.0,4.0,12.0,Cash,,NaT


In [514]:
cafe.head(15)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2.0,5.0,10.0,,,2023-04-27
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,2023-06-11
5,TXN_2602893,Smoothie,5.0,4.0,20.0,Credit Card,,2023-03-31
6,TXN_4433211,Juice,3.0,3.0,9.0,,Takeaway,2023-10-06
7,TXN_6699534,Sandwich,4.0,4.0,16.0,Cash,,2023-10-28
8,TXN_4717867,Juice,5.0,3.0,15.0,,Takeaway,2023-07-28
9,TXN_2064365,Sandwich,5.0,4.0,20.0,,In-store,2023-12-31


In [515]:
# let's check the date distribution

cafe['Year'] = cafe['Transaction Date'].dt.year
cafe['Month'] = cafe['Transaction Date'].dt.month_name()


In [516]:
cafe.groupby(['Year','Month']).agg({
    'Transaction ID':'count'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Transaction ID
Year,Month,Unnamed: 2_level_1
2023.0,April,774
2023.0,August,803
2023.0,December,795
2023.0,February,727
2023.0,January,818
2023.0,July,791
2023.0,June,818
2023.0,March,827
2023.0,May,777
2023.0,November,784


In [517]:
# We will fill n/a values by chosing random date in 2023 as all data are from the same year

date_range_2023 = pd.date_range(start='2023-01-01', end='2023-12-31')

mask = cafe['Transaction Date'].isna()
cafe.loc[mask, 'Transaction Date'] = np.random.choice(date_range_2023, size=mask.sum())

In [518]:
cafe = cafe.drop(columns=['Year','Month'])

In [519]:
cafe.isna().sum()

Transaction ID         0
Item                   6
Quantity               6
Price Per Unit         6
Total Spent            3
Payment Method      3178
Location            3961
Transaction Date       0
dtype: int64

In [520]:
# now let's move on to payment method

cafe['Payment Method'].value_counts()

Payment Method
Digital Wallet    2291
Credit Card       2273
Cash              2258
Name: count, dtype: int64

In [521]:
# let's divide the n/a values randomly between digital wallet and credit card

mask_payment = cafe['Payment Method'].isna()
cafe.loc[mask_payment, 'Payment Method'] = np.random.choice(['Credit Card', 'Digital Wallet'])

In [522]:
cafe['Payment Method'].value_counts()

Payment Method
Digital Wallet    5469
Credit Card       2273
Cash              2258
Name: count, dtype: int64

In [523]:
# now let's check location values

cafe['Location'].value_counts()

Location
Takeaway    3022
In-store    3017
Name: count, dtype: int64

In [524]:
# we will divide n/a values randomly in a similar way

mask_loc = cafe['Location'].isna()
cafe.loc[mask_loc, 'Location'] = np.random.choice(['Takeaway', 'In-store'])


In [525]:
cafe['Location'].value_counts()

Location
Takeaway    6983
In-store    3017
Name: count, dtype: int64

In [526]:
cafe.isna().sum()

Transaction ID      0
Item                6
Quantity            6
Price Per Unit      6
Total Spent         3
Payment Method      0
Location            0
Transaction Date    0
dtype: int64

In [527]:
cafe[cafe['Item'].isna()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
1761,TXN_3611851,,,,,Credit Card,Takeaway,2023-02-09
2289,TXN_7524977,,,,,Digital Wallet,Takeaway,2023-12-09
3779,TXN_7376255,,,,25.0,Digital Wallet,In-store,2023-05-27
4152,TXN_9646000,,,,,Digital Wallet,In-store,2023-12-14
7597,TXN_1082717,,,,9.0,Digital Wallet,In-store,2023-12-13
9819,TXN_1208561,,,,20.0,Credit Card,Takeaway,2023-08-19


In [529]:
# since we do not have any meaningful data here, we will remove these rows 

cafe = cafe.dropna(subset=['Item'])

In [530]:
cafe.isna().sum()

Transaction ID      0
Item                0
Quantity            0
Price Per Unit      0
Total Spent         0
Payment Method      0
Location            0
Transaction Date    0
dtype: int64

In [531]:
cafe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9994 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Transaction ID    9994 non-null   object        
 1   Item              9994 non-null   object        
 2   Quantity          9994 non-null   float64       
 3   Price Per Unit    9994 non-null   float64       
 4   Total Spent       9994 non-null   float64       
 5   Payment Method    9994 non-null   object        
 6   Location          9994 non-null   object        
 7   Transaction Date  9994 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 702.7+ KB


In [532]:
cafe.to_csv('C:/Users/Bibek/OneDrive/Desktop/DA Training/Cafe Sales Dirty/cleaned_cafe_sales.csv',index=False)