In [183]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.model_selection import train_test_split

In [184]:
df = pd.read_csv('dirty_cafe_sales.csv')

In [185]:
df.shape

(10000, 8)

In [186]:
df.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [187]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    10000 non-null  object
 1   Item              9667 non-null   object
 2   Quantity          9862 non-null   object
 3   Price Per Unit    9821 non-null   object
 4   Total Spent       9827 non-null   object
 5   Payment Method    7421 non-null   object
 6   Location          6735 non-null   object
 7   Transaction Date  9841 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB


In [188]:
df.isnull().sum()

Transaction ID         0
Item                 333
Quantity             138
Price Per Unit       179
Total Spent          173
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [189]:
df.describe()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
count,10000,9667,9862,9821.0,9827.0,7421,6735,9841
unique,10000,10,7,8.0,19.0,5,4,367
top,TXN_1961373,Juice,5,3.0,6.0,Digital Wallet,Takeaway,UNKNOWN
freq,1,1171,2013,2429.0,979.0,2291,3022,159


In [190]:
df['Transaction ID'] = df['Transaction ID'].str.strip("TXN_")

In [191]:
df.Item.unique()

array(['Coffee', 'Cake', 'Cookie', 'Salad', 'Smoothie', 'UNKNOWN',
       'Sandwich', nan, 'ERROR', 'Juice', 'Tea'], dtype=object)

In [192]:
df['Item'].isna().sum()

np.int64(333)

In [193]:
df['Item'].isnull().sum()

np.int64(333)

In [194]:
df['Item'] = df['Item'].fillna(df['Item'].mode()[0]) 

In [195]:
df['Item'].isnull().sum()

np.int64(0)

In [196]:
df.Item.unique()

array(['Coffee', 'Cake', 'Cookie', 'Salad', 'Smoothie', 'UNKNOWN',
       'Sandwich', 'Juice', 'ERROR', 'Tea'], dtype=object)

In [197]:
df.sample(5)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
4471,1445532,Coffee,3,2.0,6.0,Cash,Takeaway,2023-03-03
1093,9124680,Cookie,2,1.0,2.0,Cash,Takeaway,2023-10-03
2383,9137801,Juice,5,4.0,20.0,Credit Card,In-store,2023-06-10
6548,8152941,Coffee,3,2.0,6.0,,Takeaway,ERROR
9600,7915173,UNKNOWN,3,1.0,3.0,Cash,Takeaway,


In [198]:
df["Payment Method"].unique()

array(['Credit Card', 'Cash', 'UNKNOWN', 'Digital Wallet', 'ERROR', nan],
      dtype=object)

In [199]:
print((df['Item'] == 'ERROR').sum())
print((df['Quantity'] == 'ERROR').sum())
print((df['Price Per Unit'] == 'ERROR').sum())
print((df['Total Spent'] == 'ERROR').sum())
print((df['Payment Method'] == 'ERROR').sum())
print((df['Location'] == 'ERROR').sum())
print((df['Transaction Date'] == 'ERROR').sum())

292
170
190
164
306
358
142


In [200]:
print((df['Item'] == 'UNKNOWN').sum())
print((df['Quantity'] == 'UNKNOWN').sum())
print((df['Price Per Unit'] == 'UNKNOWN').sum())
print((df['Total Spent'] == 'UNKNOWN').sum())
print((df['Payment Method'] == 'UNKNOWN').sum())
print((df['Location'] == 'UNKNOWN').sum())
print((df['Transaction Date'] == 'UNKNOWN').sum())

344
171
164
165
293
338
159


In [201]:
df.isnull().sum()

Transaction ID         0
Item                   0
Quantity             138
Price Per Unit       179
Total Spent          173
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [202]:
df1 = df[:]

In [206]:
for i in ['Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date']:
    df1[f'{i}_Errors'] = (df1[i] == 'ERROR').astype(int)
    df1[f'{i}_UNKNOWN'] = (df1[i] == 'UNKNOWN').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[f'{i}_Errors'] = (df1[i] == 'ERROR').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[f'{i}_UNKNOWN'] = (df1[i] == 'UNKNOWN').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[f'{i}_Errors'] = (df1[i] == 'ERROR').astype(int)
A value is trying to be set on a copy of a 

In [207]:
df1

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date,Item_Errors,Item_UNKNOWN,...,Price Per Unit_Errors,Price Per Unit_UNKNOWN,Total Spent_Errors,Total Spent_UNKNOWN,Payment Method_Errors,Payment Method_UNKNOWN,Location_Errors,Location_UNKNOWN,Transaction Date_Errors,Transaction Date_UNKNOWN
0,1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19,0,0,...,0,0,1,0,0,0,0,0,0,0
3,7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27,0,0,...,0,0,0,0,0,1,0,1,0,0
4,3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,7672686,Coffee,2,2.0,4.0,,UNKNOWN,2023-08-30,0,0,...,0,0,0,0,0,0,0,1,0,0
9996,9659401,Juice,3,,3.0,Digital Wallet,,2023-06-02,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,5255387,Coffee,4,2.0,8.0,Digital Wallet,,2023-03-02,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,7695629,Cookie,3,,3.0,Digital Wallet,,2023-12-02,0,0,...,0,0,0,0,0,0,0,0,0,0
