In [1]:
import pandas as pd
import numpy as np

data = pd.read_pickle('../data/data_step2.pkl')
data.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2.0,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [6]:
data.loc[(data['Item'] == 'UNKNOWN') | (data['Item'] == 'ERROR'), 'Item'].count()

np.int64(31)

In [3]:
data.groupby('Item')['Price Per Unit'].value_counts().unstack().fillna(0) 

Price Per Unit,1.0,1.5,2.0,3.0,4.0,5.0
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cake,0.0,0.0,0.0,1185.0,0.0,0.0
Coffee,0.0,0.0,1284.0,0.0,0.0,0.0
Cookie,1209.0,0.0,0.0,0.0,0.0,0.0
Juice,0.0,0.0,0.0,1359.0,0.0,0.0
Salad,0.0,0.0,0.0,0.0,0.0,1270.0
Sandwich,0.0,0.0,0.0,0.0,1301.0,0.0
Smoothie,0.0,0.0,0.0,0.0,1139.0,0.0
Tea,0.0,1199.0,0.0,0.0,0.0,0.0


Como apenas 31 tuplas apresentavam valores com erro na coluna Item, eles foram apagados:

In [2]:
data_cleaned = data[~data['Item'].isin(['UNKNOWN', 'ERROR'])]
data_cleaned = data_cleaned[data_cleaned['Item'].notna()]

In [3]:
data_cleaned['Item'].unique()

array(['Coffee', 'Cake', 'Cookie', 'Salad', 'Smoothie', 'Juice',
       'Sandwich', 'Tea'], dtype=object)

Optou-se por preencher os valores nulos, com UNKNOWN ou ERROR na coluna Payment Method com base no agrupamento de valores possíveis que possam se encaixar.

In [11]:
data_cleaned.loc[(data_cleaned['Payment Method'].isin(['UNKNOWN','ERROR'])) | 
                (data_cleaned['Payment Method'].isna()), 'Payment Method'].count()

np.int64(595)

In [5]:
payment_group = (
    data_cleaned[
    ~data_cleaned['Payment Method'].isin(['UNKNOWN','ERROR'])
    & data_cleaned['Payment Method'].notna()
    ]
    .groupby(['Item','Quantity','Price Per Unit','Payment Method'])
    .size() # Contagem das ocorrencias
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
payment_group.head()

Unnamed: 0,Item,Quantity,Price Per Unit,Payment Method,count
50,Juice,2.0,3.0,Digital Wallet,84
89,Sandwich,5.0,4.0,Digital Wallet,84
48,Juice,2.0,3.0,Cash,77
35,Cookie,2.0,1.0,Digital Wallet,73
64,Salad,2.0,5.0,Credit Card,72


In [19]:
# Copia dos dados 
data_copy = data_cleaned.copy()

# Mask dos dados ausentes e com erro
mask = (data_copy['Payment Method'].isin(['UNKNOWN','ERROR']) | data_copy['Payment Method'].isna())

In [20]:
# Iteração sobre o dataframe
for i,row in data_copy[mask].iterrows():
    item = row['Item']
    price = row['Price Per Unit']
    quant = row['Quantity']

    # Avaliando o match com as cobinações
    match = payment_group [
        (payment_group['Item'] == item) &
        (payment_group['Quantity'] == quant) &
        np.isclose(payment_group['Price Per Unit'], price, rtol=1e-03)
    ]

    # Preechimento com base na primeira tupla que tem a combinação
    if not match.empty:
        data_copy.at[i, 'Payment Method'] = match.iloc[0]['Payment Method']

In [21]:
data_copy.loc[(data_copy['Payment Method'].isin(['UNKNOWN', 'ERROR'])) | 
               (data_copy['Payment Method'].isna()), 'Payment Method'].count()

np.int64(0)

In [22]:
data_copy.to_csv('../data/data_cleaned.csv')