In [10]:
import pandas as pd
import numpy as np

data = pd.read_csv("dirty_cafe_sales.csv")
data = pd.DataFrame(data)
data.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [12]:
data.describe()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
count,10000,9667,9862,9821.0,9827.0,7421,6735,9841
unique,10000,10,7,8.0,19.0,5,4,367
top,TXN_9226047,Juice,5,3.0,6.0,Digital Wallet,Takeaway,UNKNOWN
freq,1,1171,2013,2429.0,979.0,2291,3022,159


In [16]:
data['Item'].unique()

array(['Coffee', 'Cake', 'Cookie', 'Salad', 'Smoothie', 'UNKNOWN',
       'Sandwich', nan, 'ERROR', 'Juice', 'Tea'], dtype=object)

In [65]:
# Avaliando se o preço unitário aparece em todos os registros do item Coffee
price_coffee = (data['Price Per Unit'].where(data['Item'] == 'Coffee'))
price_coffee

0         2
1       NaN
2       NaN
3       NaN
4         2
       ... 
9995      2
9996    NaN
9997      2
9998    NaN
9999    NaN
Name: Price Per Unit, Length: 10000, dtype: object

In [66]:
price_coffee.count()

np.int64(1165)

In [67]:
price_coffee[:] = 2
price_coffee.unique()

array([2], dtype=object)

In [68]:
# Solução eficaz para alterar os dados com Item == Coffee (também poderia ser feito diretamente)
data.loc[data['Item'] == 'Coffee', 'Price Per Unit'] = price_coffee

In [71]:
# Confirmação de que os dados foram modificados
a = data.loc[data['Item'] == 'Coffee', 'Price Per Unit']
a.unique()

array([2], dtype=object)

In [73]:
data.dtypes

Transaction ID      object
Item                object
Quantity            object
Price Per Unit      object
Total Spent         object
Payment Method      object
Location            object
Transaction Date    object
dtype: object

In [82]:
# Valores diferentes em Quantity
data['Quantity'].unique()

array(['2', '4', '5', '3', '1', 'ERROR', 'UNKNOWN', nan], dtype=object)

In [106]:
# Convertendo para coluna numerica
quant = pd.to_numeric(data['Quantity'], errors='coerce')
price = pd.to_numeric(data['Price Per Unit'], errors='coerce')
total = pd.to_numeric(data['Total Spent'], errors='coerce')

# Verificando se as colunas são numéricas
quant.dtype, price.dtype, total.dtype

(dtype('float64'), dtype('float64'), dtype('float64'))

In [111]:
quant.unique()

array([ 2.,  4.,  5.,  3.,  1., nan])

In [115]:
df = pd.DataFrame(
    {'quant': quant,
     'price': price,
     'total': total}
)
df.head()

Unnamed: 0,quant,price,total
0,2.0,2.0,4.0
1,4.0,3.0,12.0
2,4.0,1.0,
3,2.0,5.0,10.0
4,2.0,2.0,4.0


In [125]:
quant_nan = df[df['quant'].isna()]
quant_nan['quant'].unique()

array([nan])

In [126]:
quant_nan['quant'] = quant_nan['total'] / quant_nan['price']
quant_nan.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quant_nan['quant'] = quant_nan['total'] / quant_nan['price']


Unnamed: 0,quant,price,total
20,5.0,4.0,20.0
55,2.0,1.0,2.0
57,1.0,3.0,3.0
66,2.0,3.0,6.0
117,3.0,3.0,9.0
