# Importing Modules

In [1]:
import pandas as np

# Loading Dataset

In [32]:
data = np.read_csv(r"dirty_cafe_sales.csv")

In [35]:
data.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


# Removing Trailing spaces on Columns

In [36]:
# Removing trailing spaces on column names and add an underscore for easy readability
data.columns = data.columns.str.strip().str.replace(" ", "_");

In [37]:
data.head()

Unnamed: 0,Transaction_ID,Item,Quantity,Price_Per_Unit,Total_Spent,Payment_Method,Location,Transaction_Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


# Remove extra spaces in strings fields

In [38]:
# removing spaces in string values for all columns
string_columns = data.select_dtypes(include=['object']).columns
data[string_columns] = data[string_columns].apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [39]:
data.head()

Unnamed: 0,Transaction_ID,Item,Quantity,Price_Per_Unit,Total_Spent,Payment_Method,Location,Transaction_Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


# Converting Data Fields into appropriate data types

In [40]:
# converting Quantity, Price Per Unit, and Total Spent to a int data types
int_columns = ["Quantity", "Price_Per_Unit", "Total_Spent"]

# using for loop to change all column at once  
for column in int_columns:
    data[column] = np.to_numeric(data[column], errors='coerce')

In [41]:
# converting Transaction_Date to datetime format
data['Transaction_Date'] = np.to_datetime(data['Transaction_Date'], errors='coerce')

# Replacing Invalid Placeholders

In [42]:
# replacing "Error" and "Unknown" values with NaN 
data.replace(["ERROR", "UNKNOWN"], np.NA, inplace=True)

# Finding missing values

In [43]:
# finding missing values
total_missing_value = data.isnull().sum()
print(total_missing_value)

# total null values
print(f"Total null values: {data.isnull().sum().sum()}")

Transaction_ID         0
Item                 969
Quantity             479
Price_Per_Unit       533
Total_Spent          502
Payment_Method      3178
Location            3961
Transaction_Date     460
dtype: int64
Total null values: 10082


In [44]:
# percentage of missing values
for column in data.columns:
    percentage = data[column].isnull().mean()
    if percentage > 0:
        print(f'{column}: {round(percentage * 100,2)}%')

Item: 9.69%
Quantity: 4.79%
Price_Per_Unit: 5.33%
Total_Spent: 5.02%
Payment_Method: 31.78%
Location: 39.61%
Transaction_Date: 4.6%


In [46]:
# defining a function to fill missing values
columns_to_fill = ['Item','Quantity', 'Price_Per_Unit', 'Total_Spent', 'Payment_Method', 'Location', 'Transaction_Date']

# filling missing values with 'Unknown'
data[columns_to_fill] = data[columns_to_fill].fillna('Unknown')

# checking if there are any missing values left
missing_values = data.isnull().sum()
print(missing_values)

Transaction_ID      0
Item                0
Quantity            0
Price_Per_Unit      0
Total_Spent         0
Payment_Method      0
Location            0
Transaction_Date    0
dtype: int64


In [49]:
data.head()

Unnamed: 0,Transaction_ID,Item,Quantity,Price_Per_Unit,Total_Spent,Payment_Method,Location,Transaction_Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,2023-09-08 00:00:00
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,2023-05-16 00:00:00
2,TXN_4271903,Cookie,4.0,1.0,Unknown,Credit Card,In-store,2023-07-19 00:00:00
3,TXN_7034554,Salad,2.0,5.0,10.0,Unknown,Unknown,2023-04-27 00:00:00
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,2023-06-11 00:00:00
