In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# Task1: Load the raw CSV with every column typed as text, so numbers and dates
# are only converted by the cleaning UDFs and never guessed at by the parser.
df = pd.read_csv(filepath_or_buffer="raw_sales.csv", dtype=str)

In [3]:
# Preview the raw frame exactly as loaded, before any cleaning is applied
df

Unnamed: 0,order_id,product_id,quantity,price_per_unit,order_date
0,1,101,2.0,20.0,2025/06/01
1,2,102,-1.0,15.5,2025-06-01
2,3,103,1.0,35.0,2025-06-01
3,4,104,3.0,20.0,2025-06-02
4,5,105,,99.0,06-03-2025
5,6,106,2.0,25.99,2025-06-03


In [4]:
# Check that every column was loaded as a string (shown below as dtype "object")
df.dtypes

Unnamed: 0,0
order_id,object
product_id,object
quantity,object
price_per_unit,object
order_date,object


In [5]:
# Task2: UDFs for cleaning
def clean_int(value):
    """Convert a raw cell to a non-negative integer, falling back to 0.

    Args:
        value: Raw cell content, typically text such as "2" or "2.0"; may
            also be a number, None, or NaN.

    Returns:
        int: The parsed value with any fractional part truncated toward
        zero. Negative, NaN, infinite, missing, and unparseable inputs all
        yield 0.
    """
    try:
        try:
            # Parse integer text directly: a detour through float would
            # silently round integers larger than 2**53 (e.g. long IDs).
            parsed = int(value)
        except ValueError:
            # Not plain integer text (e.g. "2.0" or "1e3"): parse as a float
            # and truncate.
            parsed = int(float(value))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or an unsupported type; ValueError: junk text or
        # NaN; OverflowError: infinity. Named explicitly so that
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 0
    return max(parsed, 0)

def clean_float(value):
    """Convert a raw cell to a float, falling back to 0.0.

    Args:
        value: Raw cell content, typically text such as "15.5"; may also be
            a number, None, or NaN.

    Returns:
        float: The parsed number, or 0.0 when the input is missing
        (None/NaN) or cannot be parsed.
    """
    try:
        parsed = float(value)
    except (TypeError, ValueError, OverflowError):
        # Named explicitly (instead of a bare except) so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        return 0.0
    # float(NaN) succeeds without raising, so a missing cell would otherwise
    # slip through as NaN and propagate into derived columns; give it the
    # same 0.0 default as any other invalid input.
    return 0.0 if pd.isna(parsed) else parsed

def clean_date(value):
    """Parse a single raw date cell with pandas, yielding NaT when it cannot be read.

    Each value is parsed on its own, so one column may mix layouts such as
    "2025/06/01", "2025-06-01" and "06-03-2025".

    Args:
        value: Raw cell content, typically a date string.

    Returns:
        The result of ``pd.to_datetime`` for the value: a Timestamp for a
        parseable date string, ``pd.NaT`` for an unparseable one. Ambiguous
        numeric dates are read month-first (``dayfirst=False``), so
        "06-03-2025" means 3 June 2025.
    """
    try:
        return pd.to_datetime(value, errors="coerce", dayfirst=False)
    except Exception:
        # errors="coerce" already maps unparseable values to NaT; this guard
        # only covers inputs pandas rejects outright. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit
        # propagate.
        return pd.NaT

In [6]:
# Clean each raw column in place by running it through its matching UDF
for column_name, cleaner in (
    ("order_id", clean_int),
    ("product_id", clean_int),
    ("quantity", clean_int),
    ("price_per_unit", clean_float),
    ("order_date", clean_date),
):
    df[column_name] = df[column_name].apply(cleaner)

In [7]:
#Task3: UDF that derives total_price for a single row
def compute_total_price(row):
    """Return quantity multiplied by price_per_unit for one row.

    ``row`` is any object indexable by the keys "quantity" and
    "price_per_unit" (for example a DataFrame row).
    """
    units = row["quantity"]
    unit_price = row["price_per_unit"]
    return units * unit_price

In [8]:
#Task4: Compute total_price row by row with the UDF, then attach it as a new column
row_totals = df.apply(compute_total_price, axis=1)
df["total_price"] = row_totals

In [9]:
# Checking the cleaned and transformed data
df

Unnamed: 0,order_id,product_id,quantity,price_per_unit,order_date,total_price
0,1,101,2,20.0,2025-06-01,40.0
1,2,102,0,15.5,2025-06-01,0.0
2,3,103,1,35.0,2025-06-01,35.0
3,4,104,3,20.0,2025-06-02,60.0
4,5,105,0,99.0,2025-06-03,0.0
5,6,106,2,25.99,2025-06-03,51.98


In [10]:
# Confirm the dtypes after cleaning: integer IDs and quantity, float prices, datetime order_date
df.dtypes

Unnamed: 0,0
order_id,int64
product_id,int64
quantity,int64
price_per_unit,float64
order_date,datetime64[ns]
total_price,float64


In [11]:
#Task5: Write the cleaned frame to cleaned_sales.csv, leaving the row index out of the file
df.to_csv(path_or_buf="cleaned_sales.csv", index=False)

In [12]:
# Read the saved file back in to inspect what was actually written
check_df = pd.read_csv(filepath_or_buffer="cleaned_sales.csv")
check_df

Unnamed: 0,order_id,product_id,quantity,price_per_unit,order_date,total_price
0,1,101,2,20.0,2025-06-01,40.0
1,2,102,0,15.5,2025-06-01,0.0
2,3,103,1,35.0,2025-06-01,35.0
3,4,104,3,20.0,2025-06-02,60.0
4,5,105,0,99.0,2025-06-03,0.0
5,6,106,2,25.99,2025-06-03,51.98
