# Name: Fenil Patel
## Subject: Machine Learning Programming


#### Lab2 - Data Collection and Pre-processing
 

## Step 1: Hello Data 

In [26]:
import pandas as pd

# Location of the lab's e-commerce CSV export. The r-prefix keeps the Windows
# backslashes literal; doubling every backslash (\\) would work just as well.
file_path = r"D:\Applied AI & ML\Machine Learning Porgramming\Labs\ecommerce_500_rows (1).csv"

# Load the entire file into a DataFrame, then preview its first three rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=3)

Unnamed: 0,Order Date,Customer ID,Product,Unit Price,Quantity,Coupon Code,City
0,05/25/2023,CUST1000,Keyboard,533.07,4,,Calgary
1,11/21/2023,CUST1001,Monitor,1429.51,5,SAVE20,Vancouver
2,06/28/2023,CUST1002,Smartphone,1139.69,4,,Montreal


## Step 2: Pick the Right Container

Because a class encapsulates both the data and the functionality that operates on it (such as the clean/total methods), it is the ideal container for transaction information.

# Bulk load

In [27]:
def load_transactions(path: str) -> list[Transaction]:
    """Load up to 500 rows from a CSV file as ``Transaction`` objects.

    Args:
        path: Location of the CSV file. It must provide the columns
            'Order Date', 'Customer ID', 'Product', 'Unit Price',
            'Quantity' and 'City'; 'Coupon Code' is optional.

    Returns:
        One ``Transaction`` per CSV row, in file order. The coupon argument
        is ``None`` when the row has no coupon (empty cell or no column).

    Raises:
        KeyError: If a required column is missing.
        ValueError: If 'Unit Price' or 'Quantity' cannot be converted to a
            number (for example an empty quantity cell).
    """
    # nrows stops parsing after 500 data rows instead of reading the whole
    # file and discarding the remainder afterwards.
    df = pd.read_csv(path, nrows=500)

    transactions = []
    for _, row in df.iterrows():
        # pandas represents an empty CSV cell as NaN, not None; normalise it
        # so "no coupon" always reaches Transaction as None.
        coupon = row.get('Coupon Code', None)
        if pd.isna(coupon):
            coupon = None

        tx = Transaction(
            row['Order Date'],              # when the order was placed
            row['Customer ID'],             # who placed the order
            row['Product'],                 # what they bought
            float(row['Unit Price']),       # cost per item
            int(row['Quantity']),           # number of items
            coupon,                         # coupon code, or None
            row['City']                     # shipping destination
        )
        transactions.append(tx)

    return transactions

# Example usage: replace the path below with the location of your own CSV file.
transactions = load_transactions(path=r"D:\Applied AI & ML\Machine Learning Porgramming\Labs\ecommerce_500_rows (1).csv")


In [28]:


def load_transactions(path: str) -> list[Transaction]:
    """Load up to 500 CSV rows as ``Transaction`` objects (comprehension form).

    Args:
        path: Location of the CSV file. It must provide the columns
            'Order Date', 'Customer ID', 'Product', 'Unit Price',
            'Quantity' and 'City'; 'Coupon Code' is optional.

    Returns:
        One ``Transaction`` per CSV row, in file order. The coupon argument
        is ``None`` when the row has no coupon (empty cell or no column).

    Raises:
        KeyError: If a required column is missing.
        ValueError: If 'Unit Price' or 'Quantity' cannot be converted to a
            number (for example an empty quantity cell).
    """
    # nrows stops parsing after 500 data rows rather than loading everything
    # and truncating afterwards.
    df = pd.read_csv(path, nrows=500)
    transactions = [
        Transaction(
            row['Order Date'],
            row['Customer ID'],
            row['Product'],
            float(row['Unit Price']),
            int(row['Quantity']),
            # pandas yields NaN for an empty coupon cell; pass None instead so
            # "no coupon" has a single representation.
            None if pd.isna(code := row.get('Coupon Code', None)) else code,
            row['City']
        )
        for _, row in df.iterrows()
    ]
    return transactions

# Raw string: sequences such as "\A", "\M", "\L" and "\e" are invalid escapes in
# a normal string literal and trigger a SyntaxWarning on recent Python versions.
# The resulting path value is unchanged.
transactions = load_transactions(r"D:\Applied AI & ML\Machine Learning Porgramming\Labs\ecommerce_500_rows (1).csv")


# Quick profiling 

In [29]:
def _row_to_transaction(row):
    """Build a Transaction from one DataFrame row, passing values through as-is."""
    return Transaction(
        date=row['Order Date'],
        customer_id=row['Customer ID'],
        product=row['Product'],
        price=row['Unit Price'],
        quantity=row['Quantity'],
        coupon_code=row['Coupon Code'],
        shipping_city=row['City'],
    )


# Rebuild the transactions list from the already-loaded DataFrame `df`.
# This always replaces any existing `transactions` value.
transactions = [_row_to_transaction(row) for _, row in df.iterrows()]

# Pull out the two fields the summary needs: every price, and the distinct cities.
prices = [txn.price for txn in transactions]
cities = {txn.shipping_city for txn in transactions}

# Basic price statistics plus the number of distinct shipping destinations.
min_price, max_price = min(prices), max(prices)
mean_price = sum(prices) / len(prices)
unique_city_count = len(cities)

# Report: a price section followed by a city section, each under a divider line.
divider = "-" * 30

print("Price Summary")
print(divider)
print(f"Minimum Price  : ${min_price:,.2f}")
print(f"Average Price  : ${mean_price:,.2f}")
print(f"Maximum Price  : ${max_price:,.2f}")
print()
print(" Shipping Cities")
print(divider)
print(f"Number of Unique Cities: {unique_city_count}")


Price Summary
------------------------------
Minimum Price  : $50.03
Average Price  : $794.01
Maximum Price  : $1,497.55

 Shipping Cities
------------------------------
Number of Unique Cities: 5


## Step 6: Spot the Grime


In [30]:
# Deliberately corrupt three records so the cleaning step has something to fix:
# a negative price, a placeholder coupon code, and a non-numeric price.
injected_grime = [
    (10, "price", -99.99),
    (20, "coupon_code", "N/A"),
    (30, "price", "N/A"),
]
for position, field, bad_value in injected_grime:
    setattr(transactions[position], field, bad_value)

## Step 7: Cleaning Rules


In [31]:
def _has_dirty_price(txn):
    """Return True when the price is a string or a negative number."""
    price = txn.price
    return isinstance(price, str) or price < 0


# Count the bad prices before cleaning.
dirty_before = list(filter(_has_dirty_price, transactions))
print("Dirty before:", len(dirty_before))

# Run each transaction's own clean() method (defined on Transaction, not shown here).
for txn in transactions:
    txn.clean()

# Re-run the same check to confirm nothing dirty remains.
dirty_after = list(filter(_has_dirty_price, transactions))
print("Dirty after:", len(dirty_after))

Dirty before: 2
Dirty after: 0


## Step 8: Transformations


In [32]:
def parse_coupon(code):
    """Return the number encoded in a ``SAVE<n>`` coupon code, or 0.

    Args:
        code: Coupon value; may be a string such as "SAVE20", ``None``, an
            empty string, NaN, or any other placeholder.

    Returns:
        The integer that follows the "SAVE" prefix (e.g. 20 for "SAVE20").
        Returns 0 when the value is falsy, does not start with "SAVE", or
        has no valid integer after the prefix.
    """
    if not code:
        return 0

    # Work on the string form throughout so non-string values are handled
    # the same way by the prefix check and by the slice.
    text = str(code)
    if not text.startswith("SAVE"):
        return 0

    try:
        return int(text[4:])
    except ValueError:
        # Bare "SAVE" or a non-numeric suffix such as "SAVEBIG": treat it as
        # no discount instead of aborting the whole transformation loop.
        return 0

# Attach the numeric discount parsed from each transaction's coupon code.
for txn in transactions:
    coupon = txn.coupon_code
    txn.discount = parse_coupon(coupon)

## Step 9: Feature Engineering


In [33]:
from datetime import datetime

# Capture "now" once so every transaction is measured against the same instant;
# calling datetime.now() per row could straddle midnight and give inconsistent
# day counts within a single run.
reference_time = datetime.now()

for t in transactions:
    # t.date is parsed as an MM/DD/YYYY string; .days keeps whole days only.
    t.days_since_purchase = (
        reference_time - datetime.strptime(t.date, "%m/%d/%Y")
    ).days

## Step 10: Mini-Aggregation

In [34]:
# Add a per-row revenue column to `df` (unit price times quantity) ...
row_revenue = df['Unit Price'] * df['Quantity']
df['Revenue'] = row_revenue

# ... then total it per city, keeping 'City' as an ordinary column.
revenue_by_city = df.groupby('City')['Revenue'].sum()
city_revenue = revenue_by_city.reset_index()

print(city_revenue)


        City    Revenue
0    Calgary  222488.67
1   Montreal  233576.12
2     Ottawa  221019.08
3    Toronto  275478.61
4  Vancouver  226960.33


## Step 11: Serialization Checkpoint


In [35]:
import pandas as pd
import json
import pyarrow as pa
import pyarrow.parquet as pq

# Snapshot each Transaction as its attribute dictionary.
json_data = [t.__dict__ for t in transactions]

# Checkpoint 1: JSON, indented by two spaces so it stays human-readable.
with open("cleaned_data.json", "w") as out_file:
    json.dump(obj=json_data, fp=out_file, indent=2)

# Checkpoint 2: Parquet. Go through a DataFrame and an Apache Arrow table,
# then write the table to disk in Parquet format.
df = pd.DataFrame(data=json_data)
table = pa.Table.from_pandas(df)
pq.write_table(table, where="cleaned_data.parquet")
# NOTE: the Parquet file is only written here; it is not read back in this cell.

## Step 12: Soft Interview Reflection
OOP helped encapsulate logic cleanly.

Object-oriented programming made it simpler to combine data and functionality. Enclosing behavior such as `.clean()` and `.total()` inside `Transaction` made the code more scalable and easier to work with.