# Name: Fenil Patel
## Subject: Machine Learning Programming


#### Lab2 - Data Collection and Pre-processing
 

## Step 1: Hello Data 

In [None]:
import pandas as pd

# The Windows path is written as a raw string (r prefix) so its backslashes are
# not interpreted as escape sequences; doubling each backslash (\\) in a normal
# string would be the equivalent alternative.
file_path = r"D:\Applied AI & ML\Machine Learning Porgramming\Labs\ecommerce_500_rows (1).csv"

# Read the whole CSV into a DataFrame and preview its first three rows.
df = pd.read_csv(file_path)
df.head(3)


## Step 2: Pick the Right Container

Since it enables the encapsulation of both data and functionality (such as the clean/total functions), a class is the ideal container for transaction information.

In [None]:
# Function to load transaction data from a CSV file and convert each row into a Transaction object
def load_transactions(path: str) -> "list[Transaction]":
    """Load up to 500 rows of a CSV file as ``Transaction`` objects.

    The return annotation is written as a string so this function can be
    defined before the ``Transaction`` class exists; the class only has to be
    defined by the time the function is called.

    Args:
        path: Location of the CSV file. It must provide the columns
            'Order Date', 'Customer ID', 'Product', 'Unit Price', 'Quantity'
            and 'City'. 'Coupon Code' is optional.

    Returns:
        One ``Transaction`` per row, in file order.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a price or quantity cannot be converted to a number.
    """
    # nrows stops parsing after 500 data rows instead of reading the whole
    # file and truncating afterwards.
    df = pd.read_csv(path, nrows=500)

    def coupon_or_none(row):
        # pandas represents an empty cell as NaN; store None so that
        # "no coupon" has a single, falsy representation.
        code = row.get('Coupon Code')
        return code if pd.notna(code) else None

    # Build one Transaction per DataFrame row
    return [
        Transaction(
            row['Order Date'],            # Order date, kept as read from the file
            row['Customer ID'],           # Customer ID
            row['Product'],               # Product name
            float(row['Unit Price']),     # Unit price as a float
            int(row['Quantity']),         # Quantity as an integer
            coupon_or_none(row),          # Coupon code, or None when absent/empty
            row['City'],                  # Shipping city
        )
        for _, row in df.iterrows()
    ]

# Load transactions from the specified file path. The doubled backslashes keep
# the Windows path separators literal inside a normal (non-raw) string.
# NOTE(review): load_transactions() instantiates Transaction, which is defined
# in the Step 3 cell further down — that cell has to be run before this call.
transactions = load_transactions("D:\\Applied AI & ML\\Machine Learning Porgramming\\Labs\\ecommerce_500_rows (1).csv")


# Step 3: Transaction Class (OOP Data Structure)

In [None]:
from dataclasses import dataclass
from typing import Optional
from datetime import datetime

# This class represents a single e-commerce transaction with fields for all important info.
@dataclass
class Transaction:
    """One e-commerce order line, with helpers to clean and total it."""

    date: str               # The date when the transaction occurred
    customer_id: str        # Unique ID of the customer
    product: str            # Name of the product purchased
    price: float            # Price of a single unit of the product
    quantity: int           # Number of units purchased
    coupon_code: Optional[str]  # Optional discount code applied to the order
    shipping_city: str      # City where the order is being shipped

    def clean(self) -> None:
        """Normalize ``price`` in place to a non-negative number.

        - Numeric text such as "12.5" is converted to a float.
        - Missing or unusable values (None, NaN, "N/A" or any other
          non-numeric text) become 0.0.
        - A negative price is assumed to be a sign error and is flipped to
          its absolute value.
        """
        price = self.price

        if price is None:
            price = 0.0
        elif isinstance(price, str):
            try:
                price = float(price.strip())
            except ValueError:
                # Covers "N/A" and any other text that is not a number.
                price = 0.0

        # NaN is the only value that is not equal to itself; treat it as missing.
        if price != price:
            price = 0.0

        self.price = abs(price)

    def total(self) -> float:
        """Return the order amount: unit price × quantity."""
        return self.price * self.quantity


# Step 4: Bulk load

In [None]:
def load_transactions(path: str) -> "list[Transaction]":
    """Read up to 500 rows from a CSV file and turn each into a ``Transaction``.

    Args:
        path: Location of the CSV file. It must provide the columns
            'Order Date', 'Customer ID', 'Product', 'Unit Price', 'Quantity'
            and 'City'. 'Coupon Code' is optional.

    Returns:
        One ``Transaction`` per row, in file order.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a price or quantity cannot be converted to a number.
    """
    # nrows limits parsing to the first 500 data rows rather than loading the
    # whole file and discarding the rest.
    df = pd.read_csv(path, nrows=500)

    # Convert each row into a Transaction:
    transactions = []
    for _, row in df.iterrows():
        # An empty coupon cell is read as NaN; normalize it to None so a
        # missing coupon is falsy and has one representation.
        coupon = row.get('Coupon Code')
        if not pd.notna(coupon):
            coupon = None

        tx = Transaction(
            row['Order Date'],              # when the order was placed
            row['Customer ID'],             # who placed the order
            row['Product'],                 # what they bought
            float(row['Unit Price']),       # cost per item
            int(row['Quantity']),           # number of items
            coupon,                         # any coupon they used (or None)
            row['City'],                    # shipping destination
        )
        transactions.append(tx)

    # Return the full list of Transaction objects
    return transactions

# Build the working list of Transaction objects from the lab CSV. The raw
# string (r prefix) keeps the Windows backslashes literal.
transactions = load_transactions(
    r"D:\Applied AI & ML\Machine Learning Porgramming\Labs\ecommerce_500_rows (1).csv"
)


# Step 5: Quick profiling 

In [None]:
# Rebuild the transactions list from the raw DataFrame `df` (this always
# replaces any existing list). Price and quantity are coerced to float/int the
# same way load_transactions() does, and an empty coupon cell (NaN) is stored
# as None, so the statistics below always operate on plain numbers.
transactions = [
    Transaction(
        date=row['Order Date'],
        customer_id=row['Customer ID'],
        product=row['Product'],
        price=float(row['Unit Price']),
        quantity=int(row['Quantity']),
        coupon_code=row['Coupon Code'] if pd.notna(row['Coupon Code']) else None,
        shipping_city=row['City'],
    )
    for _, row in df.iterrows()
]

# Extract prices and the distinct shipping cities
prices = [t.price for t in transactions]
cities = {t.shipping_city for t in transactions}

# Calculate basic stats. With no rows there is nothing to summarize, so report
# NaN instead of failing in min()/max() or dividing by zero.
if prices:
    min_price = min(prices)
    mean_price = sum(prices) / len(prices)
    max_price = max(prices)
else:
    min_price = mean_price = max_price = float("nan")
unique_city_count = len(cities)

# Print output nicely
print("Price Summary")
print("-" * 30)
print(f"Minimum Price  : ${min_price:,.2f}")
print(f"Average Price  : ${mean_price:,.2f}")
print(f"Maximum Price  : ${max_price:,.2f}")
print()
print(" Shipping Cities")
print("-" * 30)
print(f"Number of Unique Cities: {unique_city_count}")


## Step 6: Spot the Grime


In [None]:
# Deliberately plant bad values in three records so the cleaning step that
# follows has something to detect. The indices require at least 31 transactions.
transactions[10].price = -99.99       # negative price
transactions[20].coupon_code = "N/A"  # placeholder text instead of a coupon code
transactions[30].price = "N/A"        # text where a numeric price is expected

## Step 7: Cleaning 


In [None]:
# Collect the records whose price is still text or is negative.
dirty_before = [
    tx
    for tx in transactions
    if isinstance(tx.price, str) or tx.price < 0
]
print("Dirty before:", len(dirty_before))

# Let every transaction repair its own price.
for t in transactions:
    t.clean()

# Apply the identical check again to see how many bad records remain.
dirty_after = [
    tx
    for tx in transactions
    if isinstance(tx.price, str) or tx.price < 0
]
print("Dirty after:", len(dirty_after))

## Step 8: Transformations


In [None]:
def parse_coupon(code):
    """Return the number that follows "SAVE" in a coupon code, else 0.

    Args:
        code: Coupon value as stored on a transaction. It may be a string,
            None, NaN or any other placeholder.

    Returns:
        The integer after the "SAVE" prefix (e.g. "SAVE10" -> 10), or 0 when
        the code is empty, does not start with "SAVE", or has no valid number
        after the prefix.
    """
    if not code:
        return 0

    # Work on the text form throughout, so a non-string value is checked and
    # sliced consistently.
    text = str(code)
    if not text.startswith("SAVE"):
        return 0

    try:
        # Extract the number after "SAVE" and convert it to an integer
        return int(text[4:])
    except ValueError:
        # "SAVE" alone, or a non-numeric suffix such as "SAVEBIG": no discount.
        return 0

# Go through each transaction and store the parsed coupon value on it as a
# `discount` attribute. This attribute is not one of the declared Transaction
# fields; it is attached to each instance here.
for t in transactions:
    t.discount = parse_coupon(t.coupon_code)


## Step 9: Feature Engineering


In [None]:
from datetime import datetime

# Capture the reference time once so every transaction is measured against the
# same instant; calling datetime.now() per row could straddle midnight and give
# otherwise identical dates different day counts.
reference_time = datetime.now()

# Loop through each transaction object in the list
for t in transactions:
    # Parse the "MM/DD/YYYY" date string, then count the whole days elapsed
    # between the purchase and the reference time.
    purchased_at = datetime.strptime(t.date, "%m/%d/%Y")
    t.days_since_purchase = (reference_time - purchased_at).days


## Step 10: Mini-Aggregation

In [None]:
# Add a per-row 'Revenue' column (unit price times quantity) to df, then total
# that column for each city and turn the result back into a flat DataFrame.
df['Revenue'] = df['Unit Price'].mul(df['Quantity'])
city_revenue = (
    df.groupby('City')['Revenue']
    .sum()
    .reset_index()
)

print(city_revenue)


## Step 11: Serialization Checkpoint


In [None]:
import pandas as pd
import json
import pyarrow as pa
import pyarrow.parquet as pq

# Represent every Transaction by the dictionary of its instance attributes
# (vars(obj) returns obj.__dict__, so attributes set after construction are
# included as well).
json_data = [vars(tx) for tx in transactions]

# Write the records to cleaned_data.json, indented by two spaces for readability
with open("cleaned_data.json", "w") as f:
    json.dump(json_data, f, indent=2)

# Build a DataFrame from the same records. This rebinds the name `df`.
df = pd.DataFrame(json_data)

# Convert the DataFrame to an Apache Arrow table and store it as a Parquet file
table = pa.Table.from_pandas(df)
pq.write_table(table, "cleaned_data.parquet")
# NOTE: reading the Parquet file back into a DataFrame is not done in this cell.

## Step 12: Soft Interview Reflection
OOP helped encapsulate logic cleanly.

Object-oriented programming made it simpler to keep data and functionality together. Enclosing behavior such as `.clean()` and `.total()` inside `Transaction` made the code easier to read and to extend.

# Data-Dictionary Section

In [None]:
import pandas as pd

# Load your datasets
# Replace this with the actual file path to your primary CSV file.
# Both paths below are raw strings so the Windows backslashes stay literal.
file_path = r"D:\Applied AI & ML\Machine Learning Porgramming\Labs\ecommerce_500_rows (1).csv"

# Load the transaction data (primary)
primary_df = pd.read_csv(file_path)

# Load the metadata we created earlier (secondary)
# NOTE(review): neither primary_df nor secondary_df is referenced again in the
# visible part of this cell; the data dictionary below is written by hand.
secondary_df = pd.read_csv(r"D:\Applied AI & ML\Machine Learning Porgramming\Labs\secondary_metadata.csv")

# Hand-written data dictionary: one record per column, giving its name, type,
# a short description and the dataset it comes from. The columns are grouped
# by source and expanded into one dict per column.
data_dictionary = [
    {"Field": field, "Type": dtype, "Description": description, "Source": source}
    for source, columns in (
        (
            "Primary CSV",
            (
                ("Order Date", "string", "When the order was placed"),
                ("Customer ID", "string", "A unique ID for the customer"),
                ("Product", "string", "The product that was purchased"),
                ("Unit Price", "float", "Price of a single unit"),
                ("Quantity", "int", "Number of units bought"),
                ("Coupon Code", "string", "Any discount coupon applied"),
                ("City", "string", "City where the item was shipped"),
            ),
        ),
        (
            "Secondary Metadata",
            (
                ("Product_ID", "string", "Internal product ID"),
                ("Category", "string", "The category the product belongs to"),
                ("Brand", "string", "Product's brand name"),
                ("Province", "string", "Province of the shipping city"),
                ("Country", "string", "Country of the shipping city"),
                ("Population", "int", "Population of the shipping city"),
                ("Coupon_Description", "string", "Explanation of the coupon offer"),
                ("Discount_Percent", "int", "Percentage of discount offered"),
                ("Coupon_Expiry", "string", "Expiry date of the coupon"),
            ),
        ),
    )
    for field, dtype, description in columns
]

# Put the records in a DataFrame so the dictionary is easy to view or export
dictionary_df = pd.DataFrame(data_dictionary)

# Display the data dictionary as the cell's output
dictionary_df
