Subject: Data Preprocessing Code Submission.

Date: October 30, 2024.

Dear Professor Ilia Tetin,

I am writing on behalf of our presentation team, which includes two members:


*   LE TRAN NHA TRAN - JASMINE (Student ID: 11285100M);
*   DINH VAN LONG - BRAD (Student ID: 11285109M).

Attached below is the data preprocessing code we developed to convert the dataset extracted from Chotot.com (Stage 1 & 2) from JSONL to CSV.

In [2]:
%%capture
!pip install polars

In [None]:
import json
import polars as pl
from datetime import datetime
from types import SimpleNamespace
from tqdm import tqdm

In [None]:
# Namespace wrapper that exposes (possibly nested) dictionary keys as attributes
class NestedNamespace(SimpleNamespace):
    """Attribute-style view over a (possibly nested) dictionary.

    Every key of ``dictionary`` becomes an attribute of the instance. Values
    that are themselves dictionaries are wrapped recursively, so nested keys
    can be reached with chained attribute access (``ns.a.b.c``). Values of any
    other type -- including lists that contain dictionaries -- are stored
    unchanged.

    Args:
        dictionary: Mapping whose items become attributes of the instance.
        **kwargs: Extra attributes forwarded to ``SimpleNamespace``. They are
            set first, so a key of ``dictionary`` with the same name
            overrides them.
    """

    def __init__(self, dictionary: dict, **kwargs) -> None:
        # Let SimpleNamespace install any explicitly passed attributes first
        super().__init__(**kwargs)

        for key, value in dictionary.items():
            # Nested dictionaries become nested namespaces (recursion)
            if isinstance(value, dict):
                value = NestedNamespace(value)
            # Use the setattr() builtin rather than calling self.__setattr__
            # directly: the builtin resolves the method on the type, so a
            # data key that happens to be named "__setattr__" cannot shadow
            # it and break every assignment that follows.
            setattr(self, key, value)

This script processes the JSONL file `(DATA) 2024-10-14.jsonl`, extracting nested data attributes to create two separate CSV files (`description.csv` and `info.csv`).

In [None]:
# Convert the scraped JSONL dump into two flat CSV tables:
#   description.csv -- ad_id, list_id, subject, body
#   info.csv        -- ad_id, list_id, listing date, seller/location fields,
#                      selected `ad_params` values and the ad URL

# Attribute path from a parsed record down to the object that holds both
# `ad` and `ad_params`
AD_INFO_PATH = (
    "content",
    "props",
    "pageProps",
    "initialState",
    "adView",
    "adInfo",
)

# Attributes copied as-is from `ad` into info.csv (tuple order = column order)
AD_FIELDS = (
    "account_name",
    "phone",
    "company_ad",
    "price",
    "account_id",
    "longitude",
    "latitude",
    "full_name",
    "sold_ads",
    "total_rating",
    "total_rating_for_seller",
    "average_rating",
    "average_rating_for_seller",
    "account_oid",
    "area_name",
    "region_name",
    "number_of_images",
    "ward_name",
)

# Entries of `ad_params` whose `.value` is copied into info.csv
# (tuple order = column order)
AD_PARAM_FIELDS = (
    "address",
    "elt_condition",
    "elt_origin",
    "elt_warranty",
    "mobile_brand",
    "mobile_capacity",
    "mobile_color",
    "mobile_model",
    "usage_information",
)


def safe_get(obj, *attrs):
    """Follow a chain of attribute names starting at ``obj``.

    Args:
        obj: Object to start from (may be None).
        *attrs: Attribute names to traverse, outermost first.

    Returns:
        The value found at the end of the chain, or None as soon as any
        attribute in the chain is missing.
    """
    current = obj
    try:
        for attr in attrs:
            current = getattr(current, attr)
    except AttributeError:
        return None
    return current


def get_param_value(param_name):
    """Return ``ad_params.<param_name>.value`` for the current record.

    ``ad_params`` is looked up when the function is called, so inside the
    loop below it always refers to the record being processed.

    Args:
        param_name: Name of the entry inside ``ad_params``.

    Returns:
        The entry's ``value`` attribute, or None if the entry or its
        ``value`` is missing.
    """
    return safe_get(ad_params, param_name, "value")


info = []  # One dict per ad: detailed information (rows of info.csv)
description = []  # One dict per ad: title and body (rows of description.csv)

# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default encoding
with open("(DATA) 2024-10-14.jsonl", encoding="utf-8") as file:
    # tqdm only wraps the iteration to display a progress bar
    for line in tqdm(file, mininterval=1):
        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue

        # Parse the record and wrap it for attribute-style access
        obj = NestedNamespace(json.loads(line))

        # Locate `ad` and `ad_params`; fall back to empty namespaces so the
        # getattr(..., None) lookups below simply yield None
        ad_info = safe_get(obj, *AD_INFO_PATH)
        ad = safe_get(ad_info, "ad") or SimpleNamespace()
        ad_params = safe_get(ad_info, "ad_params") or SimpleNamespace()

        ad_id = getattr(ad, "ad_id", None)
        list_id = getattr(ad, "list_id", None)

        # `list_time` is divided by 1000 before conversion -- presumably it
        # is a Unix timestamp in milliseconds; verify against the data.
        # NOTE(review): fromtimestamp() converts to the machine's local time
        # zone, so the resulting date can differ between environments.
        list_time = getattr(ad, "list_time", None)
        if list_time is not None:
            list_date = datetime.fromtimestamp(list_time / 1000.0).strftime(
                r"%Y-%m-%d"
            )
        else:
            list_date = None

        # Row for description.csv
        description.append(
            {
                "ad_id": ad_id,
                "list_id": list_id,
                "subject": getattr(ad, "subject", None),
                "body": getattr(ad, "body", None),
            }
        )

        # Row for info.csv; insertion order defines the CSV column order
        record = {"ad_id": ad_id, "list_id": list_id, "list_time": list_date}
        for field in AD_FIELDS:
            record[field] = getattr(ad, field, None)
        for field in AD_PARAM_FIELDS:
            record[field] = get_param_value(field)
        record["url"] = getattr(obj, "url", None)
        info.append(record)

# Save the collected data into two separate CSV files.
# infer_schema_length=None makes polars inspect every row when inferring
# column types instead of only the first 100, so a column that is empty (or
# of a narrower type) in the first rows does not break the conversion.
pl.DataFrame(description, infer_schema_length=None).write_csv("description.csv")
pl.DataFrame(info, infer_schema_length=None).write_csv("info.csv")