## TASKS
```
DESCRIPTION                             STATUS 
- Load seed data into DB.               done
- Clean and transform scraped data      done
- Create an idempotent etl job          ongoing
```

In [1]:
import pandas as pd
from pathlib import Path
import os
from datetime import datetime
import logging

In [2]:
# Setup logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


In [None]:
scraped_data_source = r"C:\Users\APIN PC\OneDrive\Documents\DS\DE_Inter\data_epic_capstone\etl\data\ai_tools_scraped.json"
seed_data_source = r"C:\Users\APIN PC\OneDrive\Documents\DS\DE_Inter\data_epic_capstone\etl\data\seeded_ai_agents.csv"

## Data Loading

In [7]:
def read_data(source_path: str) -> pd.DataFrame:
    """

    Args:
        source_path (str): Data Path

    Raises:
        ValueError: Raises error for unsupported data type.

    Returns:
        dataframe: Pandas Dataframe.
    """

    try:
        ext = Path(source_path).suffix
        if ext == ".csv":
            return pd.read_csv(source_path)
        elif ext == ".json":
            return pd.read_json(source_path)
        elif ext == ".parquet":
            return pd.read_parquet(source_path)
        logger.info("Data successfully read!")
    except Exception as e:
        logger.error(f"Error: {e}. Unsupported file format! Use csv, json or parquet.")


## Data Preview

In [8]:
scraped_df = read_data(scraped_data_source)
seed_df = read_data(seed_data_source)

scraped_df.info()
seed_df.info()

2025-05-29 15:07:25,534 - ERROR - Error: File ..\etl\data\29-05-2025_ai_tools_scraped.json does not exist. Unsupported file format! Use csv, json or parquet.
2025-05-29 15:07:25,546 - ERROR - Error: [Errno 2] No such file or directory: '..\\etl\\data\\seeded_ai_agents.csv'. Unsupported file format! Use csv, json or parquet.


AttributeError: 'NoneType' object has no attribute 'info'

## Cleaning

Cleaning "tags" column
- separate the values in the list and choose the unique tag. 
- Each tag must be just a value. (i.e list of len 1)
    + no # in value 
    + no duplicate.

In [None]:
def baseline_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    try:
        df = df.drop(columns=[col for col in ['pricing', 'page'] if col in df.columns])
        new_df = df.dropna()
        new_df = new_df.reset_index(drop=True)
        logger.info("Columns dropped and null values dropped.",
                    extra={
                     "Cols dropped": ['pricing', 'page'],
                     "Null Values Dropped": len(df) - len(new_df)
                    }
                    )
    except Exception as e:
        logger.error(f"Error Raised {e}! Is the input a dataframe? Use a pandas dataframe.",  exc_info=True)
    return df


def remove_hashtags(tags):
    try:
        if isinstance(tags, list):
            clean = [tag for tag in tags if '#' not in tag]
        elif isinstance(tags, str):
            clean = [tags] if "#" not in tags else []
        else:
            clean = []
        clean = ','.join(clean)

        if len(clean) < 4:
            clean = clean.upper()
        else:
            clean = clean.lower().capitalize()
    except Exception as e:
        logger.error(f"Error Raised at tags column cleaning {e}! Use tags column.",  exc_info=True)
    return clean


def clean_data(df):
    try:
        # df = baseline_cleaning(df=scraped_df)
        df = df.drop(columns=[col for col in ['pricing', 'page'] if col in df.columns])
        new_df = df.dropna()
        new_df = new_df.reset_index(drop=True)

        if 'tags' in df.columns:
            new_df['tags'] = new_df['tags'].apply(remove_hashtags)
        else:
            pass

        logger.info("Columns dropped and null values dropped.",
                    extra={
                     "Cols dropped": ['pricing', 'page'],
                     "Null Values Dropped": len(df) - len(new_df)
                    }
                    )
        logger.info("Tags Column Successfully cleaned.")
        logger.info("Data successfully cleaned!")
    except Exception as e:
        logger.error(f"Error Raised at full cleaning process: {e}!",  exc_info=True)
    return new_df




## Transformation

In [None]:
def get_created_at(filepath: str) -> str:
    try:
        created_timestamp = os.path.getctime(filepath)
        created_date = datetime.fromtimestamp(created_timestamp)
    except Exception as e:
        logger.error(f"Error Raised: {e}!", exc_info=True)
    return created_date.strftime("%Y-%M-%d")


def transform_data(df: pd.DataFrame, source = None) -> pd.DataFrame:
    try:
        created_day = get_created_at(scraped_data_source)
        if 'source' in df.columns:
            if df['source'] is not None:
                pass
            else:
                df['source'] = source
        else:
            df['source'] = source


        if 'created_at' in df.columns:
            if df['created_at'] is not None:
                pass
            else:
                df['created_at'] = created_day
        else:
            df['created_at'] = created_day
        if 'updated_at' in df.columns:
            if df['updated_at'] is not None:
                pass
            else:
                df['updated_at'] = None
        else:
            df['updated_at'] = None
        

        if 'trending' not in df.columns:
            df['trending'] = None
        else:
            df["trending"] = df["trending"].apply(
                lambda x: False if x == 'Low' else True
                )
    
        trans_df = df.rename(columns={'url': 'homepage_url', 'tags': 'category'})
        
        trans_df['created_at'] = pd.to_datetime(trans_df['created_at'], format="%Y-%M-%d", errors="coerce")
        trans_df['updated_at'] = pd.to_datetime(trans_df['updated_at'], format="%Y-%M-%d", errors="coerce")
        trans_df['trending'] = trans_df['trending'].notna().astype(bool)

        trans_df.to_csv(f"{df.Name}")

        logger.info("Data successfully transformed!")
    except Exception as e:
        logger.error(f"Error Raised at transformation: {e}!", exc_info=True)
    return trans_df

In [None]:
def merging_dfs(new_df, existing_df) -> pd.DataFrame:
    """
    Merging DFs to extract unique ai_tools
    Returns:
        pd.DataFrame: Merged DF with unique Ai tools
    """
    try:
        merged_df = pd.merge(new_df, existing_df, how="outer")
        merged_df.drop_duplicates(subset="name", inplace=True)
        merged_df = merged_df.reset_index(drop=True)
        logger.info("Seed DF and Scraped DF successfully merged!")
    except Exception as e:
        logger.error("Error merging DFs: %s", e, exc_info=True)
    return merged_df

## ETL

In [None]:
def run_basic_etl() -> pd.DataFrame:
    # Extract
    scraped_df = read_data(scraped_data_source)
    seed_df = read_data(seed_data_source)

    # Clean
    clean_scraped_df = clean_data(scraped_df)
    clean_seed_df = clean_data(seed_df)

    # Transform
    trans_scraped_df = transform_data(
        clean_scraped_df, source="https://aitoolsdirectory.com/"
    )
    trans_seed_df = transform_data(clean_seed_df)

    # Merge Datasets
    final_df = merging_dfs(trans_seed_df, trans_scraped_df)

    return final_df
    

In [None]:
comp_df = run_basic_etl()

2025-05-29 12:13:37,450 - INFO - Columns dropped and null values dropped.
2025-05-29 12:13:37,455 - INFO - Tags Column Successfully cleaned.
2025-05-29 12:13:37,460 - INFO - Data successfully cleaned!
2025-05-29 12:13:37,468 - INFO - Columns dropped and null values dropped.
2025-05-29 12:13:37,472 - INFO - Tags Column Successfully cleaned.
2025-05-29 12:13:37,476 - INFO - Data successfully cleaned!
2025-05-29 12:13:37,496 - INFO - Data successfully transformed!
2025-05-29 12:13:37,552 - INFO - Data successfully transformed!
2025-05-29 12:13:37,618 - INFO - Seed DF and Scraped DF successfully merged!


## Functions to Work On

In [None]:
def delta_check(new_df, existing_df):
    # Merge and check for differences
    merged = new_df.merge(
        existing_df, on=["name", "homepage_url"], how="left", suffixes=("", "_existing")
    )
    changed = merged[
        (merged["email"] != merged["email_existing"])
        | (merged["phone"] != merged["phone_existing"])
    ]
    return changed[new_df.columns] 


def fetch_existing_records(conn):
    return pd.read_sql("SELECT name, homepage_url, email, phone FROM agents", conn)



def upsert_records(conn, df):
    cursor = conn.cursor()
    for _, row in df.iterrows():
        cursor.execute(
            """
            INSERT INTO agents (name, homepage_url, email, phone)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(name, homepage_url)
            DO UPDATE SET email=excluded.email, phone=excluded.phone
        """,
            (row["name"], row["homepage_url"], row["email"], row["phone"]),
        )
    conn.commit()


def etl_job(source_path):
    df = read_data(source_path)

    if needs_transformation(df):
        df = transform_data(df)

    with engine.connect("agents.db") as conn:
        existing_df = fetch_existing_records(conn)
        delta_df = delta_check(df, existing_df)

        if not delta_df.empty:
            upsert_records(conn, delta_df)
            print(f"Upserted {len(delta_df)} records.")
        else:
            print("No changes detected. Idempotent run.")


## Notes

### Pending tasks
- Load Seed data separately.    ```done```
- Load scraped data and check for duplicates with name(lower) and homepage_url.
- Write tests to check for:
    + test for duplicates.
    + test for invalid rows.
    + test for correct upserts.
