## Actions
- Load seed data into DB. 
- Clean and transform scraped data
- Create an idempotent etl job

In [1]:
import pandas as pd
from pathlib import Path
import os
from datetime import datetime
import logging

In [2]:
# Setup logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


In [3]:
scraped_data_source = r"C:\Users\APIN PC\OneDrive\Documents\DS\DE_Inter\data_epic_capstone\etl\data\ai_tools_scraped.json"
seed_data_source = r"C:\Users\APIN PC\OneDrive\Documents\DS\DE_Inter\data_epic_capstone\etl\data\seeded_ai_agents.csv"

## Data Loading

In [4]:
def read_data(source_path: str) -> pd.DataFrame:
    """

    Args:
        source_path (str): Data Path

    Raises:
        ValueError: Raises error for unsupported data type.

    Returns:
        dataframe: Pandas Dataframe.
    """

    try:
        ext = Path(source_path).suffix
        if ext == ".csv":
            return pd.read_csv(source_path)
        elif ext == ".json":
            return pd.read_json(source_path)
        elif ext == ".parquet":
            return pd.read_parquet(source_path)
        logger.info("Data successfully read!")
    except Exception as e:
        logger.error(f"Error: {e}. Unsupported file format! Use csv, json or parquet.")


## Data Preview

In [5]:
scraped_df = read_data(scraped_data_source)
seed_df = read_data(seed_data_source)

scraped_df.info()
seed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 379 entries, 0 to 378
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         379 non-null    object
 1   description  379 non-null    object
 2   url          377 non-null    object
 3   tags         378 non-null    object
 4   pricing      378 non-null    object
 5   page         379 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 17.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          72 non-null     object
 1   description   72 non-null     object
 2   homepage_url  72 non-null     object
 3   category      72 non-null     object
 4   source        72 non-null     object
 5   created_at    72 non-null     object
 6   updated_at    72 non-null     object
 7   trending      72 non-null     ob

## Cleaning

Cleaning "tags" column
- separate the values in the list and choose the unique tag. 
- Each tag must be just a value. (i.e list of len 1)
    + no # in value 
    + no duplicate.

In [6]:
def baseline_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    try:
        df = df.drop(columns=[col for col in ['pricing', 'page'] if col in df.columns])
        new_df = df.dropna()
        new_df = new_df.reset_index(drop=True)
        logger.info("Columns dropped and null values dropped.",
                    extra={
                     "Cols dropped": ['pricing', 'page'],
                     "Null Values Dropped": len(df) - len(new_df)
                    }
                    )
    except Exception as e:
        logger.error(f"Error Raised {e}! Is the input a dataframe? Use a pandas dataframe.",  exc_info=True)
    return df


def remove_hashtags(tags):
    try:
        if isinstance(tags, list):
            clean = [tag for tag in tags if '#' not in tag]
        elif isinstance(tags, str):
            clean = [tags] if "#" not in tags else []
        else:
            clean = []
        clean = ','.join(clean)

        if len(clean) < 4:
            clean = clean.upper()
        else:
            clean = clean.lower().capitalize()
    except Exception as e:
        logger.error(f"Error Raised at tags column cleaning {e}! Use tags column.",  exc_info=True)
    return clean


def clean_data(df):
    try:
        # df = baseline_cleaning(df=scraped_df)
        df = df.drop(columns=[col for col in ['pricing', 'page'] if col in df.columns])
        new_df = df.dropna()
        new_df = new_df.reset_index(drop=True)

        if 'tags' in df.columns:
            new_df['tags'] = new_df['tags'].apply(remove_hashtags)
        else:
            pass

        logger.info("Columns dropped and null values dropped.",
                    extra={
                     "Cols dropped": ['pricing', 'page'],
                     "Null Values Dropped": len(df) - len(new_df)
                    }
                    )
        logger.info("Tags Column Successfully cleaned.")
        logger.info("Data successfully cleaned!")
    except Exception as e:
        logger.error(f"Error Raised at full cleaning process: {e}!",  exc_info=True)
    return new_df




## Transformation

In [7]:
def needs_transformation(df):
    # Add your logic here (e.g., missing columns, data types)
    # check for missing values,

    if df.

    return some_condition

SyntaxError: invalid syntax (4102930063.py, line 5)

In [62]:
def get_created_at(filepath: str) -> str:
    try:
        created_timestamp = os.path.getctime(filepath)
        created_date = datetime.fromtimestamp(created_timestamp)
    except Exception as e:
        logger.error(f"Error Raised: {e}!", exc_info=True)
    return created_date.strftime("%Y-%M-%d")


def transform_data(df: pd.DataFrame, source = None) -> pd.DataFrame:
    try:
        created_day = get_created_at(scraped_data_source)
        if 'source' in df.columns:
            if df['source'] is not None:
                pass
            else:
                df['source'] = source
        else:
            df['source'] = source


        if 'created_at' in df.columns:
            if df['created_at'] is not None:
                pass
            else:
                df['created_at'] = created_day
        else:
            df['created_at'] = created_day
        if 'updated_at' in df.columns:
            if df['updated_at'] is not None:
                pass
            else:
                df['updated_at'] = None
        else:
            df['updated_at'] = None
        

        if 'trending' not in df.columns:
            df['trending'] = None
        else:
            df['trending'] = df['trending'].replace({"Low": False, "Medium": True, "High": True})
    
        trans_df = df.rename(columns={'url': 'homepage_url', 'tags': 'category'})
        
        trans_df['created_at'] = pd.to_datetime(trans_df['created_at'], format="%Y-%M-%d", errors="coerce")
        trans_df['updated_at'] = pd.to_datetime(trans_df['updated_at'], format="%Y-%M-%d", errors="coerce")
        trans_df['trending'] = trans_df['trending'].notna().astype(bool)

        logger.info("Data successfully transformed!")
    except Exception as e:
        logger.error(f"Error Raised at transformation: {e}!", exc_info=True)
    return trans_df

## ETL

In [63]:
def run_basic_et(source: str) -> pd.DataFrame:
    # Extract
    scraped_df = read_data(source)

    # Clean
    clean_df = clean_data(scraped_df)

    # Transform
    trans_df = transform_data(clean_df, source='https://aitoolsdirectory.com/')

    # Load
    "I dey come. Mapami"

    return trans_df
    

In [64]:
seed_clean_df = run_basic_et(source=seed_data_source)
scraped_clean_df = run_basic_et(source=scraped_data_source)

2025-05-28 14:47:04,328 - INFO - Columns dropped and null values dropped.
2025-05-28 14:47:04,331 - INFO - Tags Column Successfully cleaned.
2025-05-28 14:47:04,333 - INFO - Data successfully cleaned!
  df['trending'] = df['trending'].replace({"Low": False, "Medium": True, "High": True})
2025-05-28 14:47:04,447 - INFO - Data successfully transformed!
2025-05-28 14:47:04,598 - INFO - Columns dropped and null values dropped.
2025-05-28 14:47:04,606 - INFO - Tags Column Successfully cleaned.
2025-05-28 14:47:04,611 - INFO - Data successfully cleaned!
2025-05-28 14:47:04,749 - INFO - Data successfully transformed!


In [65]:
seed_clean_df

Unnamed: 0,name,description,homepage_url,category,source,created_at,updated_at,trending
0,HubSpot Marketing AI,AI-powered marketing automation and content ge...,https://www.hubspot.com/products/marketing/art...,Marketing,HubSpot,2023-01-15 00:01:00,2024-01-01 00:10:00,True
1,Jasper.ai,AI content creation platform for marketing cop...,https://www.jasper.ai/,Marketing,Jasper,2021-01-01 00:02:00,2024-01-15 00:09:00,True
2,Copy.ai,AI-powered copywriting assistant for marketing...,https://www.copy.ai/,Marketing,Copy.ai,2020-01-01 00:10:00,2024-01-10 00:10:00,True
3,MarketMuse,AI content planning and optimization for SEO a...,https://www.marketmuse.com/,Marketing,MarketMuse,2018-01-01 00:05:00,2024-01-20 00:08:00,True
4,Persado,AI language generation for marketing messaging...,https://www.persado.com/,Marketing,Persado,2012-01-01 00:03:00,2024-01-30 00:07:00,True
...,...,...,...,...,...,...,...,...
67,Eleven Labs,AI voice synthesis and cloning platform,https://elevenlabs.io/,Others,Eleven Labs,2022-01-01 00:01:00,2024-01-20 00:08:00,True
68,Murf AI,AI voice generation for content creation,https://murf.ai/,Others,Murf,2020-01-01 00:01:00,2024-01-30 00:07:00,True
69,Synthesia,AI video creation with synthetic avatars,https://www.synthesia.io/,Others,Synthesia,2017-01-01 00:01:00,2024-01-15 00:08:00,True
70,DeepL,AI translation and language processing,https://www.deepl.com/,Others,DeepL,2017-01-01 00:01:00,2024-01-05 00:09:00,True


## Merging Data Sets

In [None]:
comp_df = pd.merge(seed_clean_df, scraped_clean_df, how='outer')
comp_df.drop_duplicates(subset='name', inplace=True)
comp_df = comp_df.reset_index(drop=True)
comp_df

In [None]:
ai_list = pd.Series(comp_df['name'])
ai_list.to_csv('../etl/data/Ai_tools_list.csv', index=False)

In [None]:
scraped_clean_df.dtypes

name                    object
description             object
homepage_url            object
category                object
source                  object
created_at      datetime64[ns]
updated_at      datetime64[ns]
trending                  bool
dtype: object

In [56]:
scraped_df.isna().sum()

for col in scraped_clean_df.columns:
    if scraped_clean_df[col].isna().sum() > 0:
        logger.info("Columns with missing values present %s", col)
    if scraped_clean_df[col].dtype == list:
        print(col)

2025-05-28 13:12:17,675 - INFO - Columns with missing values present updated_at


name
description
homepage_url
category
source


## Functions to Work On

In [None]:
def delta_check(new_df, existing_df):
    # Merge and check for differences
    merged = new_df.merge(
        existing_df, on=["name", "homepage_url"], how="left", suffixes=("", "_existing")
    )
    changed = merged[
        (merged["email"] != merged["email_existing"])
        | (merged["phone"] != merged["phone_existing"])
    ]
    return changed[new_df.columns] 


def fetch_existing_records(conn):
    return pd.read_sql("SELECT name, homepage_url, email, phone FROM agents", conn)



def upsert_records(conn, df):
    cursor = conn.cursor()
    for _, row in df.iterrows():
        cursor.execute(
            """
            INSERT INTO agents (name, homepage_url, email, phone)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(name, homepage_url)
            DO UPDATE SET email=excluded.email, phone=excluded.phone
        """,
            (row["name"], row["homepage_url"], row["email"], row["phone"]),
        )
    conn.commit()


def etl_job(source_path):
    df = read_data(source_path)

    if needs_transformation(df):
        df = transform_data(df)

    with engine.connect("agents.db") as conn:
        existing_df = fetch_existing_records(conn)
        delta_df = delta_check(df, existing_df)

        if not delta_df.empty:
            upsert_records(conn, delta_df)
            print(f"Upserted {len(delta_df)} records.")
        else:
            print("No changes detected. Idempotent run.")


name           object
description    object
url            object
tags           object
pricing        object
page            int64
dtype: object

## Notes

### Pending tasks
- Load data into db
- Get created_at for scraped ai tools. 

### Action
- Create a new csv file to combine both seeded and scraped tools.  `done`
- Check for duplicates. `done`
- Log duplicates and update duplicates `pending`


In [66]:
comp_df.columns

Index(['name', 'description', 'homepage_url', 'category', 'source',
       'created_at', 'updated_at', 'trending'],
      dtype='object')