# Import components

In [1]:
import os
import logging
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv
import functions_framework 

try:
    # Normal case: the `components` package is already importable
    # (e.g. the working directory is the project root).
    from components.extract_content import extract_data
    from components.transform_content import transform_data
    from components.analyse_sentiment import analyze_sentiment
    from components.load_content import load_data_to_bigquery
    from components.logging_config import setup_logging, get_logger
except ImportError:
    # Fallback: the code lives one level below the project root, so add the
    # parent directory to the import path and retry.
    import sys

    # `__file__` only exists when this code runs as a script/module. Inside a
    # notebook kernel it is undefined, so use the working directory instead of
    # letting a NameError mask the original ImportError.
    try:
        base_dir = os.path.dirname(__file__)
    except NameError:
        base_dir = os.getcwd()

    # Get the absolute path to the project root directory
    project_root = os.path.abspath(os.path.join(base_dir, '..'))
    # Guard against piling up duplicate entries when the cell is re-run.
    if project_root not in sys.path:
        sys.path.append(project_root)

    # Try again with the updated path
    from components.extract_content import extract_data
    from components.transform_content import transform_data
    from components.analyse_sentiment import analyze_sentiment
    from components.load_content import load_data_to_bigquery
    from components.logging_config import setup_logging, get_logger


  from .autonotebook import tqdm as notebook_tqdm


# Setup logging

In [2]:

# Run the project's own logging setup first, then grab a logger named after
# this module for the notebook's messages.
setup_logging()
logger = get_logger(__name__)

# Configure logging for cloud environment (avoid file handlers in Cloud Functions):
# only a stream handler is attached here.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers - confirm whether setup_logging() installs any.
console_handler = logging.StreamHandler()
logging.basicConfig(
    handlers=[console_handler],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


In [5]:
# Log function start
logger.info(f"News ETL pipeline triggered at {datetime.now().isoformat()}")

# Load variables from a local .env file (if one exists) before reading the
# configuration. By default load_dotenv() does not override variables that are
# already set, so a deployed environment keeps its own values.
load_dotenv()

# Get configuration from environment variables
# Path to the service-account key file; None when the variable is not set.
service_account_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
# Load mode passed on to the BigQuery load step; falls back to "append".
load_method = os.environ.get("LOAD_METHOD", "append")

2025-04-27 21:51:08,895 - __main__ - INFO - News ETL pipeline triggered at 2025-04-27T21:51:08.895024


# Step 1: Extract data from NewsAPI


In [6]:
# Announce the stage, fetch the raw articles, then report how many came back.
logger.info("Step 1: Extracting news data...")
articles = extract_data()
extract_summary = f"Extracted {len(articles)} articles"
logger.info(extract_summary)

2025-04-27 21:51:34,470 - __main__ - INFO - Step 1: Extracting news data...
2025-04-27 21:51:34,473 - components.extract_content - INFO - Current date: 2025-04-27
2025-04-27 21:51:34,475 - components.extract_content - INFO - 7 days ago: 2025-04-20
2025-04-27 21:51:34,479 - components.extract_content - INFO - Fetching articles for topic: GenAI...
2025-04-27 21:51:37,015 - components.extract_content - INFO - Fetched 100 articles for topic: GenAI
2025-04-27 21:51:37,016 - components.extract_content - INFO - Fetching articles for topic: AI...
2025-04-27 21:51:37,444 - components.extract_content - INFO - Fetched 99 articles for topic: AI
2025-04-27 21:51:37,444 - components.extract_content - INFO - Fetching articles for topic: Technology...
2025-04-27 21:51:38,281 - components.extract_content - INFO - Fetched 100 articles for topic: Technology
2025-04-27 21:51:38,282 - components.extract_content - INFO - Total unique articles fetched: 289
2025-04-27 21:51:38,283 - __main__ - INFO - Extracte

# Step 2: Transform the data

In [7]:
# Announce the stage, run the transformation over the extracted articles,
# then report the size of the result.
logger.info("Step 2: Transforming news data...")
transformed_df = transform_data(articles)
transform_summary = f"Transformed data: {len(transformed_df)} articles"
logger.info(transform_summary)

2025-04-27 21:53:25,435 - __main__ - INFO - Step 2: Transforming news data...
2025-04-27 21:53:25,442 - components.transform_content - INFO - Transforming data...
2025-04-27 21:53:25,476 - components.transform_content - INFO - After removing duplicates: 275 articles
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['publishedAt'] = pd.to_datetime(final_df['publishedAt']).dt.strftime('%Y-%m-%d')
2025-04-27 21:53:25,500 - components.transform_content - INFO - Extracting full content from URLs (this may take a while)...
100%|██████████| 275/275 [02:34<00:00,  1.78it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/sta

# Step 3: Analyze sentiment

In [8]:
# Bracket the sentiment pass with start/finish log lines; the result of
# analyze_sentiment() is what the load step consumes.
for stage_message, run_stage in (
    ("Step 3: Analyzing sentiment...", True),
    ("Sentiment analysis complete", False),
):
    logger.info(stage_message)
    if run_stage:
        final_df = analyze_sentiment(transformed_df)

2025-04-27 21:56:00,526 - __main__ - INFO - Step 3: Analyzing sentiment...
2025-04-27 21:56:00,526 - components.analyse_sentiment - INFO - Initializing sentiment analysis pipeline...
Device set to use mps:0
2025-04-27 21:56:01,463 - components.analyse_sentiment - INFO - Sentiment analysis pipeline initialized successfully
2025-04-27 21:56:01,465 - components.analyse_sentiment - INFO - Applying sentiment analysis to 275 articles...
100%|██████████| 9/9 [00:11<00:00,  1.24s/it]
2025-04-27 21:56:12,641 - components.analyse_sentiment - INFO - Sentiment analysis completed successfully
2025-04-27 21:56:12,647 - __main__ - INFO - Sentiment analysis complete


# Step 4: Load data to BigQuery

In [9]:
# Announce the stage, then hand the final frame to the BigQuery loader together
# with the credentials path and load mode read from the environment earlier.
logger.info("Step 4: Loading data to BigQuery...")
load_options = dict(service_account_path=service_account_path, method=load_method)
rows_loaded = load_data_to_bigquery(final_df, **load_options)

2025-04-27 22:12:19,412 - __main__ - INFO - Step 4: Loading data to BigQuery...
2025-04-27 22:12:19,430 - components.load_content - INFO - BigQuery Configuration - Project: upheld-quanta-455417-m4, Dataset: news_dataset, Table: news_articles
2025-04-27 22:12:19,431 - components.load_content - INFO - Service account file not found or not specified. Using default credentials.
  dataframe.to_gbq(


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=k8EZrVoFF8LBSPl3CPMjvqKmrOOLdH&prompt=consent&access_type=offline


2025-04-27 22:12:31,829 - google_auth_oauthlib.flow - INFO - "GET /?state=k8EZrVoFF8LBSPl3CPMjvqKmrOOLdH&code=4/0Ab_5qlk-3m0G2KFknSj4jF76rTHLvr3JYkLgwNQ2hMwE65Zu7EwtTcFx9NNAutTiRLuNYg&scope=https://www.googleapis.com/auth/bigquery HTTP/1.1" 200 65
275 out of 275 rows loaded.<?, ?it/s]2025-04-27 22:12:36,267 - pandas_gbq.gbq - INFO - 
100%|██████████| 1/1 [00:00<00:00, 1216.45it/s]
2025-04-27 22:12:36,269 - components.load_content - INFO - Successfully loaded 275 rows to news_dataset.news_articles


In [10]:
# Report the value returned by the load step.
load_summary = f"Successfully loaded {rows_loaded} rows to BigQuery"
logger.info(load_summary)

2025-04-27 22:18:43,729 - __main__ - INFO - Successfully loaded 275 rows to BigQuery
