In [36]:
# extract_data function : 

def extract_data():
    """
    Extract news articles from NewsAPI for multiple topics and combine them.
    
    Returns:
        list: Combined list of articles from different topics
    """
    import datetime 
    import os 
    from dotenv import load_dotenv
    from newsapi import NewsApiClient 
    
    # Get current date and date from 7 days ago
    current_date = datetime.datetime.now().strftime('%Y-%m-%d')
    print("Current date:", current_date)
    
    seven_days_ago = (datetime.datetime.now() - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
    print("7 days ago:", seven_days_ago)
    
    # Load API key from .env file
    load_dotenv()
    news_api = os.getenv("NEWS_API")
    
    # Initialize NewsAPI client
    newsapi = NewsApiClient(news_api)
    
    # Define topics to search for
    topics = ['GenAI', 'AI', 'Technology']
    combined_articles = []
    
    # Fetch articles for each topic
    for topic in topics:
        articles = newsapi.get_everything(
            q=topic,
            from_param=seven_days_ago,
            to=current_date,
            language='en',
            sort_by='relevancy',
            page=2
        )
        
        print(f"Fetched {len(articles['articles'])} articles for topic: {topic}")
        combined_articles.extend(articles['articles'])
    
    # Remove duplicate articles (same URL)
    seen_urls = set()
    unique_articles = []
    
    for article in combined_articles:
        if article['url'] not in seen_urls:
            seen_urls.add(article['url'])
            unique_articles.append(article)
    
    print(f"Total unique articles fetched: {len(unique_articles)}")
    return unique_articles


In [None]:
import pandas as pd
import ast
import requests
from bs4 import BeautifulSoup

def extract_source_name(source):
    """
    Extract the name from a source object which can be either a dictionary or string.
    
    Args:
        source: Source object from NewsAPI (can be dict or string)
        
    Returns:
        str: Extracted source name or 'Unknown' if not found
    """
    if isinstance(source, dict):
        return source.get('name', 'Unknown')
    elif isinstance(source, str):
        try:
            dict_data = ast.literal_eval(source)  # Attempt to parse it as a dictionary
            if isinstance(dict_data, dict):
                return dict_data.get('name', 'Unknown')
        except (SyntaxError, ValueError):
            # If source is a plain string, return it directly
            return source
    return 'Unknown'

def extract_content(url):
    """
    Extract the main content from a webpage given its URL.
    
    Args:
        url (str): URL of the article
        
    Returns:
        str: Extracted text content or error message
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}  # Mimic a browser request
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an error for bad responses
        
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract meaningful text (modify based on website structure)
        paragraphs = soup.find_all("p")
        content = " ".join(p.text for p in paragraphs)
        
        return content[:1000]  # Return first 1000 characters to avoid large data
    except Exception as e:
        return f"Error: {e}"

def transform_data(combined_articles):
    """
    Transform the combined articles into a cleaned and structured DataFrame.
    
    Args:
        combined_articles (list): List of article dictionaries from NewsAPI
        
    Returns:
        pandas.DataFrame: Transformed and cleaned DataFrame
    """
    # Convert to DataFrame
    combined_articles_df = pd.DataFrame(combined_articles)
    
    # Drop the urlToImage column
    combined_articles_df = combined_articles_df.drop('urlToImage', axis=1)
    
    # Remove duplicates based on description
    final_df = combined_articles_df.drop_duplicates(subset=["description"], keep='first')
    
    # Format dates
    final_df['publishedAt'] = pd.to_datetime(final_df['publishedAt']).dt.strftime('%Y-%m-%d')
    
    # Apply source name extraction
    final_df.loc[:, 'source'] = final_df['source'].apply(extract_source_name)
    
    # Extract full content from each article URL
    final_df['full_content'] = final_df['url'].apply(extract_content)
    
    return final_df

# continue changing 

In [22]:
#  Authenticating from big query 
from google.cloud import bigquery
import os 

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service_account.json"
# Initialize BigQuery client
client = bigquery.Client()

In [23]:


load_dotenv()
project_id = os.getenv("project_id")
dataset_id = os.getenv("dataset_id")
table_id = os.getenv("table_id")
table_ref = f"{project_id}.{dataset_id}.{table_id}"



In [24]:
query = f"""
    SELECT  *
    FROM `{table_ref}`
    ORDER BY publishedAt DESC
    LIMIT 10;
"""

In [25]:
# Run the query
query_job = client.query(query)

# Fetch results
results = query_job.result()

In [26]:
results

<google.cloud.bigquery.table.RowIterator at 0x112952e40>

In [27]:
import pandas as pd

results_df =results.to_dataframe()



In [28]:
results_df = final_df
results_df

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,full_content
0,Business Insider,Lakshmi Varanasi,Wharton has overhauled its curriculum around A...,The University of Pennsylvania's Wharton Schoo...,https://www.businessinsider.com/wharton-busine...,https://i.insider.com/67f0266fb8b41a9673fcb642...,2025-04-07,"Wharton has launched a new ""Artificial Intelli...",The nation's oldest business school is evolvin...
1,VentureBeat,"Pashootan Vaezipoor, Georgian",DeepSeek jolts AI industry: Why AI’s next leap...,"To contextualize DeepSeek’s disruption, let's ...",https://venturebeat.com/ai/deepseek-jolts-ai-i...,https://venturebeat.com/wp-content/uploads/202...,2025-04-05,Join our daily and weekly newsletters for the ...,Join our daily and weekly newsletters for the ...
2,The Verge,Kevin Nguyen,The 7 writing apps I used to start and finish ...,Thereâs a famous two-decade-old Paris Review...,https://www.theverge.com/apps/642131/7-writing...,https://platform.theverge.com/wp-content/uploa...,2025-04-05,The 7 writing apps I used to start and finish ...,"Can you ever use too much software? Yes, but h..."
3,Slashdot.org,EditorDavid,New Tinder Game 'Lets You Flirt With AI Charac...,"Tinder ""is experimenting with a chatbot that c...",https://slashdot.org/story/25/04/05/0414240/ne...,https://a.fsdn.com/sd/topics/ai_64.png,2025-04-06,"Tinder ""is experimenting with a chatbot that c...",\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\tFollow...
4,Slashdot.org,BeauHD,"Midjourney Releases V7, Its First New AI Image...",Midjourney's new V7 image model features a rev...,https://slashdot.org/story/25/04/04/2258217/mi...,https://a.fsdn.com/sd/topics/ai_64.png,2025-04-04,"To use it, you'll first have to rate around 20...",\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\tSlashd...
...,...,...,...,...,...,...,...,...,...
291,Uncrate.com,,The Y Code Advanced Eye Serum,Keeping dark circles and puffiness around the ...,https://shop.uncrate.com/products/well-kept-ad...,http://shop.uncrate.com/cdn/shop/files/y-code-...,2025-04-01,Editor’s Note\r\nKeeping dark circles and puff...,Have an account? \nLog in to check out faster....
293,AppleInsider,news@appleinsider.com (William Gallagher),Apple still wants an iPhone without physical b...,A new rumor says that Apple has not forgotten ...,https://appleinsider.com/articles/25/04/02/app...,https://photos5.appleinsider.com/gallery/63189...,2025-04-02,A new rumor says that Apple has not forgotten ...,\n\n\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tAAPL...
294,CNET,Jon Reed,AI Tools Helped Restore Speech for a Woman Wit...,Scientists used a brain implant and generative...,https://www.cnet.com/tech/services-and-softwar...,https://www.cnet.com/a/img/resize/4919801e2fa1...,2025-04-07,The technology that allows you to transcribe y...,\n Scientists used a brain implant and gene...
295,Forbes,"Gaurav Tewari, Forbes Councils Member, \n Gaur...",Trends In AI Payments Technology Changing How ...,Much like the internet opened the way for inno...,https://www.forbes.com/councils/forbesbusiness...,https://imageio.forbes.com/specials-images/ima...,2025-04-03,"Gaurav Tewari, founder and Managing Partner of...","ByGaurav Tewari ByGaurav Tewari, Forbes Coun..."


In [29]:
# applying the Bert model 

from transformers import pipeline 
 
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")


  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


In [30]:
def analyze_sentiment(text):
    if pd.isna(text) or text == "":
        return {"label": "NEUTRAL", "score": 0.5}  # Default for empty content
    try:
        # DistilBERT has a max token limit, so truncate if needed
        truncated_text = text[:512]  # Simple truncation approach
        result = sentiment_pipeline(truncated_text)[0]
        return result
    except Exception as e:
        print(f"Error processing text: {e}")
        return {"label": "NEUTRAL", "score": 0.5}  # Default for errors

In [31]:
from tqdm import tqdm 
text_column = 'full_content'  # You can change this to your preferred column

# Apply sentiment analysis in batches to avoid memory issues
batch_size = 32
results = []

# Process in batches
for i in tqdm(range(0, len(results_df), batch_size)):
    batch = results_df[text_column].iloc[i:i+batch_size].fillna("").tolist()
    # Filter out empty strings
    batch = [text for text in batch if text.strip()]
    if batch:
        batch_results = sentiment_pipeline(batch)
        # Extend results with empty values for any skipped items
        results.extend(batch_results)
    
    # Fill in missing results for empty strings
    while len(results) < min(i+batch_size, len(results_df)):
        results.append({"label": "NEUTRAL", "score": 0.5})


100%|██████████| 9/9 [00:09<00:00,  1.03s/it]


In [32]:

# Use .loc to assign values:
results_df.loc[:, 'sentiment_label'] = [result['label'] for result in results]
results_df.loc[:, 'sentiment_score'] = [result['score'] for result in results]
results_df.loc[:, 'sentiment_value'] = results_df['sentiment_label'].map({'POSITIVE': 1, 'NEGATIVE': -1, 'NEUTRAL': 0})

# Save the results
results_df.to_csv('news_with_sentiment.csv', index=False)

# Display sentiment distribution
sentiment_counts = results_df['sentiment_label'].value_counts()
print("Sentiment Distribution:")
print(sentiment_counts)

# Optional: Calculate average sentiment by source
source_sentiment = results_df.groupby('source')['sentiment_value'].mean().sort_values(ascending=False)
print("\nAverage Sentiment by Source:")
print(source_sentiment)

Sentiment Distribution:
sentiment_label
POSITIVE    133
NEGATIVE    124
NEUTRAL       1
Name: count, dtype: int64

Average Sentiment by Source:
source
Kotaku                          1.0
Japan Today                     1.0
Schott.com                      1.0
Pypi.org                        1.0
Postsecret.com                  1.0
                               ... 
Rand.org                       -1.0
Researchbuzz.me                -1.0
International Business Times   -1.0
Computerworld                  -1.0
Seocopywriting.com             -1.0
Name: sentiment_value, Length: 86, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df.loc[:, 'sentiment_label'] = [result['label'] for result in results]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df.loc[:, 'sentiment_score'] = [result['score'] for result in results]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df.loc[:, 'sentiment_value'] = results_

In [33]:
results_df 

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,full_content,sentiment_label,sentiment_score,sentiment_value
0,Business Insider,Lakshmi Varanasi,Wharton has overhauled its curriculum around A...,The University of Pennsylvania's Wharton Schoo...,https://www.businessinsider.com/wharton-busine...,https://i.insider.com/67f0266fb8b41a9673fcb642...,2025-04-07,"Wharton has launched a new ""Artificial Intelli...",The nation's oldest business school is evolvin...,POSITIVE,0.997062,1
1,VentureBeat,"Pashootan Vaezipoor, Georgian",DeepSeek jolts AI industry: Why AI’s next leap...,"To contextualize DeepSeek’s disruption, let's ...",https://venturebeat.com/ai/deepseek-jolts-ai-i...,https://venturebeat.com/wp-content/uploads/202...,2025-04-05,Join our daily and weekly newsletters for the ...,Join our daily and weekly newsletters for the ...,NEGATIVE,0.961355,-1
2,The Verge,Kevin Nguyen,The 7 writing apps I used to start and finish ...,Thereâs a famous two-decade-old Paris Review...,https://www.theverge.com/apps/642131/7-writing...,https://platform.theverge.com/wp-content/uploa...,2025-04-05,The 7 writing apps I used to start and finish ...,"Can you ever use too much software? Yes, but h...",POSITIVE,0.904158,1
3,Slashdot.org,EditorDavid,New Tinder Game 'Lets You Flirt With AI Charac...,"Tinder ""is experimenting with a chatbot that c...",https://slashdot.org/story/25/04/05/0414240/ne...,https://a.fsdn.com/sd/topics/ai_64.png,2025-04-06,"Tinder ""is experimenting with a chatbot that c...",\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\tFollow...,NEGATIVE,0.999008,-1
4,Slashdot.org,BeauHD,"Midjourney Releases V7, Its First New AI Image...",Midjourney's new V7 image model features a rev...,https://slashdot.org/story/25/04/04/2258217/mi...,https://a.fsdn.com/sd/topics/ai_64.png,2025-04-04,"To use it, you'll first have to rate around 20...",\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\tSlashd...,NEGATIVE,0.999119,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
291,Uncrate.com,,The Y Code Advanced Eye Serum,Keeping dark circles and puffiness around the ...,https://shop.uncrate.com/products/well-kept-ad...,http://shop.uncrate.com/cdn/shop/files/y-code-...,2025-04-01,Editor’s Note\r\nKeeping dark circles and puff...,Have an account? \nLog in to check out faster....,POSITIVE,0.995623,1
293,AppleInsider,news@appleinsider.com (William Gallagher),Apple still wants an iPhone without physical b...,A new rumor says that Apple has not forgotten ...,https://appleinsider.com/articles/25/04/02/app...,https://photos5.appleinsider.com/gallery/63189...,2025-04-02,A new rumor says that Apple has not forgotten ...,\n\n\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tAAPL...,NEGATIVE,0.997126,-1
294,CNET,Jon Reed,AI Tools Helped Restore Speech for a Woman Wit...,Scientists used a brain implant and generative...,https://www.cnet.com/tech/services-and-softwar...,https://www.cnet.com/a/img/resize/4919801e2fa1...,2025-04-07,The technology that allows you to transcribe y...,\n Scientists used a brain implant and gene...,NEGATIVE,0.501352,-1
295,Forbes,"Gaurav Tewari, Forbes Councils Member, \n Gaur...",Trends In AI Payments Technology Changing How ...,Much like the internet opened the way for inno...,https://www.forbes.com/councils/forbesbusiness...,https://imageio.forbes.com/specials-images/ima...,2025-04-03,"Gaurav Tewari, founder and Managing Partner of...","ByGaurav Tewari ByGaurav Tewari, Forbes Coun...",POSITIVE,0.988893,1


In [34]:
# writing results to big query : 

job_config = bigquery.LoadJobConfig(
    # Choose the write disposition based on your needs:
    # WRITE_TRUNCATE: overwrite the table if it exists
    # WRITE_APPEND: append to the table if it exists
    # WRITE_EMPTY: only write if the table is empty
    write_disposition="WRITE_APPEND",
    
    # Automatically detect the schema from the dataframe
    autodetect=True
) 

destination_table_id=table_ref


In [35]:
try:
    # Convert dataframe to proper format for BigQuery
    # Handle any data type issues
    for col in results_df.columns:
        if results_df[col].dtype == 'object':
            results_df[col] = results_df[col].astype(str).fillna('')
    
    # Convert timestamp columns if needed
    if 'publishedAt' in results_df.columns:
        df['publishedAt'] = pd.to_datetime(results_df['publishedAt'])
    
    # Load the dataframe into BigQuery
    load_job = client.load_table_from_dataframe(
        results_df, destination_table_id, job_config=job_config
    )
    
    # Wait for the job to complete
    load_job.result()
    
    # Get the resulting table and print info
    destination_table = client.get_table(destination_table_id)
    print(f"Loaded {destination_table.num_rows} rows and {len(destination_table.schema)} columns to {destination_table_id}")
    
except Exception as e:
    print(f"Error uploading to BigQuery: {e}")

Error uploading to BigQuery: name 'df' is not defined


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df[col] = results_df[col].astype(str).fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df[col] = results_df[col].astype(str).fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df[col] = results_df[col].astype(str).fillna('')
A value is trying to be set on a c