<a href="https://colab.research.google.com/github/EpicMike87/AIProject/blob/main/Google_Headline_Scraper_%26_Sentiment_Analysis_AzureAPI_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Code to mount google drive and change working directory
from google.colab import drive
import os

drive.mount('/content/drive')
ai_directory_path = '/content/drive/My Drive/AI'

if not os.path.exists(ai_directory_path):
    os.makedirs(ai_directory_path)
    print(f"Created AI directory in Google Drive")
else:
    print(f"AI directory already exists in Google Drive")

%cd '/content/drive/My Drive/AI'
%pwd
%ls

Mounted at /content/drive
AI directory already exists in Google Drive
/content/drive/My Drive/AI
google_news_headlines_by_date.csv  sentiment_analysis_results_2.csv


In [2]:
# Checks present working directory and presence of csv files
%pwd
%ls

google_news_headlines_by_date.csv  sentiment_analysis_results_2.csv


In [None]:
# Run these on first use

!pip install feedparser
!pip install azure-ai-textanalytics
!pip install flair
!pip install azure-ai-textanalytics

In [None]:
### Google News Headline Scraper

In [5]:
import pandas as pd
from datetime import datetime, timedelta
import feedparser

# Define your start and end dates for filtering.
# NOTE: end_date is midnight at the *start* of 8 April, so articles published
# later on that day fall outside the window.
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 4, 8)

rss_url = "https://news.google.com/rss/search?q=%22S%26P+500%22+OR+%22SPX%22+OR+%22$SPY%22&hl=en-US&gl=US&ceid=US:en"
feed = feedparser.parse(rss_url)

articles = []

for entry in feed.entries:
    # An entry without a parseable publication date cannot be placed in the
    # window; skip it instead of crashing on datetime(*None[:6]).
    published_parsed = entry.get('published_parsed')
    if not published_parsed:
        continue

    published_date = datetime(*published_parsed[:6])

    # Filter articles by the specified start and end dates
    if start_date <= published_date <= end_date:
        headline = entry.title
        articles.append({'date': published_date, 'headline': headline})

# Create a DataFrame from the list of articles. Naming the columns explicitly
# keeps the CSV header present even when nothing matched, so the CSV can still
# be read back with pd.read_csv.
df_articles = pd.DataFrame(articles, columns=['date', 'headline'])

print(df_articles.head())

output_file_name = 'google_news_headlines_by_date.csv'
df_articles.to_csv(output_file_name, index=False)

print(f"Filtered and saved articles to {output_file_name}.")


                 date                                           headline
0 2024-04-06 08:20:38  S&P 500 Snapshot: Index Begins Q2 With Weekly ...
1 2024-04-04 16:02:00  The S&P 500’s Rally Is Broadening Out Beyond T...
2 2024-04-06 09:40:00  This ETF is beating the S&P 500 — and it's com...
3 2024-04-06 20:07:00  Is It Smart to Buy Stocks With the S&P 500 at ...
4 2024-04-04 20:59:42  S&P 500 Gains and Losses Today: Index Slips Am...
Filtered and saved articles to google_news_headlines_by_date.csv.


In [7]:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
import pandas as pd

# Set up the Azure Text Analytics client
def authenticate_client():
    """Build an Azure Text Analytics client without storing secrets in the notebook.

    The endpoint is read from the ``AZURE_LANGUAGE_ENDPOINT`` environment
    variable, falling back to the endpoint this notebook was written against.
    The key is read from ``AZURE_LANGUAGE_KEY``; when that variable is unset the
    user is prompted for it with hidden input.

    Returns:
        TextAnalyticsClient: a client authenticated with an ``AzureKeyCredential``.

    Raises:
        ValueError: if no key is available from the environment or the prompt.
    """
    import os
    from getpass import getpass

    endpoint = os.environ.get(
        "AZURE_LANGUAGE_ENDPOINT",
        "https://sentimentanalysishp.cognitiveservices.azure.com/",
    )

    # SECURITY: never paste the key into this cell. A key that has been saved
    # in a shared or committed notebook must be regenerated in the Azure portal.
    key = os.environ.get("AZURE_LANGUAGE_KEY") or getpass("Azure Text Analytics key: ")
    if not key:
        raise ValueError(
            "No Azure Text Analytics key provided; set AZURE_LANGUAGE_KEY or enter it when prompted."
        )

    return TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))

# Analyze sentiment of documents (headlines)
def analyze_headline_sentiment(headlines, dates, client=None,
                               positive_threshold=0.3, negative_threshold=0.3):
    """Label each headline as POSITIVE, NEGATIVE or NEUTRAL.

    Each headline is sent to the service on its own. The service's confidence
    scores are then re-thresholded so that a headline counts as positive or
    negative as soon as the matching score exceeds its threshold. When both
    scores exceed their thresholds, the larger one decides the label (ties go
    to POSITIVE), so a headline scored 0.35 positive / 0.60 negative is
    reported as NEGATIVE.

    Args:
        headlines: iterable of headline strings.
        dates: iterable of dates, paired with ``headlines`` by position
            (pairing stops at the shorter of the two).
        client: object exposing ``analyze_sentiment(documents=..., language=...)``.
            When omitted, one is created with ``authenticate_client()``.
        positive_threshold: positive score above which a headline may be
            labelled POSITIVE.
        negative_threshold: negative score above which a headline may be
            labelled NEGATIVE.

    Returns:
        list[dict]: one dict per successfully analysed headline with the keys
        ``date``, ``headline``, ``sentiment`` and ``confidence_scores``.
        Headlines that fail are reported with ``print`` and left out.
    """
    if client is None:
        client = authenticate_client()
    sentiment_responses = []

    for headline, date in zip(headlines, dates):
        # Best effort: one failing headline must not abort the whole run.
        try:
            result = client.analyze_sentiment(documents=[headline], language="en")[0]

            if result.is_error:
                print(f"Error analyzing sentiment for '{headline}': {result.error}")
                continue

            scores = result.confidence_scores
            positive = scores["positive"]
            negative = scores["negative"]
            leans_positive = positive > positive_threshold
            leans_negative = negative > negative_threshold

            if leans_positive and leans_negative:
                # Both directions clear their threshold: the stronger one wins.
                sentiment_label = "POSITIVE" if positive >= negative else "NEGATIVE"
            elif leans_positive:
                sentiment_label = "POSITIVE"
            elif leans_negative:
                sentiment_label = "NEGATIVE"
            else:
                sentiment_label = "NEUTRAL"

            sentiment_responses.append({
                'date': date,
                'headline': headline,
                'sentiment': sentiment_label,
                'confidence_scores': scores
            })
        except Exception as e:
            print(f"Error analyzing sentiment for '{headline}': {e}")

    return sentiment_responses

# Specify the CSV file path containing headlines
csv_path = '/content/drive/My Drive/AI/google_news_headlines_by_date.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_path)

# Extract all headlines and dates from the DataFrame.
# Incomplete rows are dropped as a unit: removing NaNs from each column
# separately would shift one list against the other and pair headlines with
# the wrong dates.
df_complete = df.dropna(subset=['headline', 'date'])
headlines = df_complete['headline'].tolist()
dates = df_complete['date'].tolist()

# Analyze sentiment for all headlines
sentiment_responses = analyze_headline_sentiment(headlines, dates)

# Print sentiment analysis results for each headline
for response in sentiment_responses:
    print(f"Date: {response['date']} - Headline: {response['headline']}")
    print(f"Sentiment: {response['sentiment']}, Scores: {response['confidence_scores']}")
    print("-------")


Date: 2024-04-06 08:20:38 - Headline: S&P 500 Snapshot: Index Begins Q2 With Weekly Loss - Advisor Perspectives
Sentiment: NEUTRAL, Scores: {'positive': 0.01, 'neutral': 0.86, 'negative': 0.13}
-------
Date: 2024-04-04 16:02:00 - Headline: The S&P 500’s Rally Is Broadening Out Beyond Tech Stocks - Barron's
Sentiment: NEUTRAL, Scores: {'positive': 0.05, 'neutral': 0.94, 'negative': 0.01}
-------
Date: 2024-04-06 09:40:00 - Headline: This ETF is beating the S&P 500 — and it's completely different from the index - Yahoo Finance
Sentiment: NEUTRAL, Scores: {'positive': 0.03, 'neutral': 0.8, 'negative': 0.17}
-------
Date: 2024-04-06 20:07:00 - Headline: Is It Smart to Buy Stocks With the S&P 500 at an All-Time High? History Offers a Clear Answer - Yahoo Finance
Sentiment: NEUTRAL, Scores: {'positive': 0.14, 'neutral': 0.83, 'negative': 0.03}
-------
Date: 2024-04-04 20:59:42 - Headline: S&P 500 Gains and Losses Today: Index Slips Amid Rate Cut Uncertainty - Investopedia
Sentiment: NEUTRAL,