# Data Scraping for Sentiment Analysis

The data comes from the Economic Times and Reddit websites and covers a total of 37 days, from 21 March to 25 April 2024.

In [1]:
# standard library
import datetime
import math
import os

# third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import yfinance as yf
from bs4 import BeautifulSoup

# reddit news import
import praw

# sentimental analysis
from textblob import TextBlob

import darts
from darts import TimeSeries
from darts import concatenate
from darts.dataprocessing.transformers import MissingValuesFiller
from darts.dataprocessing.transformers import Scaler

# model
from darts.models import TCNModel,NBEATSModel,TransformerModel,BlockRNNModel
# darts model evaluation
from darts.metrics import mape, rmse, r2_score

  _empty_series = pd.Series()
The statsforecast module could not be imported. To enable support for the StatsForecastAutoARIMA, StatsForecastAutoETS and Croston models, please consider installing it.


In [2]:
# average 2-3 news update a day.
def scrape_economic_times_news(url='https://economictimes.indiatimes.com/topic/microsoft',
                               timeout=10):
    """Scrape headline titles and date strings from an Economic Times topic page.

    Parameters
    ----------
    url : str
        Page to fetch. Defaults to the Microsoft topic page.
    timeout : float
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(titles, dates)``: the stripped text of every ``<h2>`` element and
        of every ``<time>`` element on the page, in document order. Both
        lists are empty when the request fails or the response status is
        not 200, so the result can always be tuple-unpacked.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failure (DNS, refused connection, timeout, ...):
        # report it the same way as a non-200 response.
        return [], []

    # Anything other than 200 is treated as a failed fetch.
    if response.status_code != 200:
        return [], []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Titles and dates are collected independently; callers pair them by
    # position. NOTE(review): this assumes every <h2> has a matching <time>
    # in the same order -- confirm against the page markup.
    titles = [title_element.text.strip() for title_element in soup.find_all('h2')]
    dates = [date_element.text.strip() for date_element in soup.find_all('time')]

    return titles, dates

# Call the function to scrape the titles and dates.
# The scraper may signal failure with a falsy result (e.g. None); fall back to
# empty lists so the unpacking below cannot raise.
scraped = scrape_economic_times_news()
titles, dates = scraped if scraped else ([], [])

data_list = []
if titles and dates:
    print("NEWS Titles:")
    # zip() pairs titles and dates by position and stops at the shorter list.
    for title, date in zip(titles, dates):
        print(f"Title: {title}, Date: {date}")
        data_list.append({'Title': title, 'Date': date})
else:
    print("Failed to fetch news.")

# Pin the column schema so the frame has a 'Date' column even when nothing
# was scraped (otherwise the conversions below raise KeyError).
df_ET = pd.DataFrame(data_list, columns=['Title', 'Date'])

# Scraped dates look like "12 May, 2024, 04:32 PM IST": keep the first 12
# characters ("12 May, 2024"), parse them once, and store ISO date strings.
df_ET['Date'] = (
    pd.to_datetime(df_ET['Date'].str[:12], format='%d %b, %Y')
    .dt.strftime('%Y-%m-%d')
)
df_ET.info()

NEWS Titles:
Title: Apple will revamp Siri to catch up to its chatbot competitors, Date: 12 May, 2024, 04:32 PM IST
Title: Eye on AI: Apple’s chip game, Meta's GenAI tools for advertisers and other top developments, Date: 12 May, 2024, 06:00 AM IST
Title: Ola's Bhavish Aggarwal snaps ties with Microsoft Azure in stand against Western tech, Date: 11 May, 2024, 08:59 PM IST
Title: Britain attracts new £1 biliion AI investment, Date: 11 May, 2024, 02:23 PM IST
Title: Microsoft hit with $242 million US verdict in Cortana patent lawsuit, Date: 11 May, 2024, 09:46 AM IST
Title: OpenAI plans to announce Google search competitor on Monday, sources say, Date: 10 May, 2024, 09:33 AM IST
Title: Explainer: What risks do advanced AI models pose in the wrong hands?, Date: 09 May, 2024, 09:52 PM IST
Title: US eyes curbs on China's access to AI software behind apps like ChatGPT, Date: 08 May, 2024, 04:36 PM IST
Title: AI boom set to fuel data centre deals in Asia this year, Date: 08 May, 2024, 08:34 A

In [3]:
# 2. reddit

# API credentials are read from the environment instead of being stored in the
# notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running this
# cell; a missing variable raises KeyError here rather than failing later.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='Stockprediction_aalborg')

subreddit = reddit.subreddit('Microsoft')
keywords = ['Microsoft']

search_results = []

# search with keywords
for keyword in keywords:
    for submission in subreddit.search(query=keyword, sort='new', time_filter='week'):
        # created_utc is converted as a UTC timestamp; the timezone-aware
        # constructor replaces the deprecated datetime.utcfromtimestamp and
        # formats to the same string.
        created = datetime.datetime.fromtimestamp(submission.created_utc,
                                                  tz=datetime.timezone.utc)
        search_results.append({
            'Title': submission.title,
            'Author': submission.author,
            'Score': submission.score,
            'URL': submission.url,
            'Date': created.strftime('%Y-%m-%d %H:%M:%S')
        })


for result in search_results:
    print(result)


# Pin the column schema so the frame has a 'Date' column even when the search
# returned nothing (otherwise the conversion below raises KeyError).
df_RD = pd.DataFrame(search_results,
                     columns=['Title', 'Author', 'Score', 'URL', 'Date'])

# modify date format: keep only the calendar-day part ("YYYY-MM-DD") of the
# timestamp string and parse it once into a datetime column.
df_RD['Date'] = pd.to_datetime(df_RD['Date'].str[:10], format='%Y-%m-%d')
df_RD = df_RD[['Date','Title']] # select needed columns

{'Title': 'Windows activation error 0xC004F211 Please help', 'Author': Redditor(name='Airtronik'), 'Score': 1, 'URL': 'https://www.reddit.com/r/microsoft/comments/1cqul0t/windows_activation_error_0xc004f211_please_help/', 'Date': '2024-05-13 08:50:53'}
{'Title': 'Inherited Onedrive Account ', 'Author': Redditor(name='sandgrubber'), 'Score': 2, 'URL': 'https://www.reddit.com/r/microsoft/comments/1cqr8bs/inherited_onedrive_account/', 'Date': '2024-05-13 04:58:33'}
{'Title': 'Phishing threshold increase ', 'Author': Redditor(name='Lonely_Panda4322'), 'Score': 1, 'URL': 'https://www.reddit.com/r/microsoft/comments/1cqr25j/phishing_threshold_increase/', 'Date': '2024-05-13 04:47:57'}
{'Title': 'What the hell is wrong with Microsoft bug reports.', 'Author': Redditor(name='KleeLovesGanyu'), 'Score': 0, 'URL': 'https://www.reddit.com/r/microsoft/comments/1cqqu9x/what_the_hell_is_wrong_with_microsoft_bug_reports/', 'Date': '2024-05-13 04:34:42'}
{'Title': 'Job Postings in Multiple States', 'Aut

In [4]:
# merge two files: stack the Reddit and Economic Times frames, coerce the
# combined 'Date' column to datetimes, and order the rows chronologically.
df_text = (
    pd.concat([df_RD, df_ET], axis = 0)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by='Date')
)

# Rows that repeat an earlier row exactly, and the de-duplicated result.
duplicate_rows = df_text.loc[df_text.duplicated()]
df_all = df_text.drop_duplicates()

# Name the output file after today's date (YYYY-MM-DD) and write the
# de-duplicated frame to CSV without the index column.
today_date = datetime.date.today().isoformat()
filename = f'df_all_{today_date}.csv'
df_all.to_csv(filename, index=False)