Sentiment Analysis Notebook

In [1]:
#importing neccessary libraries 

#web scraping libraries
from newspaper import Article
import newspaper
import pandas as pd
from newspaper import Article

#gettting the date
from bs4 import BeautifulSoup as bs
import requests

#sumarizer libraries
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

#sentiment score analysis. 
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class scraper():
    def __init__(self, url):
        self.url = url
        self.error_counter = 0
        self.df = None
    
    def scrape_news(self,):
        """
        Scrapes Tesla news from the given Yahoo Finance URL.

        Args:

        Returns:
            A pandas DataFrame with columns 'title', 'summary', and 'date'.
        """
        try:
            paper = newspaper.build(self.url)
            articles = paper.articles
        except:
            print("Invalid Link!")

        data = []
        for article in articles:
            try:   
                article.download()
                article.parse()
                text = article.text
                summary = summarize(text)
            except:
                text= ""
                summary = ""
                self.error_counter += 1

            data.append({'title': article.title,
                        'text': text,
                        'url': article.url,
                        'summary': summary})

        self.df = pd.DataFrame(data)
        return self.df
    
    def saver(self, name, mode='w', dataFrame=None):
        """
        saves the data to a file if called

        Args:
            name: what the name of the file should be.
            dataFrame: the dataFrame that operation will be done on
            mode: how you want to save? 'a', 'w', 'x'
            - 'w', truncate the file first.
            - 'x', exclusive creation, failing if the file already exists.
            - 'a', append to the end of file if it exists.
            

        Return:
            None
        """
        if not dataFrame:
            self.df.to_csv(f'{name}.csv', index=False, mode=mode)
        else:
            dataFrame.to_csv(f'{name}.csv', index=False, mode=mode)
        print('saved!')
        

def summarize(text):
    if type(text) == type(None):
        return ""
    # Prepare the parser and summarizer
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()

    # Set the number of sentences you want in the summary
    summary = summarizer(parser.document, 3)  # Summary with 3 sentences

    sentenceV = ""
    # Print the summary sentences
    for sentence in summary:
        sentenceV += str(sentence)
    
    return sentenceV

In [3]:
def getting_date_no1(df):
    '''
    Method number one of getting dates. The current method built into the code. 
        has some error. 
    
    Args:
        df: the data Frame.
    
    Returns:
        dataFrame. 
    
    '''
    error_counter = 0
    
    for i, url in enumerate(df['url']):
        try:
            article = Article(url)
            article.download()
            article.parse()
            df.at[i, 'date'] = article.publish_date
        except Exception as e:
            df.at[i, 'text'] = None
            df.at[i, 'date'] = None
            error_counter += 1
            # print('error occured', e)
    print(f"Error Count: {error_counter}")
    return df

In [4]:
def getting_date_no2(df, url):
    '''
    Method number two of getting dates for different urls.

    Args:
        df: pandas dataframe. the dataFrame that needs to be adjusted
        url: the url that the dataFrame belongs to.
    
    Returns:
        the dataFrame back
    '''
    r = requests.get(url)
    soup = bs(r.text, 'html.parser')

    dates = soup.select('div.mt-1.text-sm.text-faded.sm\\:order-1.sm\\:mt-0')

    all_dates = []

    for date in dates:
        all_dates.append(date['title'])
    
    df["date"] = pd.Series(all_dates)
    print(f"Unsuccessful operations: {df[df['date'].apply(type) == type(np.nan)].shape[0]}")
    return df

In [5]:
def bert_sentiment_analysis(df):
    tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
    model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone", num_labels=3)

    X = df['summary'].to_list()
    labels = {0:'neutral', 1:'positive',2:'negative'}

    sent_val = list()
    for x in X:
        inputs = tokenizer(x, return_tensors="pt", padding=True)
        with torch.no_grad():
            outputs = model(**inputs).logits
    
        probs = torch.nn.functional.softmax(outputs, dim=-1)
        val = labels[np.argmax(outputs.detach().numpy())]
        
        score_dict = {
            'neutral': probs[0][0].item(),
            'positive': probs[0][1].item(),
            'negative': probs[0][2].item()
        }
        
        sent_val.append(score_dict)
        
    neural_list = []
    positive_list = []
    negative_list = []

    for item in sent_val:
        neural_list.append(item['neutral'])
        positive_list.append(item['positive'])
        negative_list.append(item['negative'])

    df['neutral'] = neural_list
    df['positive'] = positive_list
    df['negative'] = negative_list

    return df

In [6]:
def analysis_on_berted_df(df):
    try: 
        positives = df['positive'].mean()
        negative = df['negative'].mean()
        neutral = df['neutral'].mean()
        return f"positive: {positives}, negative: {negative}, neutral: {neutral}"
    except:
        return f"You have to run it through bert_sentiment_analysis() first"
        

In [7]:
class do_everything():
    '''
    Does everything you need to do.

    Args:
        url: String. The url
        fileName= the file name of the csv that will be saved to
    
    '''
    def __init__(self, url, fileName='project') -> None:
        self.url = url
        self.your_scraper = scraper(url=url)
        #scrape the url
        self.your_scraper.scrape_news()
        self.df = self.your_scraper.df
        # print(self.your_scraper.df.head())
        print(self.df.head())
        #get the date method one
        self.df_date_1 = getting_date_no1(self.df)

        #get the date method two
        self.df_date_2 = getting_date_no2(self.df, url)

        #analyse with finbert:
        self.analysed = bert_sentiment_analysis(self.df)

        self.anlaysed_str = analysis_on_berted_df(self.analysed)

        #save to csv:
        self.your_scraper.saver(name=fileName,mode='w', dataFrame=self.analysed)

        print(self.anlaysed_str)
        print(self.analysed.head())
    

In [10]:
msft_news = do_everything(url='https://stockanalysis.com/stocks/msft/', fileName='microsft_news')

Empty DataFrame
Columns: []
Index: []


KeyError: 'url'

In [None]:
url = 'https://stockanalysis.com/stocks/aapl/'
your_scraper = scraper(url=url)
your_scraper.scrape_news()

In [16]:
import newspaper
paper = newspaper.build('https://stockanalysis.com/stocks/aapl/')
print(len(paper.articles))

0


In [20]:
def method_number_two_scraping():
    r = requests.get(url)
    soup = bs(r.text, 'html.parser')
    myBaddies = soup.select('gap-4.border-gray-300.bg-white.p-4 shadow.last\\:pb-1.last\\:shadow-none.dark\\:border-dark-600.dark\\:bg-dark-800.sm\\:border-b.sm\\:px-0.sm\\:shadow-none.sm\\:last\\:border-b-0.lg\\:gap-5.sm\\:grid.sm\\:grid-cols-news.sm\\:py-6')
    links = soup.select('a.text-default.hover\\:text-blue-brand_sharp.dark\\:text-neutral-300.dark\\:hover\\:text-blue-darklink')

    # Extract href attributes
    hrefs = [link['href'] for link in links if 'href' in link.attrs]
    print(hrefs)
    print(myBaddies)

In [None]:
r = requests.get(url)
soup = bs(r.text, 'html.parser')
containers = soup.select('div.gap-4.border-gray-300.bg-white.p-4.shadow.last\\:pb-1.last\\:shadow-none.dark\\:border-dark-600.dark\\:bg-dark-800.sm\\:border-b.sm\\:px-0.sm\\:shadow-none.sm\\:last\\:border-b-0.lg\\:gap-5.sm\\:grid.sm\\:grid-cols-news.sm\\:py-6')

# Extract href attributes
data = []
for container in containers:
    title = container.select('div.mt-1.text-sm.text-faded.sm\\:order-1.sm\\:mt-0')[0]['title']
    aTag = container.select('a.text-default.hover\\:text-blue-brand_sharp.dark\\:text-neutral-300.dark\\:hover\\:text-blue-darklink')[0]
    link = aTag['href']
    headline = aTag.text
    article = Article(link)
    try:
        # Download the article content
        article.download()

        # Parse the article to extract text and metadata
        article.parse()

        # Access the article text
        text = article.text
        summary = summarize(text)
    except Exception as e:
        text = ""
        print(f"An error occurred: {e}")
    data.append({
        'title': title,
        'link': link,
        'headline': headline,
        'text': text,
        'summary': summary
    })

    


scrapeResults = pd.DataFrame(data)
scrapeResults.head()

An error occurred: name 'article' is not defined
An error occurred: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.wsj.com/articles/apple-offers-100-million-investment-in-indonesia-to-lift-iphone-16-ban-67462dba?mod=rss_Technology on URL https://www.wsj.com/articles/apple-offers-100-million-investment-in-indonesia-to-lift-iphone-16-ban-67462dba?mod=rss_Technology
An error occurred: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.barrons.com/articles/qualcomm-stock-apple-amd-intel-chips-7c82508f on URL https://www.barrons.com/articles/qualcomm-stock-apple-amd-intel-chips-7c82508f
An error occurred: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/technology/apple-sends-100-mln-investment-proposal-build-plant-indonesia-2024-11-20/ on URL https://www.reuters.com/technology/apple-sends-100-mln-investment-proposal-build-plant-indonesia-2024-11-20/
An error occurred: Article

Unnamed: 0,title,link,headline,text
0,"Nov 21, 2024, 9:00 AM EST",https://www.prnewswire.com/news-releases/john-...,John Hancock Adds New Apple Watch Series 10 to...,
1,"Nov 21, 2024, 8:58 AM EST",https://www.cnbc.com/2024/11/21/cfpb-expands-o...,CFPB expands oversight of digital payments ser...,"BOSTON, Nov. 21, 2024 /PRNewswire/ - Today, Jo..."
2,"Nov 21, 2024, 6:20 AM EST",https://techxplore.com/news/2024-11-apple-urge...,Apple urges judge to dismiss US antitrust laws...,"Rohit Chopra, director of the CFPB, testifies ..."
3,"Nov 21, 2024, 1:25 AM EST",https://www.wsj.com/articles/apple-offers-100-...,Apple Offers $100 Million Investment in Indone...,This article has been reviewed according to Sc...
4,"Nov 20, 2024, 12:49 PM EST",https://www.pymnts.com/apple/2024/apple-seeks-...,Apple Seeks Dismissal of U.S. Smartphone Monop...,
