Code for preliminary testing of webscraping and sentiment analysis

In [21]:
%pip list

Package                       Version
----------------------------- ------------
aiobotocore                   2.4.2
aiofiles                      22.1.0
aiohttp                       3.8.3
aioitertools                  0.7.1
aiosignal                     1.2.0
aiosqlite                     0.18.0
alabaster                     0.7.12
anaconda-catalogs             0.2.0
anaconda-client               1.12.0
anaconda-navigator            2.4.2
anaconda-project              0.11.1
anyio                         3.5.0
appdirs                       1.4.4
applaunchservices             0.3.0
appnope                       0.1.2
appscript                     1.1.2
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arrow                         1.2.3
astroid                       2.14.2
astropy                       5.1
asttokens                     2.0.5
async-timeout                 4.0.2
atomicwrites                  1.4.0
attrs                         22.1.0
Automat     

Note: you may need to restart the kernel to use updated packages.


In [20]:
%pip install newspaper3k

Collecting newspaper3k
  Obtaining dependency information for newspaper3k from https://files.pythonhosted.org/packages/d7/b9/51afecb35bb61b188a4b44868001de348a0e8134b4dfa00ffc191567c4b9/newspaper3k-0.2.8-py3-none-any.whl.metadata
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Obtaining dependency information for feedparser>=5.2.1 from https://files.pythonhosted.org/packages/7c/d4/8c31aad9cc18f451c49f7f9cfb5799dadffc88177f7917bc90a66459b1d7/feedparser-6.0.11-py3-none-any.whl.metadata
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


In [59]:
from textblob import TextBlob
import nltk
from bs4 import BeautifulSoup as BS
from newspaper import Article
from textblob.sentiments import NaiveBayesAnalyzer, PatternAnalyzer
import requests as req
import pandas as pd
import numpy as np
from scipy import stats
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')
    
try:
    nltk.data.find('corpora/movie_reviews')
except LookupError:
    nltk.download('movie_reviews')



In [60]:
# for testing, we extract from existing data's news link articles
def get_news_links(file_name):
    df = pd.read_csv(file_name)
    url_list = df["article_url"].dropna().tolist()
    return url_list
        
    

In [61]:
def get_news_text(url):
    article = Article(url)
    try:
        article.download()
        article.parse()
        return article.text
    except Exception as error:
        return error

In [62]:
def get_sentiment(text):
    if not isinstance(text, str):
        return "Error"
    else:
        blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
        return blob.sentiment
    

In [45]:
url_list = get_news_links('dataset/news_maux/AAPL_main_2022-01-04_2022-01-06.csv')
for url in url_list:
    text = get_news_text(url)
    sentiment = get_sentiment(text)
    print(sentiment)

Sentiment(classification='pos', p_pos=0.9997272414111645, p_neg=0.00027275858882850636)
Error
Sentiment(classification='pos', p_pos=1.0, p_neg=4.965559666318901e-19)


In [63]:
# testing accuracy for TextBlob, using a directory of data
df_news = pd.read_csv('dataset/news/AAPL_main.csv')
df_rela = pd.read_csv('dataset/news/AAPL_relation.csv')


In [64]:
def combine_dataframes_with_time_range(main_df, relation_df, start_time, end_time):
    start_time = pd.to_datetime(start_time)
    end_time = pd.to_datetime(end_time)
    
    main_df['publish_time'] = pd.to_datetime(main_df['publish_time'])
    relation_df['time'] = pd.to_datetime(relation_df['time'])
    filtered_rel = relation_df[relation_df['ticker'] == relation_df['source_ticker']]
    
    merged_df = main_df.merge(filtered_rel, left_on='id', right_on='news_id', how='inner')
    
    filtered_df = merged_df
    
    time_filtered_df = filtered_df[(filtered_df['publish_time'] >= start_time) & (filtered_df['publish_time'] <= end_time)]
    
    result_df = time_filtered_df[['id', 'publish_time', 'article_url', 'sentiment']]
    
    return result_df

In [146]:
merged_df = combine_dataframes_with_time_range(df_news, df_rela, "2024-07-01", "2024-09-01")

In [147]:
print(merged_df)

                                                    id        publish_time  \
245  f9b58febeb4565a6dd607e3216decb281b237a1e76b9e9... 2024-08-31 11:15:00   
246  9cc8d83bae696e5f0565a1a0391596b06171e5ed3e8402... 2024-08-31 10:24:00   
247  f83a6765bf7b69e13cdef5999c8e5c99a26598acdce856... 2024-08-31 10:10:00   
248  42cf4288fdeddba7d0846f4bdcdb97946c96f0849c11d0... 2024-08-31 07:01:00   
249  d3e51924ac19db5ffb2f24d1621a9a213f30f9ed3aef5c... 2024-08-30 14:14:29   
..                                                 ...                 ...   
658  1fcac93fed0e133da96bfb1c8953aac2f52188cd442308... 2024-07-02 23:30:49   
659  860ffff9a7f8e6ac55b2ddd19bd169ecd1a9f503e9f1dc... 2024-07-02 21:05:40   
660  4d843941162f622729de3d2276012094d587ca8dfb916f... 2024-07-02 20:15:43   
661  1d7b72c113d7c86046e8832efab111ddbc673a08a41547... 2024-07-02 20:14:53   
662  61d80aa431470bc0a67ae461d70964cc7827fa8a37eefd... 2024-07-02 19:16:19   

                                           article_url sentimen

In [108]:
def compare_sentiments_blob(merged_df):
    total_result = 0
    correct_result = 0
    false_positive = 0
    false_negative = 0

    for row in merged_df.itertuples():
        if total_result % 10 == 0:
            print("processed: " + str(total_result))
        if row.sentiment != "neutral":
            url = row.article_url
            text = get_news_text(url)
            # print(text)
            sentiment = get_sentiment(text)
            if sentiment != "Error":
    #             print(sentiment)
    #             print(row.sentiment)
                blob_result = sentiment.classification
                old = row.sentiment
                if blob_result == "pos" and old == "positive":
                    correct_result += 1
                elif blob_result == "pos" and old == "negative":
                    false_positive += 1
                elif blob_result == "neg" and old == "positive":
                    false_negative += 1
                elif blob_result == "neg" and old == "negative":
                    correct_result += 1
                total_result += 1
                
    return total_result, correct_result, false_positive, false_negative
        
    

In [109]:
total_result, correct_result, false_positive, false_negative = compare_sentiments_blob(merged_df)

processed: 0
processed: 0
processed: 10
processed: 10
processed: 20
processed: 20
processed: 20
processed: 20
processed: 20
processed: 20
processed: 20
processed: 30
processed: 30
processed: 40
processed: 40


In [110]:
print(correct_result / total_result)
print(false_positive / total_result)
print(false_negative / total_result)

0.9512195121951219
0.04878048780487805
0.0


In [111]:
counts = merged_df["sentiment"].value_counts()

print("non-neutrals: " + str(counts.get('positive') + counts.get('negative')))
print("positives: " + str(counts.get('positive')))
print("negatives: " + str(counts.get('negative')))

non-neutrals: 65
positives: 60
negatives: 5


Conclusion: bad performance, everything is positive

In [149]:
# testing accuracy for Vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [153]:
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_vader(text):
    if not isinstance(text, str):
        return "Error"
    else:
        scores = analyzer.polarity_scores(text)
        return scores
    
# def compare_sentiments_vader(merged_df):
#     total_result = 0
#     correct_result = 0
#     compound_correct_result = 0
#     false_positive = 0
#     false_negative = 0
#     false_neutral = 0

#     for row in merged_df.itertuples():
#         if total_result % 10 == 0:
#             print("processed: " + str(total_result))
            
#         url = row.article_url
#         text = get_news_text(url)
#         # print(text)
#         sentiment = get_sentiment_vader(text)
#         if sentiment != "Error":
#             old = row.sentiment
            
#             # check compound scores
#             if sentiment["compound"] > 0.1 and old == "positive":
#                 compound_correct_result += 1
#             elif sentiment["compound"] < 0.1 and old == "negative":
#                 compound_correct_result += 1
#             elif old == "neutral":
#                 compound_correct_result += 1
                
            
#             del sentiment["compound"]
            
#             blob_result = max(sentiment, key=sentiment.get)
            
#             # check individual scores
#             if blob_result == "pos" and old == "positive":
#                 correct_result += 1
#             elif blob_result == "neg" and old == "negative":
#                 correct_result += 1
#             elif blob_result == "neu" and old == "neutral":
#                 correct_result += 1
                
            
#             total_result += 1
                
#     return total_result, correct_result, compound_correct_result

def compare_sentiments_vader(merged_df):
    sentiment_matrix = {
        "positive": {"positive": 0, "neutral": 0, "negative": 0},
        "neutral": {"positive": 0, "neutral": 0, "negative": 0},
        "negative": {"positive": 0, "neutral": 0, "negative": 0},
    }
    
    i = 0

    for row in merged_df.itertuples():
        if i % 10 == 0:
            print("Processing: " + str(i))
        url = row.article_url
        text = get_news_text(url)
        
        sentiment = get_sentiment_vader(text)
        if sentiment != "Error":
            old = row.sentiment
            
            if sentiment["compound"] > 0.1:
                predicted = "positive"
            elif sentiment["compound"] < -0.1:
                predicted = "negative"
            else:
                predicted = "neutral"

            # Increment the corresponding matrix cell
            if old in sentiment_matrix and predicted in sentiment_matrix:
                sentiment_matrix[predicted][old] += 1
        i += 1

    result_matrix = pd.DataFrame(sentiment_matrix).T  

    # Add explanations for rows and columns
    result_matrix["Explanation"] = [
        "Predicted as positive",
        "Predicted as neutral",
        "Predicted as negative",
    ]
    explanation_row = {
        "positive": "Labeled positive from API",
        "neutral": "Labeled neutral from API",
        "negative": "Labeled negative from API",
        "Explanation": "Column description",
    }
    result_matrix = result_matrix.append(explanation_row, ignore_index=True)

    return result_matrix

In [157]:
def add_explanations_to_matrix(result_matrix):
    result_matrix["Explanation"] = [
        "Predicted as positive",
        "Predicted as neutral",
        "Predicted as negative",
        "Column description",  
    ]

    explanation_row = {
        "positive": "API labeled pos",
        "neutral": "API labeled neu",
        "negative": "API labeled neg",
        "Explanation": "Column description",
    }

    if not (result_matrix.iloc[-1] == pd.Series(explanation_row)).all():
        result_matrix = result_matrix[:-1].append(explanation_row, ignore_index=True)
    
    return result_matrix

In [151]:
result_matrix = compare_sentiments_vader(merged_df)


Processing: 0
Processing: 10
Processing: 20
Processing: 30
Processing: 40
Processing: 50
Processing: 60
Processing: 70
Processing: 80
Processing: 90
Processing: 100
Processing: 110
Processing: 120
Processing: 130
Processing: 140
Processing: 150
Processing: 160
Processing: 170
Processing: 180
Processing: 190
Processing: 200
Processing: 210
Processing: 220
Processing: 230
Processing: 240
Processing: 250
Processing: 260
Processing: 270
Processing: 280
Processing: 290
Processing: 300
Processing: 310
Processing: 320
Processing: 330
Processing: 340
Processing: 350
Processing: 360
Processing: 370
Processing: 380
Processing: 390
Processing: 400
Processing: 410
                                positive  \
0                                    175   
1                                      0   
2                                      5   
3  Texts labeled positive in the dataset   

                                neutral  \
0                                    93   
1                               

  result_matrix = result_matrix.append(explanation_row, ignore_index=True)


In [158]:
result_matrix = add_explanations_to_matrix(result_matrix)
print(result_matrix)

          positive          neutral         negative            Explanation
0              175               93               26  Predicted as positive
1                0                1                1   Predicted as neutral
2                5                5                5  Predicted as negative
3  API labeled pos  API labeled neu  API labeled neg     Column description


  result_matrix = result_matrix[:-1].append(explanation_row, ignore_index=True)


In [152]:
counts = merged_df["sentiment"].value_counts()

print("neutrals: " + str(counts.get('neutral')))
print("positives: " + str(counts.get('positive')))
print("negatives: " + str(counts.get('negative')))

neutrals: 128
positives: 241
negatives: 47


Conclusion: vader is terrible as well