<a href="https://colab.research.google.com/github/Diiamon/Election-News-Article-Exploration/blob/main/capstone_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Regex and Imports:

This section includes import statements for various libraries such as **re, requests, PIL, urllib.request, gensim, pandas, numpy, matplotlib, seaborn, wordcloud, nltk, and scipy.stats**. These libraries are used for regular expressions, web connections, image processing, text corpora, data manipulation, visualization, natural language processing, and statistical analysis.

In [1]:
## Regex
import re

##
import requests # Allows us to make a connection to an external, internet hosted location (eg: Reddit)
from PIL import Image # Allows for image processing
import urllib.request # same as reques
from gensim.corpora import Dictionary # Module that hosts a lof ot English words

## The big 4 + wordclod
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud # create super pretty wordclouds

## All around useful for NLP
import nltk
from nltk.corpus import stopwords # A pre-determined set of stopwords
from nltk.tokenize import word_tokenize # Tokenise our terms!
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from scipy.stats import f_oneway
from scipy.stats import ttest_ind

#Modelling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [3]:
# Download some stopwords and punctuation
#Once it is downloaded once, it does not need to be done again

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
lemmatizer = WordNetLemmatizer() #lemmatisation
l_stemmer = LancasterStemmer() # stemmer
p_stemmer = PorterStemmer() #porter stemmer

## Data Cleaning

The code defines functions to clean text data by removing URLs, email addresses, HTML tags, special characters, punctuation, numbers, emojis, and short words. It also includes tokenization, which splits text into individual words or tokens.

In [5]:
article_table = pd.read_csv('article_table_f')

In [6]:
article_table

Unnamed: 0,Article_id,Article,Title,Published_date_id,Source_id
0,52,FOUR independent candidates are standing to fi...,Four candidates for what is thought to be town...,7,6
1,54,First Minister John Swinney has said The SNP w...,SNP to include social tariff on energy and bro...,7,1
2,164,Reform UK insists its plans are “not just anot...,"Do Reform UK’s election claims on tax, immigra...",7,8
3,390,"“If you want politics as pantomime,” Sir Keir ...",Who is Jovan Owusu-Nepaul? Labour’s general el...,7,9
4,242,Shadow health secretary Wes Streeting has urge...,Streeting warns against complacency and giving...,6,4
...,...,...,...,...,...
401,135,Nigel Farage and Reform have made the biggest ...,Farage amasses 39 billion video views as Refor...,21,9
402,83,It’s set to be one of the most dramatic and co...,The Daily T: Your definitive election night guide,21,11
403,308,The Government’s antisemitism adviser has cond...,Tories face backlash over attack on Starmer’s ...,21,4
404,362,"WITH the General Election just days away, cand...",See the key points in the SNP's manifesto ahea...,21,10


### Cleaning Functions

In [7]:
article_table['cleaned_article'] = article_table['Article'].apply(str.lower)
article_table['cleaned_title'] = article_table['Title'].apply(str.lower)

In [8]:
def regex_clean(txt, regex):
    """Replace any text matching the regex

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove matches
    regex : string
        A text string of the regex pattern you want to match

    Returns
    -------
    The same txt string with the matches removes
    """

    return " ".join(re.sub(regex, "", txt).split()) # first split to have it in ['I' ,'am' ,'here'] and the join it with a space between the full texts.

def remove_emoji(string):
    emoji_pattern = re.compile("["
                          "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F700-\U0001F77F"  # alchemical symbols
                           "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                           "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                           "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                           "\U0001FA00-\U0001FA6F"  # Chess Symbols
                           "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                           "\U00002702-\U000027B0"  # Dingbats
                           "\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [9]:
def cleaning_process(df):
  # define a few regex patterns to clean! - This is just a cleaning process (not the only one)
  regex_to_clean = [r'https?://\S+|www\.\S+', # removes web URLs starting with http://, https://, or www..
                    r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', # removes email addresses.
                    r'<[^>]*>', # removes HTML tags.
                    r'[^a-zA-Z\s]', # removes special characters, punctuation, and numbers, leaving only letters and spaces.
                    r'[@#]\w+', #  removes hashtags and mentions typically found in social media posts.
                    r'\d+', # removes all numeric characters.
                    r'\n|\r|\t', #  removes line breaks and tabs.
                    r'\b\w*\d\w*\b', # removes words that contain digits, which often include product codes or other non-relevant text.
                    r'https?://\S+\?(\S+)', # removes URLs that include query parameters.
                    r'\b\w{1,2}\b', # removes short words with fewer than three characters (optional, as it might remove relevant short words like "an", "is", etc.).
                    r'\[\d+\]|\[citation needed\]', # removes common citation formats like [1], [citation needed].
                    r"[!\"#$%&'()*+,\-./:;<=>?@[\\]^_`{|}~]{2,}", # removes repeated punctuation marks.
                    r'[&%$#@]' # removes artifacts like &, %, etc., which might remain after other cleaning steps.
                  ] #need to add more regex to fully clean

  #step 2.2: Apply the regex clean
  for reg in regex_to_clean:
      df['cleaned_article'] = df['cleaned_article'].apply(regex_clean, regex=reg)
      df['cleaned_title'] = df['cleaned_title'].apply(regex_clean, regex=reg)

  #step 3: apply the emoji removal as well
  df['cleaned_article'] = df['cleaned_article'].apply(remove_emoji)
  df['cleaned_title'] = df['cleaned_title'].apply(remove_emoji)

  return df

In [10]:
article = cleaning_process(article_table)

In [11]:
article

Unnamed: 0,Article_id,Article,Title,Published_date_id,Source_id,cleaned_article,cleaned_title
0,52,FOUR independent candidates are standing to fi...,Four candidates for what is thought to be town...,7,6,four independent candidates are standing fill ...,four candidates for what thought towns first e...
1,54,First Minister John Swinney has said The SNP w...,SNP to include social tariff on energy and bro...,7,1,first minister john swinney has said the snp w...,snp include social tariff energy and broadband...
2,164,Reform UK insists its plans are “not just anot...,"Do Reform UK’s election claims on tax, immigra...",7,8,reform insists its plans are not just another ...,reform uks election claims tax immigration and...
3,390,"“If you want politics as pantomime,” Sir Keir ...",Who is Jovan Owusu-Nepaul? Labour’s general el...,7,9,you want politics pantomime sir keir starmer s...,who jovan owusunepaul labours general election...
4,242,Shadow health secretary Wes Streeting has urge...,Streeting warns against complacency and giving...,6,4,shadow health secretary wes streeting has urge...,streeting warns against complacency and giving...
...,...,...,...,...,...,...,...
401,135,Nigel Farage and Reform have made the biggest ...,Farage amasses 39 billion video views as Refor...,21,9,nigel farage and reform have made the biggest ...,farage amasses billion video views reform domi...
402,83,It’s set to be one of the most dramatic and co...,The Daily T: Your definitive election night guide,21,11,its set one the most dramatic and consequentia...,the daily your definitive election night guide
403,308,The Government’s antisemitism adviser has cond...,Tories face backlash over attack on Starmer’s ...,21,4,the governments antisemitism adviser has conde...,tories face backlash over attack starmers frid...
404,362,"WITH the General Election just days away, cand...",See the key points in the SNP's manifesto ahea...,21,10,with the general election just days away candi...,see the key points the snps manifesto ahead th...


### Tokenisation

In [12]:
def tokenisation(df):
  df['article_tokens'] = df['cleaned_article'].apply(word_tokenize)
  df['title_tokens'] = df['cleaned_title'].apply(word_tokenize)

  stpwrd = nltk.corpus.stopwords.words('english')
  punc = '!"#$%&()*+, -./:;<=>?@[\]^_`{|}~”“\''
  #generate the list of punctiontion
  punc = [x for x in punc]

  ## extend the list of stopwords to include punctuations as well
  #we'll remove them both anyway
  stpwrd.extend(punc)

  ##apply the cleaning
  df['article_tokens'] = df['article_tokens'].apply(lambda x:[words for words in x if words not in stpwrd])
  df['title_tokens'] = df['title_tokens'].apply(lambda x:[words for words in x if words not in stpwrd])

  ## can get rid of all the terms that have fewer than 2 characters
  df['article_tokens'] = df['article_tokens'].apply(lambda document : [token for token in document if len(token)>2])
  df['title_tokens'] = df['title_tokens'].apply(lambda document : [token for token in document if len(token)>2])

  # Apply stemming and lemmatization
  #df['tokens_stem'] = df['tokens'].apply(lambda tokens: [l_stemmer.stem(word) for word in tokens])
  df['article_tokens_lem'] = df['article_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])
  df['title_tokens_lem'] = df['title_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])
  #df['tokens_stem_2'] = df['tokens'].apply(lambda tokens: [p_stemmer.stem(word) for word in tokens])

  return df

In [13]:
test = article.head(5).copy()

In [14]:
t_process = tokenisation(test)

In [15]:
t_process

Unnamed: 0,Article_id,Article,Title,Published_date_id,Source_id,cleaned_article,cleaned_title,article_tokens,title_tokens,article_tokens_lem,title_tokens_lem
0,52,FOUR independent candidates are standing to fi...,Four candidates for what is thought to be town...,7,6,four independent candidates are standing fill ...,four candidates for what thought towns first e...,"[four, independent, candidates, standing, fill...","[four, candidates, thought, towns, first, elec...","[four, independent, candidate, standing, fill,...","[four, candidate, thought, town, first, electi..."
1,54,First Minister John Swinney has said The SNP w...,SNP to include social tariff on energy and bro...,7,1,first minister john swinney has said the snp w...,snp include social tariff energy and broadband...,"[first, minister, john, swinney, said, snp, in...","[snp, include, social, tariff, energy, broadba...","[first, minister, john, swinney, said, snp, in...","[snp, include, social, tariff, energy, broadba..."
2,164,Reform UK insists its plans are “not just anot...,"Do Reform UK’s election claims on tax, immigra...",7,8,reform insists its plans are not just another ...,reform uks election claims tax immigration and...,"[reform, insists, plans, another, party, manif...","[reform, uks, election, claims, tax, immigrati...","[reform, insists, plan, another, party, manife...","[reform, uk, election, claim, tax, immigration..."
3,390,"“If you want politics as pantomime,” Sir Keir ...",Who is Jovan Owusu-Nepaul? Labour’s general el...,7,9,you want politics pantomime sir keir starmer s...,who jovan owusunepaul labours general election...,"[want, politics, pantomime, sir, keir, starmer...","[jovan, owusunepaul, labours, general, electio...","[want, politics, pantomime, sir, keir, starmer...","[jovan, owusunepaul, labour, general, election..."
4,242,Shadow health secretary Wes Streeting has urge...,Streeting warns against complacency and giving...,6,4,shadow health secretary wes streeting has urge...,streeting warns against complacency and giving...,"[shadow, health, secretary, wes, streeting, ur...","[streeting, warns, complacency, giving, matche...","[shadow, health, secretary, wes, streeting, ur...","[streeting, warns, complacency, giving, match,..."


## Word Frequency Analysis

This part of the code creates a dictionary of words from the text data and calculates their frequency. It also handles the creation of a DataFrame that lists words along with their frequency in articles and titles.

In [16]:
t_process.columns

Index(['Article_id', 'Article', 'Title', 'Published_date_id', 'Source_id',
       'cleaned_article', 'cleaned_title', 'article_tokens', 'title_tokens',
       'article_tokens_lem', 'title_tokens_lem'],
      dtype='object')

In [17]:
def word2(df):
  words = Dictionary(documents = df['tokens_lem'])

  ##
  clean_dictionary = {}

  ##
  for k,v in words.cfs.items():
      if v>1:
          clean_dictionary[words[k]] = v

  return clean_dictionary

In [18]:
# words = word(t_process)
# words_2 = list(words.keys())

In [19]:
tokens = tokenisation(article)

In [20]:
tokens.columns

Index(['Article_id', 'Article', 'Title', 'Published_date_id', 'Source_id',
       'cleaned_article', 'cleaned_title', 'article_tokens', 'title_tokens',
       'article_tokens_lem', 'title_tokens_lem'],
      dtype='object')

### Word Frequency Table

In [21]:
import pandas as pd
from gensim.corpora import Dictionary
from collections import defaultdict

def word(df):
    # Create dictionaries for article and title tokens
    article_words = Dictionary(documents=df['article_tokens_lem'])
    title_words = Dictionary(documents=df['title_tokens_lem'])

    # Dictionaries to store word frequency and article IDs for both articles and titles
    word_info = defaultdict(lambda: {'article_freq': 0, 'title_freq': 0, 'article_ids': set()})

    # Populate the word_info dictionary for article tokens
    for index, tokens in enumerate(df['article_tokens_lem']):
        article_id = df.loc[index, 'Article_id']
        for word in tokens:
            if article_words.token2id.get(word) is not None:  # Ensure the word is in the dictionary
                word_info[word]['article_freq'] += 1
                word_info[word]['article_ids'].add(article_id)

    # Populate the word_info dictionary for title tokens
    for index, tokens in enumerate(df['title_tokens_lem']):
        for word in tokens:
            if title_words.token2id.get(word) is not None:  # Ensure the word is in the dictionary
                word_info[word]['title_freq'] += 1

    # Create a list for DataFrame construction
    data = []
    for word, info in word_info.items():
        if info['article_freq'] > 0 or info['title_freq'] > 0:  # Only include words with frequency > 0
            data.append([
                word,
                info['article_freq'],
                info['title_freq'],
                list(info['article_ids'])
            ])

    # Convert the data into a DataFrame
    word_df = pd.DataFrame(data, columns=['Word', 'Article_Frequency', 'Title_Frequency', 'Article_IDs'])
    return word_df

# Assuming you have a DataFrame named df
word_table = word(article)


In [22]:
word_table.head()

Unnamed: 0,Word,Article_Frequency,Title_Frequency,Article_IDs
0,four,158,2,"[20, 45, 47, 48, 49, 50, 51, 52, 58, 60, 64, 6..."
1,independent,267,2,"[1, 2, 12, 13, 17, 19, 20, 32, 38, 39, 40, 42,..."
2,candidate,975,30,"[0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 16, 19..."
3,standing,211,5,"[1, 2, 3, 5, 7, 8, 9, 11, 13, 16, 21, 22, 28, ..."
4,fill,22,0,"[258, 259, 265, 143, 274, 275, 297, 169, 176, ..."


In [23]:
word_table['word_id'] = word_table['Word'].astype('category').cat.codes

In [24]:
word_table.head()

Unnamed: 0,Word,Article_Frequency,Title_Frequency,Article_IDs,word_id
0,four,158,2,"[20, 45, 47, 48, 49, 50, 51, 52, 58, 60, 64, 6...",8424
1,independent,267,2,"[1, 2, 12, 13, 17, 19, 20, 32, 38, 39, 40, 42,...",10591
2,candidate,975,30,"[0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 16, 19...",3027
3,standing,211,5,"[1, 2, 3, 5, 7, 8, 9, 11, 13, 16, 21, 22, 28, ...",20359
4,fill,22,0,"[258, 259, 265, 143, 274, 275, 297, 169, 176, ...",8053


## Topic Modeling

The code here prepares a corpus for topic modeling using the LDA (Latent Dirichlet Allocation) algorithm. It trains the model to identify topics within the text and associates words with these topics.

In [25]:
import pandas as pd
from gensim import corpora, models
from collections import defaultdict

# Step 1: Prepare the Corpus
all_tokens_lem = article['article_tokens_lem'].tolist() + article['title_tokens_lem'].tolist()
dictionary = corpora.Dictionary(all_tokens_lem)
corpus = [dictionary.doc2bow(text) for text in all_tokens_lem]

# Step 2: Train the LDA Model
num_topics = 10
lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics, passes=15, random_state=42)

In [26]:
# Step 3: Extract Topic Information
word_topic_info = defaultdict(lambda: {'topics': [], 'topic_freqs': []})
for word_id in dictionary.keys():
    word = dictionary[word_id]
    word_topics = lda_model.get_term_topics(word_id, minimum_probability=0.0)
    for topic_id, freq in word_topics:
        word_topic_info[word]['topics'].append(topic_id)
        word_topic_info[word]['topic_freqs'].append(freq)

# # Create the initial DataFrame with word frequency
# word_table = word(article)
# word_table['word_id'] = word_table['Word'].astype('category').cat.codes

In [27]:
# Step 4: Integrate with DataFrame
word_table['Topics'] = word_table['Word'].map(lambda w: word_topic_info[w]['topics'] if w in word_topic_info else [])
word_table['Topic_Frequencies'] = word_table['Word'].map(lambda w: word_topic_info[w]['topic_freqs'] if w in word_topic_info else [])

# Display the DataFrame
word_table.head()

Unnamed: 0,Word,Article_Frequency,Title_Frequency,Article_IDs,word_id,Topics,Topic_Frequencies
0,four,158,2,"[20, 45, 47, 48, 49, 50, 51, 52, 58, 60, 64, 6...",8424,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]","[0.00021399025, 0.0012257358, 0.00070310646, 0..."
1,independent,267,2,"[1, 2, 12, 13, 17, 19, 20, 32, 38, 39, 40, 42,...",10591,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]","[0.0023103475, 0.00015431244, 0.00091897085, 0..."
2,candidate,975,30,"[0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 16, 19...",3027,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]","[0.009531198, 0.00069500046, 0.005038194, 0.00..."
3,standing,211,5,"[1, 2, 3, 5, 7, 8, 9, 11, 13, 16, 21, 22, 28, ...",20359,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]","[0.0011134244, 0.00044080152, 0.0011909573, 0...."
4,fill,22,0,"[258, 259, 265, 143, 274, 275, 297, 169, 176, ...",8053,"[0, 2, 3, 4, 5, 7, 8]","[2.1393156e-05, 3.0331346e-05, 3.7102964e-05, ..."


In [28]:
word_table.columns

Index(['Word', 'Article_Frequency', 'Title_Frequency', 'Article_IDs',
       'word_id', 'Topics', 'Topic_Frequencies'],
      dtype='object')

### Saving Relevant Tables

In [29]:
word_table.to_csv('Topic_names_test', index=False)
Article_word = word_table[['word_id', 'Article_IDs']]
Article_word = Article_word.explode('Article_IDs')
Article_word.to_csv('Article_word_f', index=False)

In [30]:
import pandas as pd

# Assuming `word_table` DataFrame already exists with 'word_id', 'Topics', and 'Topic_Frequencies' columns

# Step 1: Extract Topics and Frequencies
word_topics_data = []

for _, row in word_table.iterrows():
    word_id = row['word_id']
    topics = row['Topics']
    freqs = row['Topic_Frequencies']

    for topic, freq in zip(topics, freqs):
        word_topics_data.append([word_id, topic, freq])

# Step 2: Create the DataFrame
word_topic_df = pd.DataFrame(word_topics_data, columns=['word_id', 'Topic', 'Topic_Frequency'])

# Step 3: (Optional) Merge with word information if necessary
# If you need to include the word itself for better readability or any other details, you can merge this DataFrame with the original `word_table`.
#word_topic_df = word_topic_df.merge(word_table[['word_id', 'Word']], on='word_id', how='left')

# Display the DataFrame
word_topic_df.head()


Unnamed: 0,word_id,Topic,Topic_Frequency
0,8424,0,0.000214
1,8424,1,0.001226
2,8424,2,0.000703
3,8424,3,0.000152
4,8424,4,0.00027


In [31]:
word_topic_df.to_csv('word_topic_f', index=False)

In [32]:
word_table = word_table[['word_id', 'Word', 'Article_Frequency', 'Title_Frequency']]
word_table.to_csv('word_table_f', index=False)

### Possible Topic labelling :

- Topic 0: Political Terms
- Topic 1: Election Procedures
- Topic 2: Campaign Strategies
- Topic 3: Voting Systems
- Topic 4: Candidate Profiles
- Topic 5: Electoral Regulations
- Topic 6: Political Parties
- Topic 7: Election Outcomes
- Topic 8: Voter Demographics
- Topic 9: Media Coverage

## Sentiment Analysis

This section includes code for sentiment analysis using rule-based methods like VADER from nltk and machine learning approaches like TextBlob. It also demonstrates how to use pre-trained BERT and RoBERTa models for sentiment analysis.

In [33]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch

# Function to calculate sentiment using BERT
def sentiment_bert(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.logits.softmax(dim=-1).squeeze().tolist()

# Function to calculate sentiment using RoBERTa
def sentiment_roberta(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.logits.softmax(dim=-1).squeeze().tolist()

# Load pre-trained BERT model and tokenizer
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained RoBERTa model and tokenizer
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [34]:
def sentiment(df):

  # Ensure 'tokens_lem' is a string for sentiment analysis
  df['article_tokens_lem_text'] = df['article_tokens_lem'].apply(lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else tokens)
  df['title_tokens_lem_text'] = df['title_tokens_lem'].apply(lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else tokens)

  # Rule-based sentiment analysis with NLTK's Vader
  sid = SentimentIntensityAnalyzer()
  df['article_sentiment_vader'] = df['article_tokens_lem_text'].apply(lambda x: sid.polarity_scores(x)['compound'])
  df['title_sentiment_vader'] = df['title_tokens_lem_text'].apply(lambda x: sid.polarity_scores(x)['compound'])

  # Machine learning sentiment analysis with TextBlob
  df['article_sentiment_textblob'] = df['article_tokens_lem_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
  df['title_sentiment_textblob'] = df['title_tokens_lem_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

  # Combine rule-based and machine learning results
  df['article_sentiment_hybrid'] = df['article_sentiment_vader'] * 0.5 + df['article_sentiment_textblob'] * 0.5
  df['title_sentiment_hybrid'] = df['title_sentiment_vader'] * 0.5 + df['title_sentiment_textblob'] * 0.5

  # BERT sentiment analysis
  df['article_sentiment_bert_-'] = df['article_tokens_lem_text'].apply(lambda x: sentiment_bert(x, bert_model, bert_tokenizer)[0])
  df['article_sentiment_bert_+'] = df['article_tokens_lem_text'].apply(lambda x: sentiment_bert(x, bert_model, bert_tokenizer)[1])

  df['title_sentiment_bert_-'] = df['title_tokens_lem_text'].apply(lambda x: sentiment_bert(x, bert_model, bert_tokenizer)[0])
  df['title_sentiment_bert_+'] = df['title_tokens_lem_text'].apply(lambda x: sentiment_bert(x, bert_model, bert_tokenizer)[1])

  # RoBERTa sentiment analysis
  df['article_sentiment_roberta_+'] = df['article_tokens_lem_text'].apply(lambda x: sentiment_roberta(x, roberta_model, roberta_tokenizer)[0])
  df['article_sentiment_roberta_-'] = df['article_tokens_lem_text'].apply(lambda x: sentiment_roberta(x, roberta_model, roberta_tokenizer)[1])

  df['title_sentiment_roberta_+'] = df['title_tokens_lem_text'].apply(lambda x: sentiment_roberta(x, roberta_model, roberta_tokenizer)[0])
  df['title_sentiment_roberta_-'] = df['title_tokens_lem_text'].apply(lambda x: sentiment_roberta(x, roberta_model, roberta_tokenizer)[1])

  # df results
  df = df[['Article_id',	'Article', 'Title',	'Published_date_id',	'Source_id', 'article_tokens_lem', 'article_tokens_lem_text', 'title_tokens_lem', 'title_tokens_lem_text',
            'article_sentiment_vader',
            'title_sentiment_vader',
           'article_sentiment_textblob',
           'title_sentiment_textblob',
           'article_sentiment_hybrid',
           'title_sentiment_hybrid',
           'article_sentiment_bert_-',
           'article_sentiment_bert_+',
           'title_sentiment_bert_-',
           'title_sentiment_bert_+',
           'article_sentiment_roberta_-',
           'article_sentiment_roberta_+',
           'title_sentiment_roberta_-',
           'title_sentiment_roberta_+'
           ]]

  return df

In [35]:
# Sample data
data = {
    'Article_id': [1, 2, 3, 4, 5],
    'Title': [
        'Amazing Product Launch',
        'Disappointing Earnings Report',
        'Neutral Outlook on Economy',
        'Exciting Developments in Tech',
        'Concerns Over Market Stability'
    ],
    'Article': [
        'The company has launched an amazing new product that is receiving excellent reviews from customers.',
        'The company’s earnings report was disappointing, falling short of market expectations and resulting in a stock price drop.',
        'Experts have given a neutral outlook on the economy, with no major changes expected in the coming months.',
        'There are exciting developments in the tech industry, with new innovations leading to positive market reactions.',
        'There are growing concerns over market stability due to recent geopolitical tensions and economic uncertainties.'
    ],
    'Sentiment': ['Positive', 'Negative', 'Neutral', 'Positive', 'Negative']
}

# Create DataFrame
df = pd.DataFrame(data)

sid = SentimentIntensityAnalyzer()
df['article_sentiment_bert_-'] = df['Article'].apply(lambda x: sentiment_bert(x, bert_model, bert_tokenizer)[0])
df['article_sentiment_bert_+'] = df['Article'].apply(lambda x: sentiment_bert(x, bert_model, bert_tokenizer)[1])
df['article_sentiment_roberta_-'] = df['Article'].apply(lambda x: sentiment_roberta(x, roberta_model, roberta_tokenizer)[0])
df['article_sentiment_roberta_+'] = df['Article'].apply(lambda x: sentiment_roberta(x, roberta_model, roberta_tokenizer)[1])
df['article_sentiment_vader'] = df['Article'].apply(lambda x: sid.polarity_scores(x)['compound'])
df['article_sentiment_textblob'] = df['Article'].apply(lambda x: TextBlob(x).sentiment.polarity)

df

# From this test:
# BERT = [-, +]
# ROBERTA = [+, -]

Unnamed: 0,Article_id,Title,Article,Sentiment,article_sentiment_bert_-,article_sentiment_bert_+,article_sentiment_roberta_-,article_sentiment_roberta_+,article_sentiment_vader,article_sentiment_textblob
0,1,Amazing Product Launch,The company has launched an amazing new produc...,Positive,0.680961,0.319039,0.547899,0.452101,0.8402,0.578788
1,2,Disappointing Earnings Report,The company’s earnings report was disappointin...,Negative,0.626007,0.373993,0.551243,0.448757,-0.7096,-0.3
2,3,Neutral Outlook on Economy,Experts have given a neutral outlook on the ec...,Neutral,0.695088,0.304912,0.550142,0.449858,-0.296,-0.065625
3,4,Exciting Developments in Tech,There are exciting developments in the tech in...,Positive,0.718966,0.281034,0.546576,0.453424,0.7783,0.221212
4,5,Concerns Over Market Stability,There are growing concerns over market stabili...,Negative,0.623843,0.376157,0.54849,0.451509,-0.5267,0.025


In [36]:
#tokens = tokenisation(article)
article_table_s = sentiment(tokens)
article_table_s.to_csv('article_table_s_first', index=False)

In [50]:
article_table_s = pd.read_csv('article_table_s_first')

In [51]:
article_table_s.head()

Unnamed: 0,Article_id,Article,Title,Published_date_id,Source_id,article_tokens_lem,article_tokens_lem_text,title_tokens_lem,title_tokens_lem_text,article_sentiment_vader,...,article_sentiment_hybrid,title_sentiment_hybrid,article_sentiment_bert_-,article_sentiment_bert_+,title_sentiment_bert_-,title_sentiment_bert_+,article_sentiment_roberta_-,article_sentiment_roberta_+,title_sentiment_roberta_-,title_sentiment_roberta_+
0,52,FOUR independent candidates are standing to fi...,Four candidates for what is thought to be town...,7,6,"['four', 'independent', 'candidate', 'standing...",four independent candidate standing fill two v...,"['four', 'candidate', 'thought', 'town', 'firs...",four candidate thought town first election year,0.8399,...,0.541379,0.125,0.53226,0.46774,0.56107,0.43893,0.462323,0.537677,0.457755,0.542245
1,54,First Minister John Swinney has said The SNP w...,SNP to include social tariff on energy and bro...,7,1,"['first', 'minister', 'john', 'swinney', 'said...",first minister john swinney said snp include s...,"['snp', 'include', 'social', 'tariff', 'energy...",snp include social tariff energy broadband gen...,0.9761,...,0.546628,0.157433,0.52903,0.47097,0.660866,0.339134,0.466663,0.533337,0.454556,0.545444
2,164,Reform UK insists its plans are “not just anot...,"Do Reform UK’s election claims on tax, immigra...",7,8,"['reform', 'insists', 'plan', 'another', 'part...",reform insists plan another party manifesto ex...,"['reform', 'uk', 'election', 'claim', 'tax', '...",reform uk election claim tax immigration envir...,0.9909,...,0.523378,0.0,0.408862,0.591138,0.652729,0.347271,0.461096,0.538904,0.453776,0.546224
3,390,"“If you want politics as pantomime,” Sir Keir ...",Who is Jovan Owusu-Nepaul? Labour’s general el...,7,9,"['want', 'politics', 'pantomime', 'sir', 'keir...",want politics pantomime sir keir starmer said ...,"['jovan', 'owusunepaul', 'labour', 'general', ...",jovan owusunepaul labour general election cand...,0.9982,...,0.607156,0.025,0.443863,0.556137,0.671745,0.328255,0.466407,0.533593,0.459533,0.540467
4,242,Shadow health secretary Wes Streeting has urge...,Streeting warns against complacency and giving...,6,4,"['shadow', 'health', 'secretary', 'wes', 'stre...",shadow health secretary wes streeting urged vo...,"['streeting', 'warns', 'complacency', 'giving'...",streeting warns complacency giving match back ...,0.9887,...,0.588329,0.125,0.494362,0.505638,0.560559,0.439441,0.464949,0.535051,0.455772,0.544228


In [52]:
article_table_s.columns

Index(['Article_id', 'Article', 'Title', 'Published_date_id', 'Source_id',
       'article_tokens_lem', 'article_tokens_lem_text', 'title_tokens_lem',
       'title_tokens_lem_text', 'article_sentiment_vader',
       'title_sentiment_vader', 'article_sentiment_textblob',
       'title_sentiment_textblob', 'article_sentiment_hybrid',
       'title_sentiment_hybrid', 'article_sentiment_bert_-',
       'article_sentiment_bert_+', 'title_sentiment_bert_-',
       'title_sentiment_bert_+', 'article_sentiment_roberta_-',
       'article_sentiment_roberta_+', 'title_sentiment_roberta_-',
       'title_sentiment_roberta_+'],
      dtype='object')

### Sentiment Labelling Process

In [53]:
article_table_s['article_sentiment_bert_ro_+_avg'] = ((article_table_s['article_sentiment_bert_+'] + article_table_s['article_sentiment_roberta_+']) / 2)
article_table_s['article_sentiment_bert_ro_-_avg'] = ((article_table_s['article_sentiment_bert_-'] + article_table_s['article_sentiment_roberta_-']) / 2)
article_table_s['title_sentiment_bert_ro_+_avg'] = ((article_table_s['title_sentiment_bert_+'] + article_table_s['title_sentiment_roberta_+']) / 2)
article_table_s['title_sentiment_bert_ro_-_avg'] = ((article_table_s['title_sentiment_bert_-'] + article_table_s['title_sentiment_roberta_-']) / 2)

In [65]:
article_table_s['article_bert_diff'] = abs(article_table_s['article_sentiment_bert_-'] - article_table_s['article_sentiment_bert_+'])
article_table_s['title_bert_diff'] = abs(article_table_s['title_sentiment_bert_-'] - article_table_s['title_sentiment_bert_+'])

article_table_s['article_roberta_diff'] = abs(article_table_s['article_sentiment_roberta_-'] - article_table_s['article_sentiment_roberta_+'])
article_table_s['title_roberta_diff'] = abs(article_table_s['title_sentiment_roberta_-'] - article_table_s['title_sentiment_roberta_+'])

article_table_s['article_avg_diff'] = abs(article_table_s['article_sentiment_bert_ro_-_avg'] - article_table_s['article_sentiment_bert_ro_+_avg'])
article_table_s['title_avg_diff'] = abs(article_table_s['title_sentiment_bert_ro_-_avg'] - article_table_s['title_sentiment_bert_ro_+_avg'])


# Define a function to label based on BERT sentiment differences
def label_bert_a(row):
    if row['article_bert_diff'] <= 0.1:
        return 'Neutral'
    elif row['article_sentiment_bert_+'] > row['article_sentiment_bert_-']:
        return 'Positive'
    elif row['article_sentiment_bert_-'] > row['article_sentiment_bert_+']:
        return 'Negative'
    return 'None'  # Default return to ensure the function always returns a value

# Define a function to label based on ROBERTA sentiment differences
def label_roberta_a(row):
    if row['article_roberta_diff'] <= 0.07:
        return 'Neutral'
    elif row['article_sentiment_roberta_+'] > row['article_sentiment_roberta_-']:
        return 'Positive'
    elif row['article_sentiment_roberta_-'] > row['article_sentiment_roberta_+']:
        return 'Negative'
    return 'None'  # Default return to ensure the function always returns a value

# Define a function to label based on ROBERTA sentiment differences
def label_avg_a(row):
    if row['article_avg_diff'] <= 0.1:
        return 'Neutral'
    elif row['article_sentiment_bert_ro_+_avg'] > row['article_sentiment_bert_ro_-_avg']:
        return 'Positive'
    elif row['article_sentiment_bert_ro_-_avg'] > row['article_sentiment_bert_ro_+_avg']:
        return 'Negative'
    return 'None'  # Default return to ensure the function always returns a value


# Apply the function to each row to create the new column 'Article_bert_label'
article_table_s['Article_bert_label'] = article_table_s.apply(label_bert_a, axis=1)
article_table_s['Article_roberta_label'] = article_table_s.apply(label_roberta_a, axis=1)
article_table_s['Article_avg_label'] = article_table_s.apply(label_avg_a, axis=1)

print(article_table_s['Article_bert_label'].value_counts())
print(article_table_s['Article_roberta_label'].value_counts())
print(article_table_s['Article_avg_label'].value_counts())

Article_bert_label
Neutral     237
Positive    144
Negative     25
Name: count, dtype: int64
Article_roberta_label
Positive    303
Neutral     103
Name: count, dtype: int64
Article_avg_label
Neutral     313
Positive     93
Name: count, dtype: int64


In [66]:
# Define a function to label based on BERT sentiment differences
def label_bert_t(row):
    if row['title_bert_diff'] <= 0.1:
        return 'Neutral'
    elif row['title_sentiment_bert_+'] > row['title_sentiment_bert_-']:
        return 'Positive'
    elif row['title_sentiment_bert_-'] > row['title_sentiment_bert_+']:
        return 'Negative'
    return 'None'  # Default return to ensure the function always returns a value

# Define a function to label based on ROBERTA sentiment differences
def label_roberta_t(row):
    if row['title_roberta_diff'] <= 0.07:
        return 'Neutral'
    elif row['title_sentiment_roberta_+'] > row['title_sentiment_roberta_-']:
        return 'Positive'
    elif row['title_sentiment_roberta_-'] > row['title_sentiment_roberta_+']:
        return 'Negative'
    return 'None'  # Default return to ensure the function always returns a value

# Define a function to label based on ROBERTA sentiment differences
def label_avg_t(row):
    if row['title_avg_diff'] <= 0.1:
        return 'Neutral'
    elif row['title_sentiment_bert_ro_+_avg'] > row['title_sentiment_bert_ro_-_avg']:
        return 'Positive'
    elif row['title_sentiment_bert_ro_-_avg'] > row['title_sentiment_bert_ro_+_avg']:
        return 'Negative'
    return 'None'  # Default return to ensure the function always returns a value


# Apply the function to each row to create the new column 'title_bert_label'
article_table_s['title_bert_label'] = article_table_s.apply(label_bert_t, axis=1)
article_table_s['title_roberta_label'] = article_table_s.apply(label_roberta_t, axis=1)
article_table_s['title_avg_label'] = article_table_s.apply(label_avg_t, axis=1)

print(article_table_s['title_bert_label'].value_counts())
print(article_table_s['title_roberta_label'].value_counts())
print(article_table_s['title_avg_label'].value_counts())

title_bert_label
Negative    295
Neutral     111
Name: count, dtype: int64
title_roberta_label
Positive    406
Name: count, dtype: int64
title_avg_label
Neutral     338
Negative     68
Name: count, dtype: int64


In [67]:
article_table_s.iloc[:5 , 27:]

Unnamed: 0,article_bert_diff,title_bert_diff,article_roberta_diff,title_roberta_diff,article_avg_diff,title_avg_diff,Article_bert_label,Article_roberta_label,Article_avg_label,title_bert_label,title_roberta_label,title_avg_label
0,0.064519,0.122139,0.075354,0.084489,0.005417,0.018825,Neutral,Positive,Neutral,Negative,Positive,Neutral
1,0.058059,0.321732,0.066674,0.090888,0.004307,0.115422,Neutral,Neutral,Neutral,Negative,Positive,Negative
2,0.182276,0.305457,0.077809,0.092448,0.130042,0.106505,Positive,Positive,Positive,Negative,Positive,Negative
3,0.112274,0.34349,0.067185,0.080933,0.08973,0.131278,Positive,Neutral,Neutral,Negative,Positive,Negative
4,0.011276,0.121118,0.070102,0.088456,0.040689,0.016331,Neutral,Positive,Neutral,Negative,Positive,Neutral


### Quality / Performance checks

In [68]:
import random

random_num_1 = random.randint(1, 299)
random_num_2 = random.randint(1, 299)

article_test = article_table_s[['Article', 'Article_avg_label', 'Article_bert_label', 'Article_roberta_label']]

if random_num_1 != random_num_2:
  article_test = article_test.iloc[random_num_1 : random_num_2, :].copy()

article_test.head()


Unnamed: 0,Article,Article_avg_label,Article_bert_label,Article_roberta_label
27,Keir Starmer has committed to a judge-led inqu...,Neutral,Negative,Positive
28,Rishi Sunak attends the plenary session of the...,Neutral,Neutral,Neutral
29,Sir Keir Starmer has refused to rule out chang...,Neutral,Positive,Positive
30,Britain will head to the polls on July 4 after...,Neutral,Neutral,Neutral
31,Boris Johnson is being drafted in by the Torie...,Positive,Positive,Positive


In [69]:
article_table_s['article_tokens_lem'][random_num_1]

"['keir', 'starmer', 'committed', 'judgeled', 'inquiry', 'nottingham', 'attack', 'labour', 'win', 'election', 'saying', 'many', 'example', 'victim', 'family', 'member', 'let', 'downbarnaby', 'webber', 'grace', 'omalleykumar', 'ian', 'coates', 'stabbed', 'death', 'last', 'year', 'valdo', 'calocane', 'sentenced', 'hospital', 'order', 'pleading', 'guilty', 'manslaughter', 'ground', 'diminished', 'responsibility', 'due', 'paranoid', 'schizophreniabarnabys', 'mother', 'emma', 'webber', 'directly', 'challenged', 'starmer', 'listener', 'phonein', 'lbcshe', 'asked', 'get', 'downing', 'street', 'thought', 'following', 'call', 'public', 'inquiry', 'possibly', 'jury', 'judgeled', 'inquestand', 'also', 'thought', 'please', 'urgent', 'need', 'reform', 'homicide', 'law', 'also', 'victim', 'support', 'country', 'bitter', 'experience', 'confirm', 'woeful', 'inadequatethe', 'labour', 'leader', 'replied', 'family', 'horrific', 'experience', 'committed', 'holding', 'judgeled', 'inquiry', 'labour', 'winsh

In [70]:
article_table_s.columns

Index(['Article_id', 'Article', 'Title', 'Published_date_id', 'Source_id',
       'article_tokens_lem', 'article_tokens_lem_text', 'title_tokens_lem',
       'title_tokens_lem_text', 'article_sentiment_vader',
       'title_sentiment_vader', 'article_sentiment_textblob',
       'title_sentiment_textblob', 'article_sentiment_hybrid',
       'title_sentiment_hybrid', 'article_sentiment_bert_-',
       'article_sentiment_bert_+', 'title_sentiment_bert_-',
       'title_sentiment_bert_+', 'article_sentiment_roberta_-',
       'article_sentiment_roberta_+', 'title_sentiment_roberta_-',
       'title_sentiment_roberta_+', 'article_sentiment_bert_ro_+_avg',
       'article_sentiment_bert_ro_-_avg', 'title_sentiment_bert_ro_+_avg',
       'title_sentiment_bert_ro_-_avg', 'article_bert_diff', 'title_bert_diff',
       'article_roberta_diff', 'title_roberta_diff', 'article_avg_diff',
       'title_avg_diff', 'Article_bert_label', 'Article_roberta_label',
       'Article_avg_label', 'title_bert_l

### Saving Final Article Sentiment Table

In [71]:
article_table_s = article_table_s.drop(['article_sentiment_vader',
       'title_sentiment_vader', 'article_sentiment_textblob',
       'title_sentiment_textblob', 'article_sentiment_hybrid',
       'title_sentiment_hybrid', 'article_sentiment_bert_ro_+_avg',
       'article_sentiment_bert_ro_-_avg', 'title_sentiment_bert_ro_+_avg',
       'title_sentiment_bert_ro_-_avg', 'article_avg_diff',
       'title_avg_diff',
       'Article_avg_label',
       'title_avg_label'
        ], axis = 1)

In [73]:
article_table_s.to_csv('article_table_s_ff', index=False)