<a href="https://colab.research.google.com/github/Confirmation-Bias-Analyser/Confirmation-Bias-Model/blob/main/Lexicon_based_Approaches_and_Text_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install sentence_transformers
!pip install vaderSentiment

# Import essential libraries

In [2]:
import pandas as pd
from google.colab import files, drive
# Mount Google Drive at /content/drive so files under it can be read by path.
drive.mount('/content/drive')

import re
import os
import csv
import time
import numpy as np
from sklearn.cluster import KMeans

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Shared VADER analyser instance; passed to getSentimentalResults() in later cells.
sid_obj = SentimentIntensityAnalyzer()

from sentence_transformers import SentenceTransformer, util
# Shared sentence-embedding model ('all-MiniLM-L6-v2'); passed to getClusters() in later cells.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

from textblob import TextBlob

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Declare relevant functions
Helper functions that strip punctuation marks and other unwanted characters from the comments, run lexicon-based sentiment analysis with TextBlob and VADER, and then cluster the text.

In [3]:
def getSentimentalResults(vaderObject, sentence):
  """Score a single sentence with both TextBlob and VADER.

  Args:
    vaderObject: object exposing ``polarity_scores(text)`` that returns a
      dict containing a ``'compound'`` key (the notebook passes a
      ``SentimentIntensityAnalyzer``).
    sentence: the text to score.

  Returns:
    A 4-tuple ``(polarity, subjectivity, vader_scores, compound)`` where the
    first two values come from TextBlob, ``vader_scores`` is the VADER dict
    with its ``'compound'`` entry removed, and ``compound`` is that removed
    value.
  """
  blob = TextBlob(sentence)
  vader_scores = vaderObject.polarity_scores(sentence)

  # Split the compound score out of the dict; the remaining entries are
  # returned as they are.
  compound = vader_scores.pop('compound')

  sentiment = blob.sentiment
  return (sentiment.polarity, sentiment.subjectivity, vader_scores, compound)

def getClusters(allSentences, embedder, num_clusters = 2, random_state = None):
  """Cluster sentences with K-Means over their sentence embeddings.

  Args:
    allSentences: sequence of sentences to cluster.
    embedder: object exposing ``encode(sentences)`` that returns one
      embedding per sentence (the notebook passes a ``SentenceTransformer``).
    num_clusters: number of K-Means clusters to fit.
    random_state: optional seed forwarded to ``KMeans``. The default
      ``None`` keeps the previous unseeded behaviour; pass an integer to get
      reproducible cluster ids across runs.

  Returns:
    The fitted model's ``labels_``: one cluster id per input sentence, in
    input order.
  """
  corpus_embeddings = embedder.encode(allSentences)

  # Perform k-means clustering on the embeddings.
  clustering_model = KMeans(n_clusters=num_clusters, random_state=random_state)
  clustering_model.fit(corpus_embeddings)

  # The earlier version also grouped sentences per cluster by indexing the
  # notebook-level ``pred_sentences`` instead of ``allSentences``. That
  # grouping was never returned and failed for any other input, so it was
  # removed.
  return clustering_model.labels_

def cleanComments(comments_array):
  """Strip punctuation, mentions, URLs and other noise from raw comments.

  For string inputs the output is identical to the previous implementation:
  newlines, commas and apostrophes become spaces; periods and backslashes are
  deleted; the HTML entity ``&gt;`` is deleted; finally every ``@mention``,
  every URL-like ``scheme://...`` token and every remaining character that is
  not an ASCII letter, digit, space or tab is replaced by a single space.

  Args:
    comments_array: iterable of raw comments (e.g. a pandas Series).
      Missing values (``None`` / float NaN) are cleaned to ``''`` so the
      result stays aligned with the input; other non-string values are
      converted with ``str()``.

  Returns:
    A list of cleaned strings, one per input item, in input order.
  """
  # Single-character substitutions done in one pass. These are independent of
  # each other, so the result matches the former chain of ``str.replace``
  # calls.
  char_map = str.maketrans({
    '\n': ' ',   # newline -> space
    ',': ' ',
    "'": ' ',
    '.': None,   # periods are dropped
    '\\': None,  # backslashes are dropped
  })
  # NOTE(review): the former ``replace("'s", '')`` step ran after apostrophes
  # had already been turned into spaces, so it never matched. It is omitted
  # here to keep the output unchanged ("it's" still becomes "it s").

  # Raw string avoids the invalid-escape warnings of the old literal; the
  # regex itself is unchanged. Alternatives: @mentions, any character outside
  # [0-9A-Za-z space tab], and URL-like ``scheme://...`` tokens.
  noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

  sentences = []
  for comment in comments_array:
    if not isinstance(comment, str):
      # Empty CSV cells arrive as None/NaN; keep one output row per input row.
      is_missing = comment is None or (isinstance(comment, float) and comment != comment)
      comment = '' if is_missing else str(comment)

    sequence = comment.translate(char_map)
    sequence = sequence.replace('&gt;', '')  # HTML-escaped '>' (quote marker)
    sequence = noise_pattern.sub(' ', sequence)
    sentences.append(sequence)

  return sentences

# Experimenting with the function on a sample string

In [4]:
# Quick sanity check of the scorer on one hand-written sentence.
string = "Give us believable reasons why we need mayors And what's with that ridiculous high salaries for mp"
getSentimentalResults(sid_obj, string)

(0.10888888888888888, 0.68, {'neg': 0.135, 'neu': 0.865, 'pos': 0.0}, -0.3612)

# Running the functions against a sample dataset of posts and comments on Reddit

In [5]:
# Project folder on the mounted Google Drive (includes the trailing slash).
saved_path = '/content/drive/MyDrive/Final Year Project/Key Notebooks/Confirmation Bias Analyser/'
test_df = pd.read_csv(f'{saved_path}reddit_data.csv')

# Cleaned text of every comment, in row order.
pred_sentences = cleanComments(test_df['comment'])

In [6]:
# Peek at the first five raw comments.
test_df['comment'].head(n=5)

0    All I know is - anyone trying to pull a fast o...
1    Why does she think it is OK to lie about such ...
2    Here’s some perspective: \n\n1) The public nat...
3    Well said. It’s only a problem if your party i...
4    &gt;Transparency and finding out the truth is ...
Name: comment, dtype: object

## View some of the cleaned comments before sentiment analysis

In [7]:
# Number of cleaned comments, followed by the first five for a visual spot check.
print(len(pred_sentences))
pred_sentences[0:5]

181


['All I know is   anyone trying to pull a fast one and lie in parliament from now on will think trice   and that s a good thing for all of us',
 'Why does she think it is OK to lie about such things  Sense of impunity ',
 'Here s some perspective    1  The public nature and the fact that parliamentarians have to be the one doing this of course puts up lots of visible air time but that does not imply other priorities are forsaken   just like in the private sector there are always people doing the real work while leaders are there for direction and final decisions  2  Isn t there merit in investigating the circumstances surrounding the lies given that if indeed others were complicit in it that it would be a breach of parliamentary privilege    3  If you would like them to stop  regardless of your political views  would you be okay if those complicit are let off ',
 'Well said It s only a problem if your party is being investigated If it was a PAP MP committing a lie in Parliament I m sur

## Visualise results in dataframe

In [8]:
# One list per output column; each gets one entry per cleaned comment.
textblob_polarity = []
textblob_subjectivity = []
vader_results = []
vaderCompoundScores = []

for sentence in pred_sentences:
  polarity, subjectivity, vader_scores, compound = getSentimentalResults(sid_obj, sentence)
  textblob_polarity.append(polarity)
  textblob_subjectivity.append(subjectivity)
  vader_results.append(vader_scores)
  vaderCompoundScores.append(compound)

In [9]:
# Attach the per-comment sentiment scores collected above as new columns.
sentiment_columns = {
  'textblob_polarity': textblob_polarity,
  'textblob_subjectivity': textblob_subjectivity,
  'vader_sentiment': vader_results,
  'vader_compound_score': vaderCompoundScores,
}
for column_name, column_values in sentiment_columns.items():
  test_df[column_name] = column_values

# One K-Means cluster id per cleaned comment.
test_df['topic_cluster'] = getClusters(pred_sentences, embedder)

# Save the enriched frame without the index, then preview it.
test_df.to_csv('sentiment_result.csv', index=False)
test_df.head()

Unnamed: 0,user_name,id,timestamp,reply_to,comment,url,link_title,textblob_polarity,textblob_subjectivity,vader_sentiment,vader_compound_score,topic_cluster
0,MapleViolet,hpr2kav,2021-12-24 08:55:24,rmqevj,All I know is - anyone trying to pull a fast o...,,,0.45,0.6,"{'neg': 0.0, 'neu': 0.912, 'pos': 0.088}",0.4404,0
1,HaddockFillet,hra9zzo,2022-01-05 08:14:35,hpr2kav,Why does she think it is OK to lie about such ...,,,0.25,0.5,"{'neg': 0.0, 'neu': 0.827, 'pos': 0.173}",0.4466,1
2,applescript16,hpntm2t,2021-12-23 16:24:16,rmqevj,Here’s some perspective: \n\n1) The public nat...,,,0.08214286,0.388095,"{'neg': 0.05, 'neu': 0.828, 'pos': 0.122}",0.8519,1
3,iluj13,hpnwekg,2021-12-23 17:01:54,hpntm2t,Well said. It’s only a problem if your party i...,,,0.2142857,0.736508,"{'neg': 0.136, 'neu': 0.673, 'pos': 0.191}",0.2263,1
4,forzenrose,hpnzb4r,2021-12-23 17:41:51,hpnwekg,&gt;Transparency and finding out the truth is ...,,,3.700743e-17,0.755556,"{'neg': 0.192, 'neu': 0.603, 'pos': 0.205}",-0.128,1
