In [6]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
import snscrape.modules.twitter as scrape_tweets
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('vader_lexicon')

plt.style.use('ggplot')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\tziam\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Collection of Data

In [5]:
def collect_data():
    location = '4.4149, -3.0424, 680km'
    query = '("elevy" OR "e-levy" OR "increase OR VAT" OR "Financial OR sector OR levy" OR "income OR tax OR bill" OR "covid OR levy" OR "sustainability OR levy" OR "addo OR levy" OR "covid OR tax" OR "betting OR tax" OR "exercise OR duty OR bill" OR "electronic OR levy") until:2023-06-20 since:2020-03-01 geocode:"{}"'.format(location)

    tweets = []


    for tweet in scrape_tweets.TwitterSearchScraper(query).get_items():
        tweets.append([tweet.date, tweet.username, tweet.content])



    df = pd.DataFrame(tweets, columns=['Date', 'User', 'Tweet'])
    df.head()

## Preprocessing of Tweet

In [7]:
def clean_text(text):
    # Remove mentions
    menttion_pattern = re.compile(r'@\w+')
    text = re.sub(menttion_pattern, '', text)

    # Remove Hashtags
    text = re.sub(r'#', '', text)

    # Remove retweets
    text = re.sub(r'RT[\s]+', '', text)

    # Remove urls
    text = re.sub(r'https?:\/\/\S+', '', text)

    # Remove emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002600-\U000027BF"  # miscellaneous symbols
                               u"\U0001F910-\U0001F9FF"  # faces with accessories
                               u"\u200d"  # zero-width joiner
                               u"\u2600-\u26FF\u2700-\u27BF"  # additional symbols
                               u"\u3000-\u303F"  # punctuation symbols
                               u"\uFE0F"  # emoji variation selector
                               "]+", flags=re.UNICODE)
    text = re.sub(emoji_pattern, '', text)

    # Remove newlines
    newline_pattern = re.compile(r'\n')
    text = re.sub(newline_pattern, '', text)

    return text

df['Tweet'] = df['Tweet'].apply(clean_text)


NameError: name 'df' is not defined

## Sentiment Analysis using Textblob

In [None]:
# Read Data
df = pd.read_csv('cleaned_tweet_2.csv')

tweets = df['Tweet']

In [8]:
sentiments = []
for tweet in tweets:
    blob = TextBlob(str(tweet))  # Convert tweet to string if it's not already
    polarity = blob.sentiment.polarity
    if polarity > 0:
        sentiment = 'Positive'
    elif polarity < 0:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'
    sentiments.append(sentiment)

# Add the sentiments to the DataFrame
data['Sentiment'] = sentiments

NameError: name 'tweets' is not defined

In [None]:
# Plot the sentiment distribution
sentiment_counts = data['Sentiment'].value_counts()
plt.bar(sentiment_counts.index, sentiment_counts.values)
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.title('Sentiment Distribution')
plt.show()

## Sentiment Analysis using Vader(NLTK)

In [9]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm


sia = SentimentIntensityAnalyzer()

In [10]:
# Run polarity score on entire dataframe
res = {}
for i, row  in tqdm(df.iterrows(), total=len(df)):
    tweet = row['Tweet']
    myid = row['User']
    res[myid] = sia.polarity_scores(tweet)

NameError: name 'df' is not defined

In [11]:
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index': 'User'})
vaders = vaders.merge(df, how='left')

NameError: name 'df' is not defined

In [None]:
# Sentiment score and metadata
vaders.head()

In [None]:
# Read Data
df = pd.read_csv('Dataset\\cleaned_tweet_2.csv')

tweets = df['Tweet']

## Sentiment Analysis using Roberta

In [12]:
# Read Data
df = pd.read_csv('Dataset\\cleaned_tweet_2.csv')

tweets = df['Tweet']

In [13]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [14]:
MODEL = f'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [15]:
# Run on Roberta
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
}

NameError: name 'example' is not defined

In [16]:
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    }
    return scores_dict

In [None]:
res = {}
try:
    for i, row  in tqdm(df.iterrows(), total=len(df)):
        tweet = row['Tweet']
        myid = row['User']
        vader_result = res[myid] = sia.polarity_scores(tweet)
        vader_result_rename = {}
        for key, value in vader_result.items():
            vader_result_rename[f"vader_{key}"] = value

        roberta_result = polarity_scores_roberta(tweet)
        both = {**vader_result, **roberta_result}
        res[myid] = both
except RuntimeError:
    printf(f'Broke for id {myid}')