# Generating Dataset For the Sentimental Analysis of Tweets

In [1]:
!pip install snscrape

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting snscrape
  Downloading snscrape-0.5.0.20230113-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.2/69.2 KB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: snscrape
Successfully installed snscrape-0.5.0.20230113


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Scraping the data from Twitter using `snscrapr`

In [3]:
import snscrape.modules.twitter as sntwitter

query = "(from:ANI) until:2023-01-12 since:2013-01-08"
tweets = []
limit = 50

for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    if len(tweets) == limit:
        break
    else:
        tweets.append([tweet.lang, tweet.content])
        
df = pd.DataFrame(tweets, columns=['Lang', 'Tweet'])
df.head()

  tweets.append([tweet.lang, tweet.content])


Unnamed: 0,Lang,Tweet
0,en,Tripura Chief Minister Manik Saha inspected th...
1,en,I'd like to appreciate Ambassador Katherine Ta...
2,en,The ground rules of IPEF were laid out very we...
3,en,We also discussed progress of Indo-Pacific Eco...
4,en,I have had a chance to spend time with US Comm...


In [4]:
df.shape

(50, 2)

## Filtering english Tweets from the Dataset

In [5]:
# To See What are the Varities of Language in the Dataset
df['Lang'].unique()

array(['en'], dtype=object)

In [6]:
df = df[(df.Lang=='en')]
df.head()

Unnamed: 0,Lang,Tweet
0,en,Tripura Chief Minister Manik Saha inspected th...
1,en,I'd like to appreciate Ambassador Katherine Ta...
2,en,The ground rules of IPEF were laid out very we...
3,en,We also discussed progress of Indo-Pacific Eco...
4,en,I have had a chance to spend time with US Comm...


In [7]:
df['Lang'].unique()

array(['en'], dtype=object)

In [8]:
df.shape

(50, 2)

## Removing the links and the username from the tweets

In [9]:
# Defining a function to change the inks and the mentiones user in the tweets
def removeLinkAndUser(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [10]:
# Getting a random tweet from the dataset
df.reset_index(drop=True, inplace=True)
sampleTweet = df['Tweet'][12]
sampleTweet

"Two cough syrups made by India's Marion Biotech should not be used for children, after the products were linked to 19 deaths in Uzbekistan: World Health Organization (WHO) https://t.co/RfxAs1Usr1"

In [11]:
# After removing username and links from the tweet
removeLinkAndUser(sampleTweet)

"Two cough syrups made by India's Marion Biotech should not be used for children, after the products were linked to 19 deaths in Uzbekistan: World Health Organization (WHO) http"

### Appling this to all the tweets in the dataframe

In [12]:
df["Tweet"]=df["Tweet"].apply(removeLinkAndUser)
df.head()

Unnamed: 0,Lang,Tweet
0,en,Tripura Chief Minister Manik Saha inspected th...
1,en,I'd like to appreciate Ambassador Katherine Ta...
2,en,The ground rules of IPEF were laid out very we...
3,en,We also discussed progress of Indo-Pacific Eco...
4,en,I have had a chance to spend time with US Comm...


# VADAR Sentiment Scoring
* VADER (Valence Aware Dictionary and sEntiment Reasoner) is a rule-based sentiment analysis tool designed to extract polarity (positive, negative, or neutral) from text data. 
* It uses a lexicon of sentiment-related words and phrases along with a set of rules to determine the sentiment of a given text.

## Limitation of VADAR
* Limited Domain-Specific Knowledge
* Doesnot Account the relationship between the words
* Over-reliance on Lexicon (The lexicon consists of a list of words and phrases that are labeled with their polarity (positive, negative, or neutral) based on their commonly perceived sentiment.)
* Inability to Capture Complex Emotions
* Lack of Understanding of Sarcasm and Irony
* Difficulty with Non-Text Data

In [13]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [14]:
# Invoking a Instance of SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [15]:
sia.polarity_scores("I am very happy")

{'neg': 0.0, 'neu': 0.334, 'pos': 0.666, 'compound': 0.6115}

In [16]:
sia.polarity_scores("I am very sad")

{'neg': 0.629, 'neu': 0.371, 'pos': 0.0, 'compound': -0.5256}

In [17]:
# Generating Polarity Score on All Tweets and store then in the res
j=1;
res = {}
for i,row in tqdm(df.iterrows(),total = len(df)):
  mytweet = row['Tweet']
  myid = j
  j = j + 1
  vader_result = sia.polarity_scores(mytweet) 
  vader_result_rename = {}
  for key, values in vader_result.items():
      vader_result_rename[f"vader_{key}"] = values
  res[myid] = vader_result_rename

  0%|          | 0/50 [00:00<?, ?it/s]

In [18]:
# Crating a dataset for the sentiment scores using the VADAR Method
sentimentScoresVader = pd.DataFrame(res).T
sentimentScoresVader.head()

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_compound
1,0.0,1.0,0.0,0.0
2,0.0,0.846,0.154,0.7906
3,0.0,0.947,0.053,0.3384
4,0.0,0.94,0.06,0.4215
5,0.0,0.955,0.045,0.25


# Hugging Face *Method* (Roberta Pretrained Model)

In [19]:
!pip install transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [20]:
# This Hugging Face Transformers library to load a pre-trained sentiment analysis model for Twitter data.
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL) # Convert the raw text into a format so that the model can process
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/499M [00:00<?, ?B/s]

### Now Lets take a example to get the sentiment score using the Roberta Pretrained Model

In [21]:
# Checking the sentiment score of the single tweet
def polarity_score(tweet):
  encoded_text = tokenizer(tweet,return_tensors='pt')
  output = model(**encoded_text)
  scores = output[0][0].detach().numpy()
  scores = softmax(scores)
  scores_dict = {
      "roberta_neg" : scores[0],
      "roberta_neu" : scores[1],
      "roberta_pos" : scores[2]
  }
  return scores_dict

In [22]:
polarity_score(df['Tweet'][12])

{'roberta_neg': 0.91676724,
 'roberta_neu': 0.08059349,
 'roberta_pos': 0.002639334}

### Now getting it for the rest of the tweets

In [23]:
j = 1;
res = {}
for i,row in tqdm(df.iterrows(),total = len(df)):
  try:
    mytweet = row['Tweet']
    myid = j
    j=j+1
    roborto_result = polarity_score(mytweet)
    res[myid] = roborto_result
  except RuntimeError:
    print (f"Broke at id {myid}")

  0%|          | 0/50 [00:00<?, ?it/s]

In [24]:
# Crating a dataset for the sentiment scores using the Roberta Method
sentimentScoresRoberta = pd.DataFrame(res).T
sentimentScoresRoberta.head()

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos
1,0.014746,0.928273,0.05698
2,0.003203,0.12537,0.871427
3,0.001249,0.041221,0.95753
4,0.005952,0.873916,0.120132
5,0.00254,0.78626,0.2112


## So now we have sentitemt scores from 2 types of method so let's merge all of them
* VADAR Methord
* Roberta Methord

In [25]:
df.head()
df.reset_index(drop=True, inplace=True)
df.shape

(50, 2)

In [26]:
sentimentScoresVader.head()
sentimentScoresVader.reset_index(drop=True, inplace=True)
sentimentScoresVader.shape

(50, 4)

In [27]:
sentimentScoresRoberta.head()
sentimentScoresRoberta.reset_index(drop=True, inplace=True)
sentimentScoresRoberta.shape

(50, 3)

In [28]:
df_intermediate = pd.concat([sentimentScoresVader, sentimentScoresRoberta, df], axis=1, join='inner')
df_intermediate.head()
df_intermediate.shape

(50, 9)

## Since here we are following Supervised Learning we need to Lable the tweets 
Now we need to lable the tweets from a pretrained model on sentimental analysis from hugging face 

In [29]:
df_intermediate.columns

Index(['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound', 'roberta_neg',
       'roberta_neu', 'roberta_pos', 'Lang', 'Tweet'],
      dtype='object')

In [30]:
df_intermediate.head()

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,Lang,Tweet
0,0.0,1.0,0.0,0.0,0.014746,0.928273,0.05698,en,Tripura Chief Minister Manik Saha inspected th...
1,0.0,0.846,0.154,0.7906,0.003203,0.12537,0.871427,en,I'd like to appreciate Ambassador Katherine Ta...
2,0.0,0.947,0.053,0.3384,0.001249,0.041221,0.95753,en,The ground rules of IPEF were laid out very we...
3,0.0,0.94,0.06,0.4215,0.005952,0.873916,0.120132,en,We also discussed progress of Indo-Pacific Eco...
4,0.0,0.955,0.045,0.25,0.00254,0.78626,0.2112,en,I have had a chance to spend time with US Comm...


In [31]:
from transformers import pipeline
sent_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [32]:
j = 1;
res = {}
for i,row in tqdm(df.iterrows(),total = len(df)):
  try:
    mytweet = row['Tweet']
    myid = j
    j=j+1
    roborto_result = sent_pipeline(mytweet)
    res[myid] = { 'Tag' : roborto_result[0]['label'], 'Score' : roborto_result[0]['score']}
  except RuntimeError:
    print (f"Broke at id {myid}")


  0%|          | 0/50 [00:00<?, ?it/s]

In [33]:
# Crating a dataset for the sentiment scores using the Roberta Method
LabledData = pd.DataFrame(res).T
LabledData["Score"] = LabledData["Score"].astype(float)
LabledData.head()

Unnamed: 0,Tag,Score
1,POSITIVE,0.564948
2,POSITIVE,0.999794
3,POSITIVE,0.999485
4,POSITIVE,0.956361
5,POSITIVE,0.984511


In [34]:
LabledData.head()
LabledData.reset_index(drop=True, inplace=True)
LabledData.shape

(50, 2)

In [35]:
df_final = pd.concat([df_intermediate, LabledData], axis=1, join='inner')
df_final.shape

(50, 11)

In [36]:
df_final.head()

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,Lang,Tweet,Tag,Score
0,0.0,1.0,0.0,0.0,0.014746,0.928273,0.05698,en,Tripura Chief Minister Manik Saha inspected th...,POSITIVE,0.564948
1,0.0,0.846,0.154,0.7906,0.003203,0.12537,0.871427,en,I'd like to appreciate Ambassador Katherine Ta...,POSITIVE,0.999794
2,0.0,0.947,0.053,0.3384,0.001249,0.041221,0.95753,en,The ground rules of IPEF were laid out very we...,POSITIVE,0.999485
3,0.0,0.94,0.06,0.4215,0.005952,0.873916,0.120132,en,We also discussed progress of Indo-Pacific Eco...,POSITIVE,0.956361
4,0.0,0.955,0.045,0.25,0.00254,0.78626,0.2112,en,I have had a chance to spend time with US Comm...,POSITIVE,0.984511


In [37]:
df_final['Lang'].unique()

array(['en'], dtype=object)

In [38]:
df_final.describe()

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,Score
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.04918,0.9036,0.04724,-0.011372,0.242207,0.517981,0.239812,0.942371
std,0.069444,0.081913,0.062338,0.49425,0.329757,0.349918,0.351672,0.107488
min,0.0,0.648,0.0,-0.875,0.00081,0.019155,0.002639,0.564948
25%,0.0,0.85125,0.0,-0.39495,0.008339,0.130953,0.016624,0.958683
50%,0.0,0.9275,0.0,0.0,0.05107,0.577634,0.055107,0.985361
75%,0.07825,0.953,0.073,0.3396,0.42406,0.88771,0.25882,0.998912
max,0.282,1.0,0.255,0.9001,0.954882,0.940829,0.980034,0.999874


In [39]:
df_final.dtypes

vader_neg         float64
vader_neu         float64
vader_pos         float64
vader_compound    float64
roberta_neg       float32
roberta_neu       float32
roberta_pos       float32
Lang               object
Tweet              object
Tag                object
Score             float64
dtype: object

In [40]:
df_final.to_csv(f"{query}.csv",index=False)

In [41]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
