In [None]:
# Install required packages
# NOTE(review): only gspread and pandas are installed here; the remaining imports used by this
# notebook (torch, transformers, nltk, scipy, scikit-learn, tqdm) are presumably preinstalled
# in the runtime — confirm.
!pip install --quiet gspread pandas

In [None]:
# Mount Google Drive
# NOTE(review): the cells visible in this notebook load their inputs through files.upload()
# and relative file names; none of them reads from /content/drive — confirm the mount is needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import gspread
import pandas as pd
import numpy as np
import torch
import re
from google.colab import files
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import nltk
from scipy.special import softmax

# Method 1

In [None]:
# Open the Colab upload dialog; the next cell reads "Country_dataset.xlsx" from the working directory.
uploaded = files.upload()

In [None]:
# Load the per-country COVID-19 table. Later cells read the columns 'Country', 'Population',
# 'Confirmed 1st april', 'Confirmed 22nd June', 'Deaths 1st april' and 'Deaths 22nd June'.
coviddata = pd.read_excel("Country_dataset.xlsx")
coviddata

In [None]:
# Growth in confirmed cases between 1 April and 22 June, divided by population.
coviddata['increase_confirmed'] = (
    coviddata['Confirmed 22nd June']
    .sub(coviddata['Confirmed 1st april'])
    .div(coviddata['Population'])
)
coviddata['increase_confirmed']

In [None]:
# Growth in deaths between 1 April and 22 June, divided by population.
coviddata['increase_deaths'] = (
    coviddata['Deaths 22nd June']
    .sub(coviddata['Deaths 1st april'])
    .div(coviddata['Population'])
)
coviddata['increase_deaths']

In [None]:
# Extremes of the two per-capita increases; the normalisation cells below use them.
max3, min3 = coviddata['increase_confirmed'].max(), coviddata['increase_confirmed'].min()
max4, min4 = coviddata['increase_deaths'].max(), coviddata['increase_deaths'].min()

In [None]:
# Show the max/min of the confirmed-case increase, then the max/min of the death increase.
print(max3,min3,max4,min4)

In [None]:
# Min-max scaling of the confirmed-case increase with 0.0001 added to both the
# numerator and the denominator.
coviddata['normalized_increase_confirmed'] = (
    ((coviddata['increase_confirmed'] - min3) + 0.0001) / ((max3 - min3) + 0.0001)
)

In [None]:
# Min-max scaling of the death increase with 0.0001 added to both the
# numerator and the denominator.
coviddata['normalized_increase_deaths'] = (
    ((coviddata['increase_deaths'] - min4) + 0.0001) / ((max4 - min4) + 0.0001)
)

In [None]:
# 'intensity' corresponds to the Severity Value (SV) as described in the manuscript:
# the average of the two normalised increases.
coviddata['intensity'] = (
    coviddata['normalized_increase_confirmed']
    .add(coviddata['normalized_increase_deaths'])
    .div(2)
)
coviddata[['Country','intensity']]

## Important NOTE

Due to Twitter/X Developer Policy and privacy constraints, raw tweet text and
user location are not included in the public release.

To reproduce the results:
1. Hydrate the published Tweet IDs using the Twitter API to retrieve the  
   **tweet_text** and **Location** fields.
2. Run *Final code for publication 1.ipynb* to compute local sentiment labels and sentiment scores.
3. Run *Final code for publication 2.ipynb* and store the resulting dataset
   (e.g., `ordinalfinal.xlsx`).

The intermediate Excel file produced in Step 3 is **not included** in this
public repository for ethical reasons and to comply with Twitter/X Terms of
Service.

**All subsequent analyses assume that these steps have been completed.**


In [None]:
# Open the Colab upload dialog; the next cell reads "ordinalfinal.xlsx" from the working directory.
uploaded = files.upload()

In [None]:
# Load the tweet-level dataset. Later cells read the columns 'Country', 'pred_label', 'SV',
# 'Tweet_ID', 'tweet_text', 'intensity_label_median' and 'intensity_label_mean'.
data = pd.read_excel("ordinalfinal.xlsx")
data

In [None]:
# Joint probability P(country, positive) for each of the ten countries: the share of
# ALL tweets that come from that country and are predicted positive.
# The denominator is the number of rows actually loaded (it used to be a hard-coded
# 19622), so a re-hydrated dataset with a different size still yields valid shares.
total_tweets = len(data)
is_positive = data['pred_label'] == 'positive'
c1, c2, c3, c4, c5, c6, c7, c8, c9, c10 = (
    int((is_positive & (data['Country'] == country)).sum()) / total_tweets
    for country in (
        'canada', 'india', 'us', 'united kingdom', 'bangladesh',
        'france', 'australia', 'pakistan', 'netherlands', 'germany',
    )
)
print(c1,c2,c3,c4,c5,c6,c7,c8,c9,c10)

In [None]:
# Joint probability P(country, negative) for each of the ten countries: the share of
# ALL tweets that come from that country and are predicted negative.
# The denominator is the number of rows actually loaded (it used to be a hard-coded
# 19622), so a re-hydrated dataset with a different size still yields valid shares.
total_tweets = len(data)
is_negative = data['pred_label'] == 'negative'
d1, d2, d3, d4, d5, d6, d7, d8, d9, d10 = (
    int((is_negative & (data['Country'] == country)).sum()) / total_tweets
    for country in (
        'canada', 'india', 'us', 'united kingdom', 'bangladesh',
        'france', 'australia', 'pakistan', 'netherlands', 'germany',
    )
)
print(d1,d2,d3,d4,d5,d6,d7,d8,d9,d10)

In [None]:
# Joint probability P(country, neutral) for each of the ten countries: the share of
# ALL tweets that come from that country and are predicted neutral.
# The denominator is the number of rows actually loaded (it used to be a hard-coded
# 19622), so a re-hydrated dataset with a different size still yields valid shares.
total_tweets = len(data)
is_neutral = data['pred_label'] == 'neutral'
e1, e2, e3, e4, e5, e6, e7, e8, e9, e10 = (
    int((is_neutral & (data['Country'] == country)).sum()) / total_tweets
    for country in (
        'canada', 'india', 'us', 'united kingdom', 'bangladesh',
        'france', 'australia', 'pakistan', 'netherlands', 'germany',
    )
)
print(e1,e2,e3,e4,e5,e6,e7,e8,e9,e10)

In [None]:
# Severity value ('intensity') of each country, read from coviddata by row label 0..9.
# NOTE(review): this assumes the spreadsheet rows are ordered canada, india, us,
# united kingdom, bangladesh, france, australia, pakistan, netherlands, germany — confirm.
ca, ind, us, uk, bang, fr, aus, pak, ne, ger = (
    coviddata.loc[row_label, "intensity"] for row_label in range(10)
)
print(ca, ger)

In [None]:
# Severity-weighted country likelihoods. For each sentiment the joint share of a
# country is multiplied by that country's severity value and divided by the sum of
# the same product over all ten countries.
pos_total = (c1*ca)+(c2*ind)+(c3*us)+(c4*uk)+(c5*bang)+(c6*fr)+(c7*aus)+(c8*pak)+(c9*ne)+(c10*ger)
neg_total = (d1*ca)+(d2*ind)+(d3*us)+(d4*uk)+(d5*bang)+(d6*fr)+(d7*aus)+(d8*pak)+(d9*ne)+(d10*ger)
neu_total = (e1*ca)+(e2*ind)+(e3*us)+(e4*uk)+(e5*bang)+(e6*fr)+(e7*aus)+(e8*pak)+(e9*ne)+(e10*ger)

canadapos, canadaneg, canadaneu = (c1*ca)/pos_total, (d1*ca)/neg_total, (e1*ca)/neu_total
indiapos, indianeg, indianeu = (c2*ind)/pos_total, (d2*ind)/neg_total, (e2*ind)/neu_total
uspos, usneg, usneu = (c3*us)/pos_total, (d3*us)/neg_total, (e3*us)/neu_total
ukpos, ukneg, ukneu = (c4*uk)/pos_total, (d4*uk)/neg_total, (e4*uk)/neu_total
bangpos, bangneg, bangneu = (c5*bang)/pos_total, (d5*bang)/neg_total, (e5*bang)/neu_total
frpos, frneg, frneu = (c6*fr)/pos_total, (d6*fr)/neg_total, (e6*fr)/neu_total

auspos, ausneg, ausneu = (c7*aus)/pos_total, (d7*aus)/neg_total, (e7*aus)/neu_total
print(auspos,ausneg,ausneu)

pakpos, pakneg, pakneu = (c8*pak)/pos_total, (d8*pak)/neg_total, (e8*pak)/neu_total
nepos, neneg, neneu = (c9*ne)/pos_total, (d9*ne)/neg_total, (e9*ne)/neu_total
gerpos, gerneg, gerneu = (c10*ger)/pos_total, (d10*ger)/neg_total, (e10*ger)/neu_total


In [None]:
# Split the tweets by predicted label and count each group.
pos = data[data['pred_label']=='positive']
neu = data[data['pred_label']=='neutral']
neg = data[data['pred_label']=='negative']
lenpos, lenneu, lenneg = len(pos), len(neu), len(neg)
print(lenpos,lenneu,lenneg)

# Sentiment priors P(pos), P(neu), P(neg). The denominator is the number of rows
# actually loaded (it used to be a hard-coded 19622), so the priors remain valid
# when the re-hydrated dataset has a different size.
total_tweets = len(data)
probofpos = lenpos/total_tweets
probofneu = lenneu/total_tweets
probofneg = lenneg/total_tweets
print(probofpos,probofneu,probofneg)

In [None]:
def _pos_given_country(pos_lik, neg_lik, neu_lik):
    """Bayes posterior P(positive | country) from the country's three likelihoods."""
    weighted_pos = probofpos * pos_lik
    return weighted_pos / (weighted_pos + (probofneg * neg_lik) + (probofneu * neu_lik))


poscanada = _pos_given_country(canadapos, canadaneg, canadaneu)
posindia = _pos_given_country(indiapos, indianeg, indianeu)
posus = _pos_given_country(uspos, usneg, usneu)
posuk = _pos_given_country(ukpos, ukneg, ukneu)
posbang = _pos_given_country(bangpos, bangneg, bangneu)
posfr = _pos_given_country(frpos, frneg, frneu)
posaus = _pos_given_country(auspos, ausneg, ausneu)
pospak = _pos_given_country(pakpos, pakneg, pakneu)
posne = _pos_given_country(nepos, neneg, neneu)
posger = _pos_given_country(gerpos, gerneg, gerneu)

# One value per line, in the same country order as the assignments above.
for posterior in (poscanada, posindia, posus, posuk, posbang,
                  posfr, posaus, pospak, posne, posger):
    print(posterior)

In [None]:
def _neg_given_country(pos_lik, neg_lik, neu_lik):
    """Bayes posterior P(negative | country) from the country's three likelihoods."""
    weighted_neg = probofneg * neg_lik
    return weighted_neg / ((probofpos * pos_lik) + weighted_neg + (probofneu * neu_lik))


negcanada = _neg_given_country(canadapos, canadaneg, canadaneu)
negindia = _neg_given_country(indiapos, indianeg, indianeu)
negus = _neg_given_country(uspos, usneg, usneu)
neguk = _neg_given_country(ukpos, ukneg, ukneu)
negbang = _neg_given_country(bangpos, bangneg, bangneu)
negfr = _neg_given_country(frpos, frneg, frneu)
negaus = _neg_given_country(auspos, ausneg, ausneu)
negpak = _neg_given_country(pakpos, pakneg, pakneu)
negne = _neg_given_country(nepos, neneg, neneu)
negger = _neg_given_country(gerpos, gerneg, gerneu)

# One value per line, in the same country order as the assignments above.
for posterior in (negcanada, negindia, negus, neguk, negbang,
                  negfr, negaus, negpak, negne, negger):
    print(posterior)

In [None]:
def _neu_given_country(pos_lik, neg_lik, neu_lik):
    """Bayes posterior P(neutral | country) from the country's three likelihoods."""
    weighted_neu = probofneu * neu_lik
    return weighted_neu / ((probofpos * pos_lik) + (probofneg * neg_lik) + weighted_neu)


neucanada = _neu_given_country(canadapos, canadaneg, canadaneu)
neuindia = _neu_given_country(indiapos, indianeg, indianeu)
neuus = _neu_given_country(uspos, usneg, usneu)
neuuk = _neu_given_country(ukpos, ukneg, ukneu)
neubang = _neu_given_country(bangpos, bangneg, bangneu)
neufr = _neu_given_country(frpos, frneg, frneu)
neuaus = _neu_given_country(auspos, ausneg, ausneu)
neupak = _neu_given_country(pakpos, pakneg, pakneu)
neune = _neu_given_country(nepos, neneg, neneu)
neuger = _neu_given_country(gerpos, gerneg, gerneu)

# One value per line, in the same country order as the assignments above.
for posterior in (neucanada, neuindia, neuus, neuuk, neubang,
                  neufr, neuaus, neupak, neune, neuger):
    print(posterior)

In [None]:
# Sanity check: the three posteriors of a country should add up to 1
# (shown for france, australia and germany).
for pos_part, neg_part, neu_part in (
    (posfr, negfr, neufr),
    (posaus, negaus, neuaus),
    (posger, negger, neuger),
):
    print(pos_part + neg_part + neu_part)

In [None]:
# Attach P(positive | country) to every tweet through a country -> posterior lookup.
# Series.map yields NaN for any country outside the ten listed here and keeps the
# column numeric. The previous '' placeholder made the column object dtype, so a
# single unmatched row would make the later `>= 0.5` / `< 0.5` comparisons fail
# with a TypeError (str vs float).
data['P(pos|Country)'] = data['Country'].map({
    'canada': poscanada,
    'india': posindia,
    'us': posus,
    'united kingdom': posuk,
    'bangladesh': posbang,
    'france': posfr,
    'australia': posaus,
    'pakistan': pospak,
    'netherlands': posne,
    'germany': posger,
})


In [None]:
# Attach P(negative | country) to every tweet through a country -> posterior lookup.
# Series.map yields NaN for any country outside the ten listed here and keeps the
# column numeric. The previous '' placeholder made the column object dtype, so a
# single unmatched row would make the later `>= 0.5` / `< 0.5` comparisons fail
# with a TypeError (str vs float).
data['P(neg|Country)'] = data['Country'].map({
    'canada': negcanada,
    'india': negindia,
    'us': negus,
    'united kingdom': neguk,
    'bangladesh': negbang,
    'france': negfr,
    'australia': negaus,
    'pakistan': negpak,
    'netherlands': negne,
    'germany': negger,
})

In [None]:
# Attach P(neutral | country) to every tweet through a country -> posterior lookup.
# Series.map yields NaN for any country outside the ten listed here and keeps the
# column numeric. The previous '' placeholder made the column object dtype, so a
# single unmatched row would make the later `>= 0.5` / `< 0.5` comparisons fail
# with a TypeError (str vs float).
data['P(neu|Country)'] = data['Country'].map({
    'canada': neucanada,
    'india': neuindia,
    'us': neuus,
    'united kingdom': neuuk,
    'bangladesh': neubang,
    'france': neufr,
    'australia': neuaus,
    'pakistan': neupak,
    'netherlands': neune,
    'germany': neuger,
})

In [None]:
# Method 1 label: combine each tweet's predicted sentiment with the matching
# country posterior. A posterior >= 0.5 gives the "LOW INTENSITY" variant and a
# posterior < 0.5 the "HIGH INTENSITY" variant; rows matching neither keep ''.
data['final_sentiment'] = ''
for sentiment, posterior_column, low_label, high_label in (
    ('positive', 'P(pos|Country)', 'POSITIVE WITH LOW INTENSITY', 'POSITIVE WITH HIGH INTENSITY'),
    ('negative', 'P(neg|Country)', 'NEGATIVE WITH LOW INTENSITY', 'NEGATIVE WITH HIGH INTENSITY'),
    ('neutral', 'P(neu|Country)', 'NEUTRAL WITH LOW INTENSITY', 'NEUTRAL WITH HIGH INTENSITY'),
):
    has_sentiment = data['pred_label'] == sentiment
    data.loc[has_sentiment & (data[posterior_column] >= 0.5), 'final_sentiment'] = low_label
    data.loc[has_sentiment & (data[posterior_column] < 0.5), 'final_sentiment'] = high_label

# Method 2

In [None]:
# Method 2: flag each tweet's country as BAD (SV >= 0.5) or GOOD (SV < 0.5).
# Rows for which neither comparison holds (e.g. a missing SV) keep ''.
data['country_condition'] = ''
data.loc[data['SV'].ge(0.5), 'country_condition'] = 'BAD'
data.loc[data['SV'].lt(0.5), 'country_condition'] = 'GOOD'

In [None]:
# Share of tweets from GOOD and from BAD countries. The denominator is the number
# of rows actually loaded (it used to be a hard-coded 19622). The third printed
# value is 1 when every row received a condition.
total_tweets = len(data)
good = int((data['country_condition']=='GOOD').sum())/total_tweets
bad = int((data['country_condition']=='BAD').sum())/total_tweets
print(good,bad,good+bad)

In [None]:
# For each sentiment: rows that are GOOD and carry that sentiment, their count,
# the joint probability P(GOOD, sentiment) and the conditional P(GOOD | sentiment).
# The joint uses the number of rows actually loaded (it used to be a hard-coded
# 19622). The conditional is computed directly as count(GOOD and sentiment) /
# count(sentiment), which equals joint / prior without depending on the dataset size.
total_tweets = len(data)
in_good = data['country_condition']=='GOOD'

goodandpos = data[in_good & (data['pred_label']=='positive')]
Lgoodandpos = len(goodandpos)
Pgoodandpos = Lgoodandpos/total_tweets
Pgoodgivenpos = Lgoodandpos/int((data['pred_label']=='positive').sum())
print(Lgoodandpos,Pgoodandpos,Pgoodgivenpos)

goodandneg = data[in_good & (data['pred_label']=='negative')]
Lgoodandneg = len(goodandneg)
Pgoodandneg = Lgoodandneg/total_tweets
Pgoodgivenneg = Lgoodandneg/int((data['pred_label']=='negative').sum())
print(Lgoodandneg,Pgoodandneg,Pgoodgivenneg)

goodandneu = data[in_good & (data['pred_label']=='neutral')]
Lgoodandneu = len(goodandneu)
Pgoodandneu = Lgoodandneu/total_tweets
Pgoodgivenneu = Lgoodandneu/int((data['pred_label']=='neutral').sum())
print(Lgoodandneu,Pgoodandneu,Pgoodgivenneu)

In [None]:
# Bayes' rule: P(sentiment | GOOD) = P(sentiment) * P(GOOD | sentiment) / evidence,
# where the evidence is the sum of that product over the three sentiments.
good_evidence = (probofpos * Pgoodgivenpos)+(probofneg * Pgoodgivenneg)+(probofneu * Pgoodgivenneu)
Pposgivengood = (probofpos * Pgoodgivenpos)/good_evidence
Pneggivengood = (probofneg * Pgoodgivenneg)/good_evidence
Pneugivengood = (probofneu * Pgoodgivenneu)/good_evidence
print(Pposgivengood,Pneggivengood,Pneugivengood)

In [None]:
# For each sentiment: rows that are BAD and carry that sentiment, their count,
# the joint probability P(BAD, sentiment) and the conditional P(BAD | sentiment).
# The joint uses the number of rows actually loaded (it used to be a hard-coded
# 19622). The conditional is computed directly as count(BAD and sentiment) /
# count(sentiment), which equals joint / prior without depending on the dataset size.
total_tweets = len(data)
in_bad = data['country_condition']=='BAD'

badandpos = data[in_bad & (data['pred_label']=='positive')]
Lbadandpos = len(badandpos)
Pbadandpos = Lbadandpos/total_tweets
Pbadgivenpos = Lbadandpos/int((data['pred_label']=='positive').sum())
print(Lbadandpos,Pbadandpos,Pbadgivenpos)

badandneg = data[in_bad & (data['pred_label']=='negative')]
Lbadandneg = len(badandneg)
Pbadandneg = Lbadandneg/total_tweets
Pbadgivenneg = Lbadandneg/int((data['pred_label']=='negative').sum())
print(Lbadandneg,Pbadandneg,Pbadgivenneg)

badandneu = data[in_bad & (data['pred_label']=='neutral')]
Lbadandneu = len(badandneu)
Pbadandneu = Lbadandneu/total_tweets
Pbadgivenneu = Lbadandneu/int((data['pred_label']=='neutral').sum())
print(Lbadandneu,Pbadandneu,Pbadgivenneu)

In [None]:
# Bayes' rule: P(sentiment | BAD) = P(sentiment) * P(BAD | sentiment) / evidence,
# where the evidence is the sum of that product over the three sentiments.
bad_evidence = (probofpos * Pbadgivenpos)+(probofneg * Pbadgivenneg)+(probofneu * Pbadgivenneu)
Pposgivenbad = (probofpos * Pbadgivenpos)/bad_evidence
Pneggivenbad = (probofneg * Pbadgivenneg)/bad_evidence
Pneugivenbad = (probofneu * Pbadgivenneu)/bad_evidence
print(Pposgivenbad,Pneggivenbad,Pneugivenbad)

In [None]:
# Label tweets from GOOD countries. Each rule is (posterior, tag, texts): when the
# posterior equals the largest of the three, its tag is written to
# 'max_prob_of_public_sentiment' and the three texts are written to
# 'original_sentiment' for positive, negative and neutral tweets respectively.
# Rules are checked independently and in order, so on an exact tie the later rule
# overwrites the earlier one.
largest_good = max(Pposgivengood,Pneggivengood,Pneugivengood)
good_rows = data['country_condition']=='GOOD'
for posterior, tag, (pos_text, neg_text, neu_text) in (
    (Pposgivengood, 'Pposgivengood', (
        'WEAKLY POSITIVE WITH LOW INTENSITY',
        'STRONGLY NEGATIVE WITH HIGH INTENSITY',
        'STRONGLY NEUTRAL WITH HIGH INTENSITY',
    )),
    (Pneggivengood, 'Pneggivengood', (
        'WEAKLY POSITIVE WITH HIGH INTENSITY',
        'STRONGLY NEGATIVE WITH LOW INTENSITY',
        'STRONGLY NEUTRAL WITH HIGH INTENSITY',
    )),
    (Pneugivengood, 'Pneugivengood', (
        'WEAKLY POSITIVE WITH HIGH INTENSITY',
        'STRONGLY NEGATIVE WITH HIGH INTENSITY',
        'STRONGLY NEUTRAL WITH LOW INTENSITY',
    )),
):
    if posterior == largest_good:
        data.loc[good_rows, 'max_prob_of_public_sentiment'] = tag
        data.loc[(data['pred_label']=='positive') & good_rows, 'original_sentiment'] = pos_text
        data.loc[(data['pred_label']=='negative') & good_rows, 'original_sentiment'] = neg_text
        data.loc[(data['pred_label']=='neutral') & good_rows, 'original_sentiment'] = neu_text



In [None]:
# Label tweets from BAD countries. Each rule is (posterior, tag, texts): when the
# posterior equals the largest of the three, its tag is written to
# 'max_prob_of_public_sentiment' and the three texts are written to
# 'original_sentiment' for positive, negative and neutral tweets respectively.
# Rules are checked independently and in order, so on an exact tie the later rule
# overwrites the earlier one.
largest_bad = max(Pposgivenbad,Pneggivenbad,Pneugivenbad)
bad_rows = data['country_condition']=='BAD'
for posterior, tag, (pos_text, neg_text, neu_text) in (
    (Pposgivenbad, 'Pposgivenbad', (
        'STRONGLY POSITIVE WITH LOW INTENSITY',
        'WEAKLY NEGATIVE WITH HIGH INTENSITY',
        'STRONGLY NEUTRAL WITH HIGH INTENSITY',
    )),
    (Pneggivenbad, 'Pneggivenbad', (
        'STRONGLY POSITIVE WITH HIGH INTENSITY',
        'WEAKLY NEGATIVE WITH LOW INTENSITY',
        'STRONGLY NEUTRAL WITH HIGH INTENSITY',
    )),
    (Pneugivenbad, 'Pneugivenbad', (
        'STRONGLY POSITIVE WITH HIGH INTENSITY',
        'WEAKLY NEGATIVE WITH HIGH INTENSITY',
        'STRONGLY NEUTRAL WITH LOW INTENSITY',
    )),
):
    if posterior == largest_bad:
        data.loc[bad_rows, 'max_prob_of_public_sentiment'] = tag
        data.loc[(data['pred_label']=='positive') & bad_rows, 'original_sentiment'] = pos_text
        data.loc[(data['pred_label']=='negative') & bad_rows, 'original_sentiment'] = neg_text
        data.loc[(data['pred_label']=='neutral') & bad_rows, 'original_sentiment'] = neu_text


In [None]:
# Display the tweet table with the columns added by Method 1 and Method 2.
data

In [None]:
# Store tweet IDs as text before exporting.
# NOTE(review): presumably this is meant to keep the long IDs from being rounded or shown in
# scientific notation in Excel; if read_excel already parsed them as floats, digits may be
# lost before this cast — confirm.
data['Tweet_ID'] = data['Tweet_ID'].astype(str)

In [None]:
# Write the labelled dataset to the working directory, without the DataFrame index column.
data.to_excel("precise_sentiment1.xlsx", index=False)

In [None]:
# Trigger a browser download of the exported workbook from the Colab runtime.
files.download("precise_sentiment1.xlsx")

In [None]:
# Modelling frame: the tweet text plus the label columns evaluated below.
# .copy() makes data1 an independent frame; later cells add the 'Tweet Token' and
# 'Tweet' columns to it, which on a plain column slice of `data` triggers pandas'
# SettingWithCopyWarning.
data1 = data[['tweet_text','intensity_label_median', 'intensity_label_mean', 'original_sentiment','final_sentiment']].copy()
data1

In [None]:
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

In [None]:
def tokenization(text):
    """Replace every run of non-word characters in ``text`` with a single space.

    The text is split on the regex ``\\W+`` and the pieces are re-joined with one
    space, so punctuation at the start or end of the text leaves a leading or
    trailing space in the result.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. ``None`` or the float NaN that
        pandas uses for an empty cell) is treated as an empty tweet instead of
        raising ``TypeError`` inside ``re.split``.

    Returns
    -------
    str
        The space-separated word tokens, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    return " ".join(re.split(r'\W+', text))

In [None]:
# Normalise every tweet with tokenization() (non-word characters become spaces).
data1['Tweet Token'] = data1['tweet_text'].apply(tokenization)

In [None]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads in the next cell.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# English stop words from NLTK; "no" and "not" are whitelisted because they can
# carry sentiment and must stay in the text.
stopwords_list = stopwords.words('english')
whitelist = ["no","not"]

# Words that are actually dropped: stop words minus the whitelist. A frozenset
# gives constant-time membership tests instead of scanning the list for every word.
_removable_words = frozenset(stopwords_list) - frozenset(whitelist)


def remove_stopwords(text):
    """Drop stop words and one-character tokens from a whitespace-separated text.

    A word is kept when it is longer than one character and is either not an
    English stop word or is whitelisted ("no", "not"). Matching is exact and
    case-sensitive: the text is not lower-cased here.

    Parameters
    ----------
    text : str
        Text whose words are separated by whitespace.

    Returns
    -------
    str
        The kept words joined by single spaces.
    """
    return " ".join(
        word for word in text.split()
        if len(word) > 1 and word not in _removable_words
    )

In [None]:
# Final model input: the tokenised tweet with stop words and one-character tokens removed.
data1['Tweet'] = data1['Tweet Token'].apply(remove_stopwords)

In [None]:
# Classifiers compared on every target, keyed by the name printed in the report.
# The 'sag' solver shuffles the data while fitting, so LogisticRegression gets a
# fixed random_state (same seed as the rest of the notebook) to make its scores
# reproducible between runs.
models = {
    "LogisticRegression": LogisticRegression(solver='sag', random_state=5),
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "RandomForest": RandomForestClassifier(n_estimators=250, random_state=5)
}


In [None]:
# split only ONCE
# Single 70/30 split of the row positions of data1, made once with a fixed seed.
train_idx, test_idx = train_test_split(np.arange(len(data1)), test_size=0.3, random_state=5)

In [None]:
# Cleaned tweet text for the training rows and for the held-out rows.
X_train_text, X_test_text = (
    data1["Tweet"].iloc[row_positions] for row_positions in (train_idx, test_idx)
)

In [None]:
# Common TF-IDF for all tasks
# Common TF-IDF for all tasks
# The vectoriser is fitted on the training text only; the test text is transformed
# with that fitted vocabulary.
tfv = TfidfVectorizer()
X_train_vec = tfv.fit_transform(X_train_text)
X_test_vec = tfv.transform(X_test_text)

In [None]:
def evaluate_target_fixed_split(target_series, target_name):
    """Benchmark every classifier in ``models`` on one target column and print a report.

    The function relies on notebook globals: ``train_idx`` / ``test_idx`` (the fixed
    split), ``X_train_text`` (training tweets), ``X_train_vec`` / ``X_test_vec``
    (TF-IDF matrices fitted on the training text) and ``models``.

    For each model it prints the accuracy of a 7-fold stratified cross-validation
    run on the training rows (a fresh TF-IDF is fitted inside each fold), then
    refits the model on the whole training matrix and prints the accuracy and
    weighted F1 on the held-out rows. The estimators in ``models`` are fitted in
    place.

    Parameters
    ----------
    target_series : array-like
        Labels for all rows, in the same row order that the split indices refer to.
    target_name : str
        Name shown in the report header.

    Returns
    -------
    None
    """
    print(f" Evaluating Target Variable: {target_name}")

    # Integer-encode the labels once, then slice them with the shared split indices.
    encoded_target = LabelEncoder().fit_transform(target_series)
    y_train, y_test = encoded_target[train_idx], encoded_target[test_idx]

    # Stratified folds are drawn from the training rows only.
    folds = StratifiedKFold(n_splits=7, shuffle=True, random_state=5)

    for model_name, model in models.items():

        print(f"\n----- Model: {model_name} -----")

        fold_accuracies = []
        for fit_rows, val_rows in folds.split(X_train_text, y_train):
            # A per-fold vectoriser keeps validation text out of the fitted vocabulary.
            fold_vectorizer = TfidfVectorizer()
            fit_matrix = fold_vectorizer.fit_transform(X_train_text.iloc[fit_rows])
            val_matrix = fold_vectorizer.transform(X_train_text.iloc[val_rows])

            model.fit(fit_matrix, y_train[fit_rows])
            fold_accuracies.append(model.score(val_matrix, y_train[val_rows]))

        print("CV Accuracies (%):", np.round(np.array(fold_accuracies) * 100, 2))
        print("Mean CV Accuracy (%):", round(np.mean(fold_accuracies) * 100, 2))

        # Hold-out evaluation: refit on the full training matrix, score on the fixed test set.
        model.fit(X_train_vec, y_train)
        holdout_predictions = model.predict(X_test_vec)

        holdout_accuracy = accuracy_score(y_test, holdout_predictions)
        holdout_f1 = f1_score(y_test, holdout_predictions, average='weighted')

        print("Test Accuracy (%):", round(holdout_accuracy * 100, 2))
        print("Weighted F1:", holdout_f1)

In [None]:
# Benchmark all models on the 'intensity_label_mean' column loaded from the input file.
evaluate_target_fixed_split(data1["intensity_label_mean"], "intensity_label_mean")

In [None]:
# Benchmark all models on the Method 2 labels ('original_sentiment').
evaluate_target_fixed_split(data1["original_sentiment"], "original_sentiment")


In [None]:
# Benchmark all models on the Method 1 labels ('final_sentiment').
evaluate_target_fixed_split(data1["final_sentiment"], "final_sentiment")

In [None]:
# Class distribution of the 'intensity_label_mean' target.
data1['intensity_label_mean'].value_counts()

In [None]:
# Class distribution of the Method 2 labels; value_counts() leaves out missing values by default.
data1['original_sentiment'].value_counts()

In [None]:
# Class distribution of the Method 1 labels; rows that matched no rule are counted under ''.
data1['final_sentiment'].value_counts()