In [1]:
import pandas as pd
import numpy as np

from collections import Counter

import statsmodels.api as sm

from tqdm.auto import tqdm
tqdm.pandas()

## Reading the data

In [2]:
df = pd.read_csv('Tweets - Sentiment Analysis (RoBERTa) Raw Values.csv', lineterminator='\n')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3735462 entries, 0 to 3735461
Data columns (total 12 columns):
 #   Column    Dtype  
---  ------    -----  
 0   tweetid   int64  
 1   text      object 
 2   hashtags  object 
 3   language  object 
 4   date      object 
 5   negative  float64
 6   neutral   float64
 7   positive  float64
 8   anger     float64
 9   joy       float64
 10  optimism  float64
 11  sadness   float64
dtypes: float64(7), int64(1), object(4)
memory usage: 342.0+ MB


In [4]:
df.head()

Unnamed: 0,tweetid,text,hashtags,language,date,negative,neutral,positive,anger,joy,optimism,sadness
0,1499174584720969730,Map situation in #Ukraine after the seventh da...,"['Ukraine', 'RussiaUkraineConflict']",en,MAR03,0.237624,1.357651,-1.783151,1.320624,-1.717873,-0.575291,1.707901
1,1499174584976826368,#Ukraine: Let's just say it's not just the TB-...,['Ukraine'],en,MAR03,0.94736,0.889939,-2.091296,1.598669,-0.535707,0.341185,-0.105763
2,1499174585073242116,⚡️The SWIFT company confirmed that it will dis...,"['EU', 'Russian']",en,MAR03,-0.295086,1.447142,-1.220792,2.702571,-1.68421,-0.772874,0.6065
3,1499174585987600384,#Ukraine: Ukrainian forces recovered a Eniks E...,['Ukraine'],en,MAR03,-1.926291,1.267447,0.810379,1.197331,0.001189,0.04589,0.067511
4,1499174586159665155,Volunteers needed for a rapid-response #DH #Di...,"['DH', 'DigitalHumanities', 'CulturalHeritage']",en,MAR03,-1.135949,1.682476,-0.762521,0.115792,-0.28695,0.58314,0.51661


In [5]:
df['hashtags'] = df.hashtags.progress_apply(eval)

  0%|          | 0/3735462 [00:00<?, ?it/s]

## Getting popular hashtags

In [6]:
hashtags = df.hashtags.to_list()
hashtags = [j for i in hashtags for j in i]  # Flattening

In [7]:
hashtag_occurances = Counter(hashtags)
hashtag_occurances = Counter({k: c for k, c in hashtag_occurances.items() if c >= 500})  # Only keeping popular hashtags

In [8]:
print("Top 10 hashtags")
hashtag_occurances.most_common(10)

Top 10 hashtags


[('Ukraine', 1361248),
 ('Russia', 652633),
 ('Putin', 414755),
 ('StopPutin', 201149),
 ('SafeAirliftUkraine', 159124),
 ('Russian', 152752),
 ('UkraineRussianWar', 147350),
 ('UkraineRussiaWar', 127101),
 ('StandWithUkraine', 126355),
 ('Kyiv', 109992)]

In [9]:
popular_hashtags = list(dict(hashtag_occurances.most_common(40)).keys())

## Transforming the dataframe

In [10]:
df.columns

Index(['tweetid', 'text', 'hashtags', 'language', 'date', 'negative',
       'neutral', 'positive', 'anger', 'joy', 'optimism', 'sadness'],
      dtype='object')

In [11]:
df = df.explode('hashtags')
df = df[['date', 'hashtags', 'negative', 'neutral', 'positive', 'anger', 'joy', 'optimism', 'sadness']]
df.rename(columns={"hashtags": "hashtag"}, inplace=True)
df = df[df.hashtag.isin(popular_hashtags)]
df.reset_index(drop=True, inplace=True)

In [12]:
df

Unnamed: 0,date,hashtag,negative,neutral,positive,anger,joy,optimism,sadness
0,MAR03,Ukraine,0.237624,1.357651,-1.783151,1.320624,-1.717873,-0.575291,1.707901
1,MAR03,Ukraine,0.947360,0.889939,-2.091296,1.598669,-0.535707,0.341185,-0.105763
2,MAR03,EU,-0.295086,1.447142,-1.220792,2.702571,-1.684210,-0.772874,0.606500
3,MAR03,Russian,-0.295086,1.447142,-1.220792,2.702571,-1.684210,-0.772874,0.606500
4,MAR03,Ukraine,-1.926291,1.267447,0.810379,1.197331,0.001189,0.045890,0.067511
...,...,...,...,...,...,...,...,...,...
4765915,MAR05,NATO,0.844373,0.767706,-1.969419,2.737618,-1.661515,0.054990,-0.094986
4765916,MAR05,Ukraine,1.888199,0.310074,-2.671582,2.163848,-2.166087,-0.164755,0.844673
4765917,MAR05,Putin,1.888199,0.310074,-2.671582,2.163848,-2.166087,-0.164755,0.844673
4765918,MAR05,SafeAirliftUkraine,1.888199,0.310074,-2.671582,2.163848,-2.166087,-0.164755,0.844673


In [13]:
df = pd.get_dummies(df,columns=['hashtag'],drop_first=True)

## Getting the top hashtags for each sentiment

In [14]:
sentiment_cols = ['negative', 'neutral', 'positive', 'anger', 'joy', 'optimism', 'sadness']
hashtag_cols = [col for col in df if col.startswith('hashtag')]

In [15]:
top_hashtags = {}
for sentiment in tqdm(sentiment_cols):
    print("**********************")
    print(f"Analyzing {sentiment}")
    result = sm.OLS(df[sentiment], df[hashtag_cols]).fit()
    print(result.summary())
    top_hashtags[sentiment] = result.params.sort_values(ascending=False).head(5).to_dict()

  0%|          | 0/7 [00:00<?, ?it/s]

**********************
Analyzing negative
                                 OLS Regression Results                                
Dep. Variable:               negative   R-squared (uncentered):                   0.273
Model:                            OLS   Adj. R-squared (uncentered):              0.273
Method:                 Least Squares   F-statistic:                          4.599e+04
Date:                Thu, 17 Mar 2022   Prob (F-statistic):                        0.00
Time:                        17:59:45   Log-Likelihood:                     -8.0505e+06
No. Observations:             4765920   AIC:                                  1.610e+07
Df Residuals:                 4765881   BIC:                                  1.610e+07
Df Model:                          39                                                  
Covariance Type:            nonrobust                                                  
                                  coef    std err          t      P>|t|      [

                                 OLS Regression Results                                
Dep. Variable:               positive   R-squared (uncentered):                   0.613
Model:                            OLS   Adj. R-squared (uncentered):              0.613
Method:                 Least Squares   F-statistic:                          1.938e+05
Date:                Thu, 17 Mar 2022   Prob (F-statistic):                        0.00
Time:                        18:00:08   Log-Likelihood:                     -7.9437e+06
No. Observations:             4765920   AIC:                                  1.589e+07
Df Residuals:                 4765881   BIC:                                  1.589e+07
Df Model:                          39                                                  
Covariance Type:            nonrobust                                                  
                                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------

                                 OLS Regression Results                                
Dep. Variable:                    joy   R-squared (uncentered):                   0.459
Model:                            OLS   Adj. R-squared (uncentered):              0.459
Method:                 Least Squares   F-statistic:                          1.038e+05
Date:                Thu, 17 Mar 2022   Prob (F-statistic):                        0.00
Time:                        18:00:30   Log-Likelihood:                     -7.3337e+06
No. Observations:             4765920   AIC:                                  1.467e+07
Df Residuals:                 4765881   BIC:                                  1.467e+07
Df Model:                          39                                                  
Covariance Type:            nonrobust                                                  
                                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------

                                 OLS Regression Results                                
Dep. Variable:                sadness   R-squared (uncentered):                   0.122
Model:                            OLS   Adj. R-squared (uncentered):              0.122
Method:                 Least Squares   F-statistic:                          1.691e+04
Date:                Thu, 17 Mar 2022   Prob (F-statistic):                        0.00
Time:                        18:00:52   Log-Likelihood:                     -6.1267e+06
No. Observations:             4765920   AIC:                                  1.225e+07
Df Residuals:                 4765881   BIC:                                  1.225e+07
Df Model:                          39                                                  
Covariance Type:            nonrobust                                                  
                                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------

In [16]:
for sentiment, hashtags in top_hashtags.items():
    print(f"{sentiment}: {[k.split('_')[1] for k,v in list(hashtags.items()) if v > 0]}")

negative: ['Mariupol', 'SafeAirliftUkraine', 'StopPutin', 'UkraineUnderAttack', 'Putin']
neutral: ['BREAKING', 'EU', 'China', 'US', 'NATO']
positive: []
anger: ['UKRAINE', 'StopRussia', 'StopPutin', 'putin', 'RussianUkrainianWar']
joy: ['SlavaUkraini', 'Zelenskyy']
optimism: ['StandWithUkraine', 'StandWithUkraine️', 'China', 'EU', 'SafeAirliftUkraine']
sadness: ['Mariupol', 'SafeAirliftUkraine', 'UkraineUnderAttack', 'BREAKING', 'Kharkiv']
