# Data stats

In [None]:
%%javascript
// Append an empty, fixed-position <div id="toc"> to the page body (left edge, 120px from the top).
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
// Fetch and execute a remote helper script; presumably it fills the #toc element with a table of contents.
// NOTE(review): this runs third-party JavaScript from an external host on every execution — confirm it is trusted.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');


In [None]:
import json
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm, trange

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified splitter used by the fold-export cells further down.
# Shuffling is not requested, so the splits are deterministic for a given input order.
skf = StratifiedKFold(n_splits=5)


In [None]:
# Global matplotlib defaults for every figure in this notebook:
# a 15x10 inch canvas and a large base font.
plt.rcParams.update({
    'figure.figsize': [15, 10],
    'font.size': 30,
})

# COVIDSenti - Sentiments

In [None]:
# sentiment
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8545013/pdf/tcss-razzak-3051189.pdf

In [None]:
# Read the COVIDSenti tweets into `df` and display the frame.
path = "../data/COVIDSenti/"
covidsenti_csv = path + 'COVIDSenti.csv'

df = pd.read_csv(covidsenti_csv)
df

In [None]:
# Same configuration as the splitter created near the top of the notebook (5 stratified folds).
skf = StratifiedKFold(n_splits=5)

In [None]:
# Export the held-out part of every stratified split as its own CSV file.
# The splits are stratified on the sentiment label.
y = df['label'].tolist()
X = df['tweet'].tolist()
# Only the test indices are used; the per-fold tally of training labels that
# used to be computed here was never read, so it has been dropped.
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    sub_df = df.iloc[test_index]
    sub_df.to_csv('../data/COVIDSenti/fold'+str(i)+'.csv', index=False)

In [None]:
# Number of tweets per sentiment label, ordered negative / neutral / positive.
sub = df['label'].tolist()
counts = list(map(sub.count, ['neg', 'neu', 'pos']))
counts

In [None]:
# Bar chart of the sentiment label counts, one coloured bar per class.
names = ['Negative', 'Neutral', 'Positive']
COLORS = ["red", "yellow", "green"]
Y = counts

sns.set_style("darkgrid")
ax = sns.barplot(x=names, y=Y, palette=COLORS)
# Annotate every bar with its value.
for container in ax.containers:
    ax.bar_label(container, fmt="%.f")

plt.show()

# stance-detection-in-covid-19-tweets - Stance

In [None]:
# stance towards a topic + sentiment
# https://aclanthology.org/2021.acl-long.127.pdf

In [None]:
# Load every non-"noisy" annotation file of the stance dataset into one frame.
path = '../data/stance-detection-in-covid-19-tweets/dataset/'

# Columns the rest of this section relies on; used only for the empty fallback.
stance_columns = ['Tweet Id', 'Target', 'Stance', 'Opinion Towards', 'Sentiment']

# Read all files first and concatenate once. This avoids repeatedly copying the
# growing frame, and avoids concatenating onto an empty placeholder frame whose
# column dtypes can upcast integer tweet ids to float (which cannot represent
# large ids exactly).
# sorted() makes the row order independent of the directory listing order.
frames = [
    pd.read_csv(os.path.join(path, f))
    for f in sorted(os.listdir(path))
    if 'noisy' not in f
]
df = pd.concat(frames) if frames else pd.DataFrame(columns=stance_columns)
tweetIds = df['Tweet Id'].tolist()


In [None]:
# Re-encode every tweet id as a plain Python int (formatted without decimals
# first, so float-typed ids are handled too) and write the result back.
tweetIds = [int(format(raw_id, '.0f')) for raw_id in df['Tweet Id'].tolist()]
df['Tweet Id'] = tweetIds
df

In [None]:
# Load the hydrated tweets: one JSON document per line, each carrying a list of
# tweet objects under the 'data' key.
# The encoding is explicit so decoding does not depend on the platform default,
# and blank lines are skipped because json.loads would fail on them.
with open('../data/stance-detection-in-covid-19-tweets/tweets3.jsonl', 'r', encoding='utf-8') as f:
    json_list = [line for line in f if line.strip()]

tweets_list = []

for json_str in json_list:
    tweets = json.loads(json_str)
    tweets_list.extend(tweets['data'])
        

In [None]:
# Split the tweet objects into two parallel lists: integer ids and texts.
t_ids, tweets = [], []
for tweet in tweets_list:
    t_ids.append(int(tweet['id']))
    tweets.append(tweet['text'])

In [None]:
# Keep only the annotated rows whose tweet text was retrieved, and attach the
# text by tweet id. Assigning the `tweets` list directly would pair texts with
# rows by position, but `tweets` follows the order of the jsonl file while the
# rows follow the order of `df`, so texts could land on the wrong rows (or the
# assignment could fail on a length mismatch).
text_by_id = dict(zip(t_ids, tweets))
# .copy() gives an independent frame, so adding a column does not write to a slice of `df`.
sub = df[df['Tweet Id'].isin(t_ids)].copy()
sub['tweet'] = sub['Tweet Id'].map(text_by_id)
sub

In [None]:
# For every stance target, export the held-out part of each stratified split
# (stratified on the stance label) into a per-target directory.
for target in sub['Target'].unique():
    tmp = sub[sub['Target']== target]
    y = tmp['Stance'].tolist()
    X = tmp['tweet'].tolist()
    out_dir = '../data/stance-detection-in-covid-19-tweets/'+str(target)
    # to_csv does not create missing directories, so make sure the target folder exists.
    os.makedirs(out_dir, exist_ok=True)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        sub_df = tmp.iloc[test_index]
        sub_df.to_csv(out_dir+'/fold'+str(i)+'.csv', index=False)

In [None]:
# Stance label counts for the 'stay_at_home_orders' target,
# ordered against / none / favor.
is_stay_at_home = sub['Target']=='stay_at_home_orders'
tmp = sub.loc[is_stay_at_home, 'Stance'].tolist()
counts = list(map(tmp.count, ['AGAINST', 'NONE', 'FAVOR']))
counts

In [None]:
# Targets present in the stance dataset (kept for reference; not used by the plot).
stances = ['stay_at_home_orders', 'fauci', 'face_masks', 'school_closures']

# Bar chart of the stance counts computed in the previous cell.
names = ['Against', 'None', 'Favor']
COLORS = ["red", "yellow", "green"]
Y = counts

sns.set_style("darkgrid")
ax = sns.barplot(x=names, y=Y, palette=COLORS)
# Annotate every bar with its value.
for container in ax.containers:
    ax.bar_label(container, fmt="%.f")

#ax.set(title="Stance towards 'stay_at_home_orders'")
plt.show()

# COVID-19 Twitter Dataset with Latent Topics - Emotion

In [None]:
# emotion
# https://arxiv.org/pdf/2007.06954.pdf

In [None]:
# Read the per-tweet keyword / sentiment / emotion table into `df` and display it.
path = '../data/covid-latent/Twitter-COVID-dataset---June2022/'
emotions_csv = path + 'tweetid_userid_keyword_sentiments_emotions.csv'

df = pd.read_csv(emotions_csv)
df

In [None]:
# All values of the 'emotion' column as a plain Python list.
ems = df['emotion'].tolist()

In [None]:
# Show the distinct values that occur in the emotion column.
set(ems)

In [None]:
# Ids of every tweet listed in the emotion table.
tweetIds = df['tweet_ID'].tolist()

In [None]:
# Load the hydrated tweets: one JSON document per line, each carrying a list of
# tweet objects under the 'data' key.
# The encoding is explicit so decoding does not depend on the platform default,
# and blank lines are skipped because json.loads would fail on them.
with open('../data/covid-latent/Twitter-COVID-dataset---June2022/tweets.jsonl', 'r', encoding='utf-8') as f:
    json_list = [line for line in f if line.strip()]

tweets_list = []
# json_list is a list, so tqdm knows the total and can show real progress.
for json_str in tqdm(json_list):
    tweets = json.loads(json_str)
    tweets_list.extend(tweets['data'])

In [None]:
# Split the tweet objects into two parallel lists: integer ids and texts.
t_ids, tweets = [], []
for tweet in tweets_list:
    t_ids.append(int(tweet['id']))
    tweets.append(tweet['text'])


In [None]:
# Rows of the emotion table whose tweet text was retrieved.
# .copy() makes this an independent frame, so adding a column to it later does
# not write to a slice of `df` (which triggers pandas' SettingWithCopyWarning).
df_data = df[df['tweet_ID'].isin(t_ids)].copy()

In [None]:
# Display the filtered emotion table.
df_data

In [None]:
# Attach each tweet's text by id. Assigning the `tweets` list directly would
# pair texts with rows by position, but `tweets` follows the order of the jsonl
# file while the rows follow the order of the emotion table.
df_data['tweet'] = df_data['tweet_ID'].map(dict(zip(t_ids, tweets)))

In [None]:
# Size of the proportional sample to draw.
num = 100000

total_rows = len(df)

# Hard-coded row counts for the four emotion codes.
# NOTE(review): presumably taken from the full dataset — confirm they match `df`.
A, F, H, S = 75824459, 64633650, 59444056, 14856610
# Whatever is left over is treated as "no emotion".
N = total_rows - A - F - H - S

# Turn each count into its share of `num`.
A, F, H, S, N = (n_rows / total_rows * num for n_rows in (A, F, H, S, N))

L = [A, F, H, S]
txt = ['A', 'F', 'H', 'S']

In [None]:
# Fixed seed so the drawn sample (and everything exported from it) is reproducible.
SEED = 42

# Draw the quota for each emotion code. The pieces are collected in a list and
# concatenated once, instead of growing a frame inside the loop.
samples = [
    df_data[df_data['emotion']==label].sample(round(quota), random_state=SEED)
    for label, quota in zip(txt, L)
]

# Rows with a missing emotion value make up the remaining quota.
samples.append(df_data[df_data['emotion'].isna()].sample(round(N), random_state=SEED))

tmp_df = pd.concat(samples)

# Shuffle so the classes are interleaved, then renumber the rows from 0.
tmp_df = tmp_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

tmp_df

In [None]:
# Emotion code per sampled row; missing values (NaN is the only value that is
# not equal to itself) are replaced by 'N'.
emotions = [e if e == e else 'N' for e in tmp_df['emotion'].tolist()]

In [None]:
# Collect the positions of at most 5000 rows per emotion code, in row order.
ids = []
counts = dict.fromkeys(['N', 'H', 'A', 'S', 'F'], 0)
for i, e in enumerate(emotions):
    if counts[e] >= 5000:
        # this class has already reached its cap
        continue
    ids.append(i)
    counts[e] += 1

In [None]:
# `ids` are row positions in `emotions`, which was built from `tmp_df`, so the
# capped subset has to be taken from `tmp_df`. (`sub` is a frame from a
# different section of the notebook.)
subset = tmp_df.iloc[ids]


In [None]:
# Export the held-out part of every stratified split of the capped subset.
# Stratification is on the emotion code, with missing values mapped to 'N'.
y = ['N' if not e==e else e for e in subset['emotion'].tolist()]
X = subset['tweet'].tolist()
out_dir = '../data/covid-latent/undersampling'
# to_csv does not create missing directories, so make sure the output folder exists.
os.makedirs(out_dir, exist_ok=True)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    sub_df = subset.iloc[test_index]
    sub_df.to_csv(out_dir+'/fold'+str(i)+'.csv', index=False)

In [None]:
# Emotion code per row of the proportional sample, with missing values mapped
# to 'N'. This reads from `tmp_df`: at this point `sub` refers to a frame from
# another section of the notebook, not to the emotion data.
emotions = ['N' if not e==e else e for e in tmp_df['emotion'].tolist()]

In [None]:
# Occurrences of each emotion code, in the order used by the plot below.
E = ['H', 'A', 'S', 'F', 'N']
counts = [emotions.count(code) for code in E]
counts

In [None]:
# Bar chart of the emotion counts computed in the previous cell.
names = ['Happiness', 'Anger', 'Sadness', 'Fear', 'None']
COLORS = ["green", "red", "yellow", "blue", "purple"]
Y = counts

sns.set_style("darkgrid")
ax = sns.barplot(x=names, y=Y, palette=COLORS)
# Annotate every bar with its value.
for container in ax.containers:
    ax.bar_label(container, fmt="%.f")

#ax.set(title='Emotion attribute distribution')
plt.show()

# Birdwatch

In [None]:
# Read the tab-separated Birdwatch notes file into `df` and display it.
notes_path = '../data/birdwatch/notes-00000.tsv'
df = pd.read_csv(notes_path, sep='\t')
df

In [None]:
# Load the tweets referenced by the notes: one JSON object per line, from which
# the 'full_text' and 'id' fields are kept as two parallel lists.
# The encoding is explicit so decoding does not depend on the platform default,
# and blank lines are skipped because json.loads would fail on them.
with open('../data/birdwatch/idtweetVALUES.jsonl', 'r', encoding='utf-8') as f:
    json_list = [line for line in f if line.strip()]

tweets_list = []
ids_list = []

for json_str in json_list:
    tweets = json.loads(json_str)
    tweets_list.append(tweets['full_text'])
    ids_list.append(tweets['id'])

In [None]:
# c[k] counts the tweets that have k+1 notes; the last slot collects every
# tweet with 6 or more notes.
c = [0, 0, 0, 0, 0, 0]
labels = ['MISINFORMED_OR_POTENTIALLY_MISLEADING', 'NOT_MISLEADING']

relevant_tweets = []
relevant_ids = []
relevant_notes = []

# Group the classifications by tweet once, instead of filtering the whole
# notes table again for every tweet.
notes_by_tweet = {
    tweet_id: group.tolist()
    for tweet_id, group in df.groupby('tweetId')['classification']
}

for tid, text in zip(ids_list, tweets_list):
    notes = notes_by_tweet.get(tid, [])
    if not notes:
        # A tweet without notes has no majority label. It must also be kept
        # out of the histogram: index 0 - 1 would wrap around to the last
        # ("6 or more") slot.
        continue
    c[min(len(notes), 6) - 1] += 1
    counts_lab = [notes.count(l) for l in labels]

    # Keep the tweet only when one label has a strict majority; ties are dropped.
    if counts_lab[0]!=counts_lab[1]:
        relevant_tweets.append(text)
        relevant_ids.append(tid)
        relevant_notes.append(labels[np.argmax(counts_lab)])

In [None]:
# One row per retained tweet: its id, its text and the majority note label.
new_df = pd.DataFrame({
    'id': relevant_ids,
    'tweet': relevant_tweets,
    'note': relevant_notes,
})
new_df

In [None]:
# Export the held-out part of every stratified split (stratified on the note label).
X = new_df['tweet'].tolist()
y = new_df['note'].tolist()
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    sub_df = new_df.iloc[test_index]
    sub_df.to_csv(f'../data/birdwatch/fold{i}.csv', index=False)
    

In [None]:
# Count the tweets per majority label, in the order of `labels`
# (misleading first, not misleading second). The notes are stored as label
# strings, so they are counted by label, and the frame built above is
# `new_df` (no `test_df` is defined in this notebook).
note_values = new_df['note'].tolist()
counts = [note_values.count(a) for a in labels]

In [None]:
# Bar chart of the two note-label counts computed in the previous cell.
names = ['Misleading', 'Not Misleading']
COLORS = ["lightcoral", "lightgreen"]
Y = counts

sns.set_style("darkgrid")
ax = sns.barplot(x=names, y=Y, palette=COLORS)
# Annotate every bar with its value.
for container in ax.containers:
    ax.bar_label(container, fmt="%.f")

#ax.set(title='Emotion attribute distribution')
plt.show()

# Russian Troll

In [None]:
# Read a single file of the troll-tweet dump into `sub`.
# NOTE(review): the loading loop in the next cell reassigns `sub` and reads this
# file again, so this read looks redundant — confirm before removing.
sub = pd.read_csv('../data/russian-troll-tweets/IRAhandle_tweets_2.csv')

In [None]:
# Read the 13 numbered files of the troll-tweet dump (with a progress bar)
# and stack them into a single frame.
dfs = [
    pd.read_csv(f'../data/russian-troll-tweets/IRAhandle_tweets_{i}.csv')
    for i in trange(1, 14)
]

df = pd.concat(dfs)
df

In [None]:
# Count the tweets per account category in a single vectorised pass.
# dropna=False keeps rows with a missing category in the total, so they are
# counted under "other".
category_counts = df['account_category'].value_counts(dropna=False)

# Look the two categories up by label instead of by their position in
# `unique()`, which depends on the order in which the categories first appear
# in the files.
# NOTE(review): assumes the labels are 'LeftTroll' and 'RightTroll' — confirm
# against df['account_category'].unique(). A different spelling raises KeyError
# here rather than silently producing wrong bars.
left_count = int(category_counts['LeftTroll'])
right_count = int(category_counts['RightTroll'])

# merge everything that is not left/right
other_count = int(category_counts.sum()) - left_count - right_count

better_counts = [left_count, other_count, right_count]
better_counts

In [None]:
# Bar chart of the left / other / right account-category counts.
names = ['Left', 'Other', 'Right']
COLORS = ["blue", "grey", "red"]
Y = better_counts

sns.set_style("darkgrid")
ax = sns.barplot(x=names, y=Y, palette=COLORS)
# Annotate every bar with its value.
for container in ax.containers:
    ax.bar_label(container, fmt="%.f")

#ax.set(title="Stance towards 'face masks'")
plt.show()