<a href="https://colab.research.google.com/github/Amanuel94/demo-2/blob/master/view_balancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Gathering Datasets



In [50]:
import numpy as np, pandas as pd
import urllib.request
import zipfile
import os
import shutil
import tarfile
import json
import itertools

## 1.1 Fake news challenge ([link](https://www.kaggle.com/datasets/abhinavkrjha/fake-news-challenge))

In [13]:
url = "https://storage.googleapis.com/kaggle-data-sets/1249857/2084561/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20230305%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20230305T183036Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=803f4dc914566d9b2f5dc5496faec02ffc2c3004753aaea5140784bc7cb77ebd5a02b2f0d59f55770f3b72fcf2c94b326cffc62000d5d58078cceda2c9f6e4e7ec5b634bc50075a2938dd7456a1de56dfd998d3cc58031b7986ec2e1b4afdf80937f2d826b1fbbcda56b4b2d4a0ae59b84518b49974162e50d0e032928c802a5850837685c9ec5677a3d78ea78e025d4f90a36a4d5ad849e7c4ee144280f27b3eaee3353b9d9a05b33e9b0999d1d9b99bff6bd40ce1b76fae0097441417bc0a505cc12826b9c98dc8c2569e7ef637c79f630b42cc007dcc1d9fbb103647c6ef5910e87b2549e5f9e08e6cd76d56b45e31bf7eb8e38d1db215a39d41230accdbf"
name = "fake_news.zip"
# NOTE(review): `url` above is a signed link carrying X-Goog-Date / X-Goog-Expires
# parameters; it presumably stops working once that window has passed - confirm.
urllib.request.urlretrieve(url, name)

# os.makedirs(..., exist_ok=True) keeps the cell re-runnable (a second `mkdir` would complain).
os.makedirs("fake_news", exist_ok=True)

# Open the very file that was just downloaded instead of a hard-coded absolute path,
# so the cell works from any working directory.
with zipfile.ZipFile(name, 'r') as f:
  f.extractall('./fake_news')

In [37]:
# Read the two training tables and join them on their shared 'Body ID' column.
train_bodies = pd.read_csv('./fake_news/train_bodies.csv')
train_stances = pd.read_csv('./fake_news/train_stances.csv')
train_pairs = train_bodies.merge(train_stances, on='Body ID')

# Keep only the rows whose 'Stance' is 'disagree'.
is_disagree = train_pairs['Stance'].eq('disagree')
train_ = train_pairs.loc[is_disagree]

train_.shape

(840, 4)

In [None]:
# Select the 'agree' and 'disagree' headlines with boolean masks instead of a
# row-by-row `iloc` loop: same rows in the same order, far faster, and the
# frames keep their columns even when a stance has no rows (so the merge
# below cannot fail on a missing 'Body ID' column).
pair_columns = ['Body ID', 'Headline']

support_df = (
    train_pairs.loc[train_pairs['Stance'] == 'agree', pair_columns]
    .rename(columns={'Headline': 'support'})
)
deny_df = (
    train_pairs.loc[train_pairs['Stance'] == 'disagree', pair_columns]
    .rename(columns={'Headline': 'deny'})
)

# Every agreeing headline is paired with every disagreeing headline of the
# same body; the join key itself is dropped from the result.
merged_df = (
    pd.merge(support_df, deny_df, on='Body ID', how="inner")
    .drop(columns=['Body ID'])
)


In [35]:
merged_df.shape

(3580, 2)

In [39]:
# Create the output directory in pure Python; exist_ok=True makes the cell safe to re-run.
os.makedirs("gathered_data", exist_ok=True)

# Both files are written with the DataFrame index as the first (unnamed) column.
train_.to_csv("./gathered_data/fake-news-challenge.csv")
merged_df.to_csv("./gathered_data/fake-news-challenge-extended.csv")


## 1.2 RumourEval2019 ([link to paper](https://www.researchgate.net/publication/345434708_SemEval-2019_Task_7_RumourEval_Determining_Rumour_Veracity_and_Support_for_Rumours))

In [40]:
!git clone https://github.com/kochkinaelena/RumourEval2019.git

Cloning into 'RumourEval2019'...
remote: Enumerating objects: 147, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 147 (delta 1), reused 6 (delta 0), pack-reused 140[K
Receiving objects: 100% (147/147), 14.96 MiB | 16.65 MiB/s, done.
Resolving deltas: 100% (65/65), done.


In [42]:
url = 'https://figshare.com/ndownloader/files/16188500'
filename = 'rumoureval2019.tar.bz2'

urllib.request.urlretrieve(url, filename)

with tarfile.open(filename, 'r:bz2') as tar_ref:
    tar_ref.extractall('.')

# The tarball was unpacked into the current directory, so the nested training
# zip is addressed relative to it as well (no hard-coded /content prefix).
with zipfile.ZipFile('./rumoureval2019/rumoureval-2019-training-data.zip', 'r') as zip_ref:
  zip_ref.extractall('.')

# Move the cloned repository's preprocessing folder next to the data.
# shutil.move places the folder inside the existing directory '.'; the guard
# avoids an error when the folder has already been moved by an earlier run.
if not os.path.isdir('./preprocessing'):
  shutil.move('./RumourEval2019/preprocessing', '.')

# The cells below import from, and use paths relative to, this folder.
os.chdir('./preprocessing/')


The following cell is adapted from the `preprocessing_tweets.py` file found in the GitHub repository referenced above.

In [48]:
from tree2branches import tree2branches
def load_true_labels():
    """Load the 'subtaskaenglish' labels of the dev and train key files.

    The key files are read from '../rumoureval-2019-training-data/' relative
    to the current working directory.

    Returns:
        dict with the keys 'dev' and 'train' (in that order); each value is
        the 'subtaskaenglish' entry of the corresponding JSON key file.

    Raises:
        OSError: if a key file cannot be opened.
        KeyError: if a key file has no 'subtaskaenglish' entry.
    """
    key_paths = {
        'dev': "../rumoureval-2019-training-data/dev-key.json",
        'train': "../rumoureval-2019-training-data/train-key.json",
    }

    tweet_label_dict = {}
    for split, key_path in key_paths.items():
        with open(key_path, 'r') as f:
            tweet_label_dict[split] = json.load(f)['subtaskaenglish']

    return tweet_label_dict
def load_dataset():
    """Read the Twitter conversation threads and group them by data split.

    Walks '../rumoureval-2019-training-data/twitter-english' (relative to the
    current working directory). Every visible sub-directory is a fold, and
    every visible sub-directory of a fold is one conversation thread whose
    directory name is used as the key of its root in 'structure.json'.

    Returns:
        dict with the keys 'dev', 'train' and 'test', each mapping to a list
        of conversation dicts ('test' is created but never filled here).
        A conversation dict has the keys:
          'id'        - thread directory name
          'replies'   - list of reply tweet dicts (empty when the thread has
                        no reply files); labelled replies carry 'set' and
                        'label', every reply carries 'used' = 0
          'source'    - source tweet dict with 'used', 'set' and 'label' added
          'structure' - reply tree, reduced to the single root of the thread
          'branches'  - result of tree2branches(structure)

    Raises:
        KeyError: if the source tweet has no label in the split chosen for
            the thread, or if a thread has reply files but none of its
            replies is labelled (the split is then undetermined).
        OSError: if an expected directory or file is missing.
    """
    # Labels for task A, per split
    tweet_label_dict = load_true_labels()
    dev = tweet_label_dict['dev']
    train = tweet_label_dict['train']
    dev_tweets = dev.keys()
    train_tweets = train.keys()

    path_to_folds = '../rumoureval-2019-training-data/twitter-english'
    # Hidden entries (names starting with '.') are ignored at every level
    folds = [i for i in sorted(os.listdir(path_to_folds)) if i[0] != '.']

    train_dev_split = {'dev': [], 'train': [], 'test': []}

    for fold in folds:
        path_to_tweets = os.path.join(path_to_folds, fold)
        tweet_data = [i for i in sorted(os.listdir(path_to_tweets)) if i[0] != '.']

        # tweet_data holds the thread directories of this fold
        for foldr in tweet_data:
            # A fresh dict per thread, so a thread without replies can never
            # inherit the 'replies' list of the thread processed before it.
            conversation = {'id': foldr}
            flag = 0  # becomes 'dev' or 'train' once a labelled reply is seen
            tweets = []
            path_repl = path_to_tweets+'/'+foldr+'/replies'
            files_t = [i for i in sorted(os.listdir(path_repl)) if i[0] != '.']
            has_replies = files_t != []

            for repl_file in files_t:
                with open(os.path.join(path_repl, repl_file)) as f:
                    for line in f:
                        tw = json.loads(line)
                        tw['used'] = 0
                        replyid = tw['id_str']
                        if replyid in dev_tweets:
                            tw['set'] = 'dev'
                            tw['label'] = dev[replyid]
                            if flag == 'train':
                                print("The tree is split between sets", foldr)
                            flag = 'dev'
                        elif replyid in train_tweets:
                            tw['set'] = 'train'
                            tw['label'] = train[replyid]
                            if flag == 'dev':
                                print("The tree is split between sets", foldr)
                            flag = 'train'
                        else:
                            print("Tweet was not found! ID: ", foldr)
                        tweets.append(tw)
                        if tw['text'] is None:
                            print("Tweet has no text", tw['id'])
            conversation['replies'] = tweets

            if not has_replies:
                # Threads without any reply file are assigned to the train split
                flag = 'train'

            # The first entry of the source-tweet directory is the source tweet
            path_src = path_to_tweets+'/'+foldr+'/source-tweet'
            files_t = sorted(os.listdir(path_src))
            with open(os.path.join(path_src, files_t[0])) as f:
                for line in f:
                    src = json.loads(line)
                    src['used'] = 0
                    scrcid = src['id_str']
                    src['set'] = flag
                    src['label'] = tweet_label_dict[flag][scrcid]

            conversation['source'] = src
            if src['text'] is None:
                print("Tweet has no text", src['id'])

            path_struct = path_to_tweets+'/'+foldr+'/structure.json'
            with open(path_struct) as f:
                for line in f:
                    struct = json.loads(line)

            if len(struct) > 1:
                # Structure has more than one root: keep only the subtree
                # whose key is the thread id.
                new_struct = {}
                new_struct[foldr] = struct[foldr]
                if has_replies and foldr == '553480082996879360':
                    # This thread is patched by hand: two replies stored under
                    # other roots are re-attached beneath the thread's root.
                    new_struct[foldr]['553495625527209985'] = struct['553485679129534464']['553495625527209985']
                    new_struct[foldr]['553495937432432640'] = struct['553490097623269376']['553495937432432640']
                struct = new_struct
            conversation['structure'] = struct

            conversation['branches'] = tree2branches(conversation['structure'])
            train_dev_split[flag].append(conversation)

            if not has_replies:
                # Report threads that have no replies
                print(foldr)

    return train_dev_split

In [None]:
# Parse the dataset once and reuse the result for both splits.
dataset_split = load_dataset()
tweet_data_dev  = dataset_split['dev']
tweet_data_train = dataset_split['train']

# One [source text, reply text] row per reply labelled 'deny';
# train threads first, then dev threads.
# .get() is used because replies that were not found in either key file carry no 'label'.
source_stance = [
  [tweet['source']['text'], rep['text']]
  for tweet in itertools.chain(tweet_data_train, tweet_data_dev)
  for rep in tweet['replies']
  if rep.get('label') == 'deny'
]

tweet_arr = np.array(source_stance)
tweet_df = pd.DataFrame(tweet_arr)

tweet_df.to_csv('../gathered_data/rumour-eval-2019.csv', index = False)

The CSV file above contains two columns: the source tweet and a reply labelled `deny` for that tweet (one row per such reply). An alternative way of processing the data is provided below.

In [52]:
# For every thread (train first, then dev) pair each supporting statement
# with each denying reply. A supporting statement is the source tweet text
# followed by ". " and the text of a reply labelled 'support'.
source_stance_extended = []
for tweet in itertools.chain(tweet_data_train, tweet_data_dev):
  support = []
  deny = []
  for rep in tweet['replies']:
    if rep['label'] == 'deny':
      deny.append(rep['text'])
    if rep['label'] == 'support':
      support.append(tweet['source']['text']+". "+rep['text'])

  # The product is built once per thread, after all of its replies are sorted
  # into the two lists, so a thread without replies contributes nothing.
  for tup in itertools.product(support, deny):
    source_stance_extended.append(list(tup))

extended_num = np.array(source_stance_extended)
extended_df = pd.DataFrame(extended_num)

# Write the extended pairs (not the plain source/deny frame) to the extended file.
extended_df.to_csv('../gathered_data/rumour-eval-2019-extended.csv', index = False)

In [None]:
# Step back out of ./preprocessing so the relative paths used below resolve from its parent.
os.chdir(os.pardir)

## 1.3 Perspectrum ([link](https://github.com/CogComp/perspectrum))


In [None]:
!git clone https://github.com/CogComp/perspectrum.git

path  = "/content/perspectrum/data/dataset/"
with open(path + "perspectrum_with_answers_v1.0.json") as f:
  dataset = json.load(f)

with open(path + "perspective_pool_v1.0.json") as f:
  perspectives = json.load(f)

In [56]:
def findByPids(pids, perspectives):
  """Return the texts of all perspectives whose 'pId' is contained in `pids`.

  Args:
    pids: collection of perspective ids to look up.
    perspectives: iterable of dicts that each have a 'pId' and a 'text' key.

  Returns:
    list of 'text' values in the order the matches appear in `perspectives`
    (the order of `pids` does not matter). Empty when nothing matches.
  """
  lookup = pids
  if isinstance(pids, (list, tuple)):
    # Hash-based membership avoids rescanning the id list for every pool entry.
    try:
      lookup = frozenset(pids)
    except TypeError:
      # Unhashable ids: keep the plain sequence and its linear membership test
      lookup = pids

  return [per['text'] for per in perspectives if per['pId'] in lookup]

In [59]:
# Step 1: for each entry, gather the pids of all perspectives labelled UNDERMINE;
# entries without any such perspective are skipped.
data_arr = []
for claim in dataset:
  opp = set()
  for pers in claim['perspectives']:
    if pers['stance_label_3'] == "UNDERMINE":
      opp = opp | set(pers['pids'])
  if opp:
    data_arr.append([claim['text'], list(opp)])

# Step 2: resolve the pids to perspective texts, one [entry text, perspective text] row each.
data_ = [
  [claim_text, pers_text]
  for claim_text, opp_pids in data_arr
  for pers_text in findByPids(opp_pids, perspectives)
]

persp_df = pd.DataFrame(data_)
persp_df.to_csv('./gathered_data/perspectrum.csv', index = False)

In [60]:
# Step 1: per entry, gather the pids labelled UNDERMINE and those labelled SUPPORT;
# entries without any UNDERMINE perspective are skipped.
extended_arr = []
for claim in dataset:
  opp, sim = set(), set()
  for pers in claim['perspectives']:
    stance = pers['stance_label_3']
    if stance == "UNDERMINE":
      opp = opp | set(pers['pids'])
    if stance == "SUPPORT":
      sim = sim | set(pers['pids'])
  if opp:
    extended_arr.append([claim['text'], list(opp), list(sim)])

# Step 2: emit [entry text, undermining text] rows, followed by every
# [supporting text, undermining text] combination of the same entry.
expanded = []
for claim_text, opp_pids, sim_pids in extended_arr:
  undermine = findByPids(opp_pids, perspectives)
  support = findByPids(sim_pids, perspectives)

  expanded.extend([claim_text, pers_text] for pers_text in undermine)
  expanded.extend(list(pair) for pair in itertools.product(support, undermine))

extended_df = pd.DataFrame(expanded)
extended_df.to_csv('./gathered_data/perspectrum-extended.csv', index = False)

## 1.4 Papers with Code ([link](https://paperswithcode.com/task/stance-detection))



### 1.4.1 VAST

In [64]:
!git clone https://github.com/emilyallaway/zero-shot-stance.git

Cloning into 'zero-shot-stance'...
remote: Enumerating objects: 93, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 93 (delta 5), reused 3 (delta 3), pack-reused 69[K
Unpacking objects: 100% (93/93), 63.59 MiB | 7.23 MiB/s, done.
Updating files: 100% (54/54), done.


In [65]:
VAST_df = pd.read_csv("/content/zero-shot-stance/data/VAST/vast_train.csv")

# Select rows with boolean masks instead of a row-by-row `iloc` loop: same rows
# in the same order, much faster, and the frames keep their columns even when
# a label has no rows (so the merge cannot fail on a missing 'topic' column).
vast_pairs = VAST_df[['new_topic', 'post']]

# Posts with label 1 go to the 'support' column, posts with label 0 to 'deny'.
support_df = vast_pairs[VAST_df['label'] == 1].rename(columns = {'new_topic': "topic", 'post': "support"})
deny_df = vast_pairs[VAST_df['label'] == 0].rename(columns = {'new_topic': "topic", 'post': "deny"})

# Pair every supporting post with every denying post on the same topic.
merged_df = pd.merge(support_df, deny_df, on = "topic", how = 'inner').drop_duplicates()
merged_df.to_csv('./gathered_data/vast.csv')


### 1.4.2 OpenStance

In [66]:
!git clone https://github.com/xhz0809/OpenStance.git

sem_df = pd.read_csv("/content/OpenStance/data/SemT6/original_dataset/trainingdata-all-annotations.txt",delimiter = '\t')

support = []
deny = []
for row in range(sem_df.shape[0]):
  if sem_df.iloc[row]['Stance'] == 'AGAINST':
    deny.append([sem_df.iloc[row]['Target'], sem_df.iloc[row]['Tweet']])
  if sem_df.iloc[row]['Stance'] == 'FAVOR':
    support.append([sem_df.iloc[row]['Target'], sem_df.iloc[row]['Tweet']]) 

support_df = pd.DataFrame(support)
deny_df = pd.DataFrame(deny)

support_df = support_df.rename(columns = {0:"topic", 1:"support"})
deny_df = deny_df.rename(columns = {0:"topic", 1:"deny"})

merged_df = pd.merge(support_df, deny_df, on = "topic", how = 'inner').drop_duplicates()

merged_df.to_csv('./gathered_data/semt6.csv')

Cloning into 'OpenStance'...
remote: Enumerating objects: 153, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 153 (delta 8), reused 0 (delta 0), pack-reused 130[K
Receiving objects: 100% (153/153), 10.29 MiB | 5.05 MiB/s, done.
Resolving deltas: 100% (63/63), done.


In [68]:
# NOTE(review): file_exists is not used in this cell; the import is kept in case other cells rely on it.
from pandas.io.common import file_exists

# Named gathered_dir rather than `dir`, so the builtin dir() is not shadowed for the rest of the session.
gathered_dir = '/content/gathered_data'
file_ = '/content/gathered_data.zip'

# ZIP_DEFLATED compresses the members; without it ZipFile only stores them uncompressed.
with zipfile.ZipFile(file_, 'w', compression=zipfile.ZIP_DEFLATED) as zip_object:

    # Add every file found below gathered_dir; members are stored under the
    # path they are written with.
    for foldername, subfolders, filenames in os.walk(gathered_dir):
        for filename in filenames:
            file_path = os.path.join(foldername, filename)

            zip_object.write(file_path)
