<a href="https://colab.research.google.com/github/Confirmation-Bias-Analyser/Data-Collection/blob/main/Twitter_Data_Collection_from_PHEME_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install anytree
!pip install vaderSentiment

# Import Essential Libraries

In [2]:
import tensorflow as tf
import pandas as pd
from google.colab import files
from anytree import Node, RenderTree, search
import random
import json

# The shutil module offers a number of high-level 
# operations on files and collections of files.
import os
import shutil
import json

In [3]:
def createTweetsTree(dictionary, tree_root):
    for key, item in dictionary.items():
        child = Node(key, parent=tree_root)

        if len(dictionary[key]) != 0:      
            createTweetsTree(dictionary[key], child)

        else:
            continue

def processTweetsToDict(conversation_data):
    conversation_dict = {'id':[], 'timestamp':[], 'reply_to':[], 'comment':[]}

    for i in conversation_data:
        print('User ID:', i['id'], 
              'Time:', i['user']['created_at'])
        print('In reply to:', i['in_reply_to_status_id'])
        print(i['text'], '\n')

        conversation_dict['id'].append(i['id'])
        conversation_dict['timestamp'].append(i['user']['created_at'])
        conversation_dict['reply_to'].append(i['in_reply_to_status_id'])
        conversation_dict['comment'].append(i['text'])

    return conversation_dict            

# Extract Data from PHEME Dataset

In [4]:
!wget "https://figshare.com/ndownloader/files/11767817/PHEME_veracity.tar.bz2"

--2022-05-22 15:48:52--  https://figshare.com/ndownloader/files/11767817/PHEME_veracity.tar.bz2
Resolving figshare.com (figshare.com)... 54.72.163.193, 52.214.105.240, 2a05:d018:1f4:d003:e08:f20f:4c26:f45c, ...
Connecting to figshare.com (figshare.com)|54.72.163.193|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/11767817/PHEME_veracity.tar.bz2?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20220522/eu-west-1/s3/aws4_request&X-Amz-Date=20220522T154852Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=59f3231f36dfcdab30081f7bec4036b6b715940385b2b8db997abdf23330ed86 [following]
--2022-05-22 15:48:52--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/11767817/PHEME_veracity.tar.bz2?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20220522/eu-west-1/s3/aws4_request&X-Amz-Date=20220522T154852Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signatur

In [5]:
URL = "https://figshare.com/ndownloader/files/11767817/PHEME_veracity.tar.bz2"

dataset = tf.keras.utils.get_file(fname="PHEME_veracity.tar.bz2", 
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

In [6]:
# Unzip extracted file
!tar xzf '/content/PHEME_veracity.tar.bz2.tar.gz'

# Import extracted dataset and understand directory

In [7]:
threads = []
data = ['non-rumours', 'rumours']

for i in os.listdir('/content/all-rnr-annotated-threads'):
  if i[0] != '.':
    print(i)
    threads.append(i)

ferguson-all-rnr-threads
germanwings-crash-all-rnr-threads
gurlitt-all-rnr-threads
ottawashooting-all-rnr-threads
sydneysiege-all-rnr-threads
putinmissing-all-rnr-threads
ebola-essien-all-rnr-threads
charliehebdo-all-rnr-threads
prince-toronto-all-rnr-threads


Understand the number of files in each sub-directory.

In [8]:
rumours = 0
non_rumours = 0

for i in threads:
  path = '/content/all-rnr-annotated-threads/' + i
  print(i)

  for j in os.listdir(path):
    
    for k in data:

      for l in os.listdir(path + '/' + k):
        if k == data[0] and l[0] != '.':
          non_rumours += 1

        elif k == data[1] and l[0] != '.':
          rumours += 1
  
  print('Rumours:', rumours)
  print('Non-rumours:', non_rumours)
  print()

ferguson-all-rnr-threads
Rumours: 1704
Non-rumours: 5154

germanwings-crash-all-rnr-threads
Rumours: 3132
Non-rumours: 6540

gurlitt-all-rnr-threads
Rumours: 3498
Non-rumours: 7002

ottawashooting-all-rnr-threads
Rumours: 6318
Non-rumours: 9522

sydneysiege-all-rnr-threads
Rumours: 9450
Non-rumours: 13716

putinmissing-all-rnr-threads
Rumours: 10206
Non-rumours: 14388

ebola-essien-all-rnr-threads
Rumours: 10290
Non-rumours: 14388

charliehebdo-all-rnr-threads
Rumours: 13038
Non-rumours: 24114

prince-toronto-all-rnr-threads
Rumours: 14412
Non-rumours: 24138



# Read sample file from sub-directory

In [9]:
sample_directory = f'/content/all-rnr-annotated-threads/{random.choice(threads)}/{random.choice(data)}/'

reactions = []

for i in os.listdir(sample_directory):
  if i[0] != '.':
    sample_directory_reactions = sample_directory + i + '/reactions/'

    for j in os.listdir(sample_directory_reactions):
      if j[0] != '.':
        reactions.append(j)

    if len(reactions) < 1:
      reactions = []
      continue
      
    else:
      break

print(sample_directory_reactions)

/content/all-rnr-annotated-threads/charliehebdo-all-rnr-threads/rumours/552791182670327808/reactions/


Tweets are saved as JSON files in the sub-directories. We are able to view the attributes of the JSON file.

In [10]:
json_file = sample_directory_reactions + random.choice(reactions)
print(json_file)

with open(json_file, 'r') as j:
     contents = json.loads(j.read())

contents

/content/all-rnr-annotated-threads/charliehebdo-all-rnr-threads/rumours/552791182670327808/reactions/552812150037958656.json


{'contributors': None,
 'coordinates': None,
 'created_at': 'Wed Jan 07 13:01:01 +0000 2015',
 'entities': {'hashtags': [],
  'symbols': [],
  'urls': [],
  'user_mentions': [{'id': 733238869,
    'id_str': '733238869',
    'indices': [0, 13],
    'name': 'Royston Price',
    'screen_name': 'RoystonPrice'},
   {'id': 283640557,
    'id_str': '283640557',
    'indices': [14, 30],
    'name': 'Tim Stanley',
    'screen_name': 'timothy_stanley'}]},
 'favorite_count': 0,
 'favorited': False,
 'geo': None,
 'id': 552812150037958656,
 'id_str': '552812150037958656',
 'in_reply_to_screen_name': 'RoystonPrice',
 'in_reply_to_status_id': 552792777365671936,
 'in_reply_to_status_id_str': '552792777365671936',
 'in_reply_to_user_id': 733238869,
 'in_reply_to_user_id_str': '733238869',
 'lang': 'en',
 'place': None,
 'retweet_count': 0,
 'retweeted': False,
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'text': '@RoystonPrice @timothy_stanley are 

# Read sample directory for its tweets and comments

In [11]:
sample_directory

'/content/all-rnr-annotated-threads/charliehebdo-all-rnr-threads/rumours/'

## All tweets in sample directory

In [12]:
for i in os.listdir(sample_directory):
  if i[0] != '.':
    json_file = sample_directory + '/' + i + '/source-tweets/' + i + '.json'

    with open(json_file, 'r') as j:
      contents = json.loads(j.read())

    print(contents['text'], '\n')

"Several men in black cagoules were heard to shout 'the Prophet has been avenged.'" http://t.co/wu7XfGcZsM 

11 dead in Paris shooting at French satirical newspaper Charlie Hebdo, police official says - @AP http://t.co/ZFJYh80uxy 

French radio @Europe1 says attackers on offices of Paris weekly ran out yelling "Allahu Akbar" #Charliehebdo 

Cartoonist Stephane Charbonnier was critically injured in the attack (LeFigaro) #CharlieHebdo http://t.co/fHMfUmQffo 

The two suspected #CharlieHebdo gunmen have been killed, according to reports. #ParisAttacks http://t.co/sS4Js4JZwm http://t.co/04KSGEeu5i 

Latest on the manhunt for Charlie Hebdo attackers: http://t.co/JfmqW07Vhk http://t.co/mjiiPM2gHG 

Special forces stands on the roof of a building in Dammartin-en-Goele where a hostage-taking is underway #AFP http://t.co/gcm7TRRVfd 

BREAKING: Paris prosecutor: Gunman with hostages in kosher market in eastern Paris; police SWAT on scene. 

French media reporting owner of petrol station saw 2 me

## Look at directory within each tweet

In [13]:
for i in os.listdir(sample_directory):
  if i[0] != '.':
    print(i, '-', os.listdir(sample_directory + '/' + i))

552791182670327808 - ['annotation.json', 'source-tweets', '._annotation.json', '._reactions', '._structure.json', 'structure.json', '.DS_Store', '._.DS_Store', '._source-tweets', 'reactions']
552792988376915969 - ['annotation.json', 'source-tweets', '._annotation.json', '._reactions', '._structure.json', 'structure.json', '.DS_Store', '._.DS_Store', '._source-tweets', 'reactions']
552785367972450304 - ['annotation.json', 'source-tweets', '._annotation.json', '._reactions', '._structure.json', 'structure.json', '.DS_Store', '._.DS_Store', '._source-tweets', 'reactions']
552801880812691456 - ['annotation.json', 'source-tweets', '._annotation.json', '._reactions', '._structure.json', 'structure.json', '.DS_Store', '._.DS_Store', '._source-tweets', 'reactions']
553587225901154304 - ['annotation.json', 'source-tweets', '._annotation.json', '._reactions', '._structure.json', 'structure.json', '._source-tweets', 'reactions']
553488054640340992 - ['annotation.json', 'source-tweets', '._annotat

## Look at the structure of tweets and its comments within the sample directory

In [14]:
count = 0
allTweets = []

for i in os.listdir(sample_directory):
  if i[0] != '.':
    allTweets.append(i)
    reactions_path = sample_directory + '/' + i + '/structure.json'
    
    with open(reactions_path, 'r') as k:
      contents = json.loads(k.read())

    print(count, contents, '\n')
    count += 1

0 {'552791182670327808': {'552791461054672896': [], '552792038044098561': [], '552792355754229762': [], '552792663519666176': [], '552793207042736128': {'552793570709872640': [], '552796973418287104': []}, '552796405589614592': [], '552799240397651969': [], '552800276911235072': [], '552805942551904256': [], '552815736289517571': [], '552852666599374848': []}} 

1 {'552792988376915969': {'552793155767005186': [], '552793161572286464': [], '552793349308968962': [], '552793838260338688': [], '552793846435020800': [], '552794163759304704': [], '552794228729057280': [], '552794345561018368': [], '552794388057694209': [], '552799396748738561': [], '552802898166685698': [], '552814765694021635': [], '552817511717015553': [], '552821452865798144': [], '552873912598794243': []}} 

2 {'552785367972450304': {'552789769131163650': {'552795344959184896': [], '552799622955950080': []}, '552795689814466562': [], '552796299745976320': [], '552799320638894080': [], '552799778291994624': [], '552802003

## All the comments of each tweet

In [15]:
for i in os.listdir(sample_directory):
  if i[0] != '.':

    reactions_path = sample_directory + '/' + i + '/reactions/'
    print(i)

    for j in os.listdir(reactions_path):
      if j[0] != '.':
        with open(reactions_path + j, 'r') as k:
          contents = json.loads(k.read())

        print(contents['id'], contents['text'], contents['in_reply_to_status_id'], '\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
553586277858435072 @ABC THAN QUESTION IF THE NEWS PAPER BUILDING WAS A HIT OF A 'TERROR' SUPPORTER GROUP .. THATS THE TALK A FEW PLACES .. 553574345235103746 

553582831398830080 @ABC I thought France had a better grip on things~ 553574345235103746 

552982443511144448
553068438781190146 @Ironyisfunny8 @nielsflorusse Rest in peace Ahmed and the other victims 552982443511144448 

553035274364940289 @Ironyisfunny8 may he rest in peace - peace to his family and friends 552982443511144448 

553076409917530112 @Stariq7M @Ironyisfunny8 This comment is justifying the attack. Offending someone and killing someone aren't even in the same discussion. 553009816751579136 

553123043128905729 @Ironyisfunny8 @Stariq7M No, i dont. Please elaborate 553122782020517889 

553031496169050112 @Ironyisfunny8 @Ikhwanweb Rest In Peace. 552982443511144448 

553106816541155328 @Ironyisfunny8 @Veribatim - that's an example for show world that we mu

## Structure of comments in a sample tweet

In [16]:
wanted_id = random.choice(allTweets)

with open(sample_directory + '/' + wanted_id + '/structure.json', 'r') as k:
  contents = json.loads(k.read())

contents

{'553535862911541248': {'553538727428243456': [],
  '553539225212452865': {'553539506469494784': []},
  '553539241276608512': [],
  '553539406569938946': {'553539778541805568': []},
  '553540051838459904': [],
  '553540514411474944': [],
  '553540887871885312': [],
  '553541533886996480': [],
  '553546003693842432': []}}

## Construct tree of comments of sample tweet

In [17]:
root = Node(wanted_id)
createTweetsTree(contents[wanted_id], root)

In [18]:
for pre, fill, node in RenderTree(root):
  print("%s%s" % (pre, node.name))

553535862911541248
├── 553538727428243456
├── 553539225212452865
│   └── 553539506469494784
├── 553539241276608512
├── 553539406569938946
│   └── 553539778541805568
├── 553540051838459904
├── 553540514411474944
├── 553540887871885312
├── 553541533886996480
└── 553546003693842432


Get the main sample tweet

In [19]:
tweet_path = sample_directory + '/' + wanted_id
allComments = []

with open(tweet_path + '/source-tweets/' + wanted_id + '.json', 'r') as j:
  master_tweet= json.loads(j.read())

print(master_tweet['text'], '\n')

for j in os.listdir(tweet_path + '/reactions/'):
  if j[0] != '.':
    with open(tweet_path + '/reactions/' + j, 'r') as k:
      contents = json.loads(k.read())
      allComments.append(contents)

Paris: reports that man suspected of killing policewoman has 5 hostages at kosher supermarket  http://t.co/ZuU5xaf25I http://t.co/DVYPLL3XbF 



## Print out all comments and the other relevant information of the sample tweet

In [20]:
result = processTweetsToDict(allComments)

User ID: 553539241276608512 Time: Sat Sep 25 16:43:45 +0000 2010
In reply to: 553535862911541248
@guardiannews Will b terrorists r surely taking notes somewhere that multiple unsophisticated attacks by few well prepared men could ... 

User ID: 553540887871885312 Time: Sat Nov 13 15:23:06 +0000 2010
In reply to: 553535862911541248
@guardiannews @The_Real_Fly I'm guessing La Pen will make good use of this tragedy with the Ultra-Nationalist upsurge that likely follows. 

User ID: 553539778541805568 Time: Wed Jan 16 22:17:00 +0000 2013
In reply to: 553539406569938946
@prozactaker @guardiannews  just warching the Live report 

User ID: 553539506469494784 Time: Thu Nov 18 19:02:27 +0000 2010
In reply to: 553539225212452865
@FaithJupiter Nothing to do with Islam? 

User ID: 553540514411474944 Time: Sat Jun 19 05:11:27 +0000 2010
In reply to: 553535862911541248
@guardiannews 3 thousand men to capture 2 terrorists and now this? Seriously? @LaSuperNea 

User ID: 553541533886996480 Time: Fri Jan