In [1]:
import requests
import pandas as pd
from anytree import Node, RenderTree
from functions import *

In [2]:
with open('Authentication/database_uri.txt', 'r', encoding="utf8") as f:
    uri = f.read()

In [3]:
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

def connect_to_endpoint(url, headers, next_token = None):    
    response = requests.request("GET", url, headers = headers)
        
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def getTweets(user_id, header):
    tweets_url = f'https://api.twitter.com/2/users/{user_id}/tweets'
    return connect_to_endpoint(tweets_url, header)

# 'conversation_id' is the identifier for the main tweet
def getConversation(conversation_id, max_results, header):
    params = 'in_reply_to_user_id,author_id,created_at,conversation_id'
    getConversation_url = f'https://api.twitter.com/2/tweets/search/recent?query=conversation_id:{conversation_id}&tweet.fields={params}&max_results={max_results}'

    return connect_to_endpoint(getConversation_url, header)

# For now we will return the time only
def getTweetInformation(conversation_id, header):
    params = 'created_at,conversation_id,in_reply_to_user_id,author_id,referenced_tweets'
    tweetInfo_url = f'https://api.twitter.com/2/tweets?tweet.fields={params}&ids={conversation_id}'
    
    result = connect_to_endpoint(tweetInfo_url, header)
    return result['data'][0]['created_at']

def getTweetComments(conversation_data):
    conversation_dict = {'id':[], 'timestamp':[], 'reply_to':[], 'tweet':[]}
    
    for i in range(len(conversation_data['data'])):
        print('User ID:', conversation_data['data'][i]['author_id'], 
              'Time:', conversation_data['data'][i]['created_at'])
        print('In reply to:', conversation_data['data'][i]['in_reply_to_user_id'])
        print(conversation_data['data'][i]['text'], '\n')
        
        conversation_dict['id'].append(conversation_data['data'][i]['author_id'])
        conversation_dict['timestamp'].append(conversation_data['data'][i]['created_at'])
        conversation_dict['reply_to'].append(conversation_data['data'][i]['in_reply_to_user_id'])
        conversation_dict['tweet'].append(conversation_data['data'][i]['text'])
        
    return conversation_dict

In [4]:
with open('Authentication/twitter_bearer_token.txt', 'r', encoding="utf8") as f:
    token = f.read()

header = create_headers(token)
max_results = 100

# The Straits Times
## Get Tweets

In [5]:
ST_id = '37874853'
ST_tweets = getTweets(ST_id, header)

In [6]:
for i in range(len(ST_tweets['data'])):
    print('Tweet ID:', ST_tweets['data'][i]['id'], 
          'Time:', getTweetInformation(ST_tweets['data'][i]['id'], header))
    print(ST_tweets['data'][i]['text'], '\n')
    
#     command = (
#             '''
#             INSERT INTO twitter_data
#             VALUES ('%s', '%s', '%s');
#             ''' % (ST_tweets['data'][i]['id'], getTweetInformation(ST_tweets['data'][i]['id'], header), 
#                    ST_tweets['data'][i]['text'])
#             )
#     setUpDB(command, uri)

Tweet ID: 1490327180819431429 Time: 2022-02-06T14:11:03.000Z
RT @STsportsdesk: Winter Olympics: 'Proud Kiwi' snowboarder masters Great Wall for New Zealand's first Games gold https://t.co/YVhGOnp3Ms 

Tweet ID: 1490327052733485056 Time: 2022-02-06T14:10:32.000Z
7,639 new local Covid-19 cases in S'pore; 1,074 patients hospitalised https://t.co/0kboaSZvQe 

Tweet ID: 1490320134803628032 Time: 2022-02-06T13:43:03.000Z
RT @STsportsdesk: Winter Olympics: 'He's come!' Hanyu mania hits Games as Japanese star figure skater arrives at last https://t.co/RNSLgGwV… 

Tweet ID: 1490320009377112067 Time: 2022-02-06T13:42:33.000Z
High number of patients at hospitals' emergency departments, most did not need urgent care: MOH https://t.co/XD1Dr7k9nR 

Tweet ID: 1490313967356420108 Time: 2022-02-06T13:18:33.000Z
RT @stbusinessdesk: Sinjia Land's $2b RTO price tag may be revised with new valuation report https://t.co/lf10EV7xDk 

Tweet ID: 1490311080895156224 Time: 2022-02-06T13:07:04.000Z
RT @STsportsde

In [7]:
# query = '@straits_times'
# tweets_url = f'https://api.twitter.com/2/tweets/search/recent?query={query}&max_results={max_results}'
# connect_to_endpoint(tweets_url, header)

## Get Conversation

In [8]:
# ST_conversation_data = getConversation('1475383966832021504', max_results, header)
ST_conversation_data = getConversation('1389937492065931266', max_results, header)

In [9]:
ST_result = getTweetComments(ST_conversation_data)

User ID: 1319461064111697920 Time: 2022-02-06T13:52:32.000Z
In reply to: 87818409
@guardian https://t.co/Uh1D3GqFwj 

User ID: 1001839970221469696 Time: 2022-02-06T01:22:46.000Z
In reply to: 87818409
@guardian https://t.co/oSkHlKSEbi 

User ID: 1001839970221469696 Time: 2022-02-05T20:54:50.000Z
In reply to: 87818409
@guardian https://t.co/dEmWJjaV5l 

User ID: 1470337517341880324 Time: 2022-02-05T16:37:36.000Z
In reply to: 87818409
@guardian https://t.co/sc65nJCyva  Nazifascista do PSDB instrumentaliza e tortura mulheres negras do Partido dos Trabalhadores, em processo de construção do golpe. 

User ID: 1105682057097871360 Time: 2022-02-05T16:02:00.000Z
In reply to: 87818409
@guardian The famillies of the opinion prisonners: Nadjib Khimoud, Souhaib Debaghi &amp; Mohamed Tajadith, confirmed that their sons were tortured and injured very badly in Harache's because of that they are in Hunger strike. After that they were transferred to Blida jail to revange from them. https://t.co/uTiyiAI0

In [10]:
ST_df = pd.DataFrame.from_dict(ST_result)
ST_df['id'] = ST_df['id'].astype(str)
ST_df['reply_to'] = ST_df['reply_to'].astype(str)
ST_df['url'] = ST_df['tweet'].apply(lambda x: getLinks(x))
ST_df['link_title'] = ST_df['url'].apply(lambda x: getURLfromList(x))
ST_df.to_csv('Datasets/ST_twitter_data.csv')

ST_df.head()

Unnamed: 0,id,timestamp,reply_to,tweet,url,link_title
0,1319461064111697920,2022-02-06T13:52:32.000Z,87818409,@guardian https://t.co/Uh1D3GqFwj,https://twitter.com/Walcandy1/status/149032251...,[twitter.com]
1,1001839970221469696,2022-02-06T01:22:46.000Z,87818409,@guardian https://t.co/oSkHlKSEbi,https://twitter.com/Linn45350887/status/149013...,[twitter.com]
2,1001839970221469696,2022-02-05T20:54:50.000Z,87818409,@guardian https://t.co/dEmWJjaV5l,https://twitter.com/Linn45350887/status/149006...,[twitter.com]
3,1470337517341880324,2022-02-05T16:37:36.000Z,87818409,@guardian https://t.co/sc65nJCyva Nazifascist...,https://www.radarfeminista.com.br/post/instrum...,[www.radarfeminista.com.br]
4,1105682057097871360,2022-02-05T16:02:00.000Z,87818409,@guardian The famillies of the opinion prisonn...,https://twitter.com/bendjimli18/status/1489992...,[twitter.com]


# Channel NewsAsia
## Get Tweets

In [11]:
CNA_tweets = getTweets('38400130', header)

for i in range(len(CNA_tweets['data'])):
    print('Tweet ID:', CNA_tweets['data'][i]['id'],
          'Time:', getTweetInformation(ST_tweets['data'][i]['id'], header))
    print(CNA_tweets['data'][i]['text'], '\n')
    
#     command = (
#             '''
#             INSERT INTO twitter_data
#             VALUES ('%s', '%s', '%s');
#             ''' % (CNA_tweets['data'][i]['id'], getTweetInformation(CNA_tweets['data'][i]['id'], header), 
#                    CNA_tweets['data'][i]['text'])
#             )
#     setUpDB(command, uri)

Tweet ID: 1490339260519489542 Time: 2022-02-06T14:11:03.000Z
Varner snatches Saudi International win with late heroics https://t.co/QVzENXdzKe https://t.co/5n40bCqyZh 

Tweet ID: 1490335486526722050 Time: 2022-02-06T14:10:32.000Z
Luge-Ludwig of Germany takes gold in men's singles https://t.co/hYsKt49b34 https://t.co/B8fC9Wljom 

Tweet ID: 1490331710256418817 Time: 2022-02-06T13:43:03.000Z
Freestyle skiing-China-born US skier Owens wins over viewers https://t.co/p9YteU5zhx https://t.co/CwAkKUxR2L 

Tweet ID: 1490328457774993408 Time: 2022-02-06T13:42:33.000Z
Liverpool ease past Cardiff into FA Cup fifth round https://t.co/7XV3t4bbAs https://t.co/u5EjnWBd3C 

Tweet ID: 1490328448430006274 Time: 2022-02-06T13:18:33.000Z
Luge-Ludwig of Germany takes gold in men's singles https://t.co/brmNXWPHx6 https://t.co/3udSNuLvTl 

Tweet ID: 1490328439382970368 Time: 2022-02-06T13:07:04.000Z
Freestyle skiing-Jakara Anthony wins Australia its first Winter Games gold in over a decade https://t.co/qHqRIo

## Get Conversation

In [12]:
CNA_conversation_data = getConversation('1389937492065931266', max_results, header)
# CNA_conversation_data = getConversation('1477849910682832897', max_results, header)

result = getTweetComments(CNA_conversation_data)

User ID: 1319461064111697920 Time: 2022-02-06T13:52:32.000Z
In reply to: 87818409
@guardian https://t.co/Uh1D3GqFwj 

User ID: 1001839970221469696 Time: 2022-02-06T01:22:46.000Z
In reply to: 87818409
@guardian https://t.co/oSkHlKSEbi 

User ID: 1001839970221469696 Time: 2022-02-05T20:54:50.000Z
In reply to: 87818409
@guardian https://t.co/dEmWJjaV5l 

User ID: 1470337517341880324 Time: 2022-02-05T16:37:36.000Z
In reply to: 87818409
@guardian https://t.co/sc65nJCyva  Nazifascista do PSDB instrumentaliza e tortura mulheres negras do Partido dos Trabalhadores, em processo de construção do golpe. 

User ID: 1105682057097871360 Time: 2022-02-05T16:02:00.000Z
In reply to: 87818409
@guardian The famillies of the opinion prisonners: Nadjib Khimoud, Souhaib Debaghi &amp; Mohamed Tajadith, confirmed that their sons were tortured and injured very badly in Harache's because of that they are in Hunger strike. After that they were transferred to Blida jail to revange from them. https://t.co/uTiyiAI0

In [13]:
df = pd.DataFrame.from_dict(result)
df['id'] = df['id'].astype(str)
df['reply_to'] = df['reply_to'].astype(str)
df['url'] = df['tweet'].apply(lambda x: getLinks(x))
df['link_title'] = df['url'].apply(lambda x: getURLfromList(x))
df.to_csv('Datasets/sample_tweet_conversation.csv')

df.head()

Unnamed: 0,id,timestamp,reply_to,tweet,url,link_title
0,1319461064111697920,2022-02-06T13:52:32.000Z,87818409,@guardian https://t.co/Uh1D3GqFwj,https://twitter.com/Walcandy1/status/149032251...,[twitter.com]
1,1001839970221469696,2022-02-06T01:22:46.000Z,87818409,@guardian https://t.co/oSkHlKSEbi,https://twitter.com/Linn45350887/status/149013...,[twitter.com]
2,1001839970221469696,2022-02-05T20:54:50.000Z,87818409,@guardian https://t.co/dEmWJjaV5l,https://twitter.com/Linn45350887/status/149006...,[twitter.com]
3,1470337517341880324,2022-02-05T16:37:36.000Z,87818409,@guardian https://t.co/sc65nJCyva Nazifascist...,https://www.radarfeminista.com.br/post/instrum...,[www.radarfeminista.com.br]
4,1105682057097871360,2022-02-05T16:02:00.000Z,87818409,@guardian The famillies of the opinion prisonn...,https://twitter.com/bendjimli18/status/1489992...,[twitter.com]


In [14]:
# regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
# allLinks = re.findall(regex, df['tweet'][0])

# allLinks

In [15]:
# import urllib

# text = 'My Profile: https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles in the portal of https://www.geeksforgeeks.org/'

# urls = re.findall("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", text)
# for url in urls:
#     try:
#         opener = urllib.request.build_opener()
#         request = urllib.request.Request(url)
#         response = opener.open(request)
#         actual_url = response.geturl()
#         print(actual_url)
#     except:
#         print(url)