In [1]:
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data afterwards
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import numpy as np
from datetime import datetime
import time

# Set up

In [2]:
# Never store the bearer token in the notebook: anyone with access to the file
# (or its version history) can use it. Prefer a TOKEN variable already exported
# in the environment; prompt for it (without echoing) only when it is unset.
# NOTE(review): the token that was previously committed here should be revoked.
if not os.environ.get('TOKEN'):
    from getpass import getpass
    os.environ['TOKEN'] = getpass('Twitter API bearer token: ')

In [3]:
def auth():
    """Return the value of the TOKEN environment variable, or None if it is unset."""
    bearer_token = os.environ.get('TOKEN')
    return bearer_token

In [4]:
def create_headers(bearer_token):
    """Build the HTTP headers dict carrying `bearer_token` as a Bearer authorization."""
    return {"Authorization": f"Bearer {bearer_token}"}

In [6]:
# Search user information
def create_user_url(name_list):
    """
    Build the request for the user-lookup-by-username endpoint.

    name_list: comma-separated usernames, e.g. 'xx1,xx2,xx3'

    Returns a (url, query_params) tuple.
    """
    # Fields requested for every user in addition to the defaults.
    requested_fields = 'created_at,public_metrics'
    endpoint = "https://api.twitter.com/2/users/by"
    return (endpoint, {'usernames': name_list, 'user.fields': requested_fields})

In [None]:
# # no need, tweets from a specific user
# def create_tweet_url(ID, start_time, end_time, max_result):
    
#     search_url = "https://api.twitter.com/2/users/"+ID+"/tweets" # Tweets information

#     #change params based on the endpoint you are using
#     query_params = {'start_time':start_time,
#                     'end_time': end_time,
#                     'max_results': max_result,
# #                     'exclude': 'retweets,replies',
#                     'tweet.fields': 'created_at,public_metrics,conversation_id'}
#     return (search_url, query_params)


In [5]:
# Search tweets based on keyword and username
def create_keyword_url(keyword, user_name, max_results,
                       start_time='2021-05-15T00:00:00.000Z',
                       end_time='2022-05-15T00:00:00.000Z'):
    """
    Build the full-archive search request for original English tweets that
    match `keyword` and were posted by `user_name`.

    keyword: a string in form of 'xx OR xx OR xx'
    user_name: a string in form of 'xxx OR xxx OR xxx'
    max_results: number of results requested per page
    start_time, end_time: bounds of the search window as ISO-8601 timestamps;
        the defaults keep the window this notebook was written for.

    Returns a (url, query_params) tuple.
    """

    search_url = "https://api.twitter.com/2/tweets/search/all" # Tweets information

    # Every username needs its own `from:` operator. Prefixing only the first
    # one would turn the remaining names into plain keywords.
    authors = ' OR '.join(
        name if name.startswith('from:') else 'from:' + name
        for name in (part.strip() for part in user_name.split(' OR '))
    )

    #change params based on the endpoint you are using
    query_params = {'query': '(' + keyword + ') (' + authors + ') -is:reply lang:en',
                    'start_time': start_time,
                    'end_time': end_time,
                    'max_results': max_results,
                    'tweet.fields': 'created_at,public_metrics,conversation_id'}

    return (search_url, query_params)

In [21]:
# Search replies of a specific tweet
def create_reply_url(to_user, conversation_id, max_results,
                     start_time='2021-05-15T00:00:00.000Z',
                     end_time='2022-05-16T00:00:00.000Z'):
    """
    Build the full-archive search request for replies addressed to `to_user`
    inside one conversation.

    to_user: the user who tweeted (str or int)
    conversation_id: the conversation_id of the tweet (str or int)
    max_results: number of results requested per page
    start_time, end_time: bounds of the search window as ISO-8601 timestamps;
        the default end is one day later than the tweet search window so that
        late replies are still included.

    Returns a (url, query_params) tuple.
    """

    search_url = "https://api.twitter.com/2/tweets/search/all" # Tweets information

    # Ids may arrive as integers (e.g. after a CSV round-trip); coerce them so
    # building the query string cannot raise TypeError.
    query = '(conversation_id:' + str(conversation_id) + ') ( to:' + str(to_user) + ')'

    #change params based on the endpoint you are using
    query_params = {'query': query,
                    'max_results': max_results,
                    'start_time': start_time,
                    'end_time': end_time,
                    'expansions': 'in_reply_to_user_id',
                    'tweet.fields': 'created_at,public_metrics,conversation_id'}
    return (search_url, query_params)

In [6]:
def connect_to_endpoint(url, headers, params, next_token = None):
    """
    Send a GET request and return the decoded JSON body.

    url: endpoint URL
    headers: HTTP headers (must carry the authorization)
    params: query parameters, as returned by one of the create_*_url functions;
        this dict is not modified
    next_token: pagination token from a previous response, or None for the
        first page

    Raises Exception(status_code, response_text) on any non-200 response.
    """
    # Work on a copy so the caller's dict is not silently altered. A None
    # next_token is left out of the query string by requests.
    request_params = dict(params)
    request_params['next_token'] = next_token
    response = requests.request("GET", url, headers = headers, params = request_params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [114]:
def create_rate_url():
    """Return the (url, query_params) tuple for the v1.1 rate-limit status endpoint."""
    # Resource families whose limits are requested.
    resource_families = 'help,users,search,statuses'
    endpoint = "https://api.twitter.com/1.1/application/rate_limit_status.json"
    return (endpoint, {'resources' : resource_families})

In [186]:
def manage_rate_limits(response):
    """
    Pause until the rate-limit window resets when `response` reports that the
    request quota is (almost) used up.

    response: an HTTP response exposing `.headers` with
        "x-rate-limit-remaining" / "x-rate-limit-reset" and an `.ok` flag.

    Returns None after waiting; otherwise returns `response` when it is OK and
    None when it is not.

    NOTE(review): the headers describe the limit of the endpoint that produced
    `response`; presumably that is not the same quota as the search endpoint
    when the status endpoint is polled -- confirm.
    """
    # Get number of requests left with our tokens
    remaining_requests = int(response.headers["x-rate-limit-remaining"])

    # With one or zero requests left, wait until the reset time plus 15 seconds.
    if remaining_requests <= 1:
        buffer_wait_time = 15
        resume_time = datetime.fromtimestamp( int(response.headers["x-rate-limit-reset"]) + buffer_wait_time )
        print(f"One request from being rate limited. Waiting on Twitter.\n\tResume Time: {resume_time}")
        # total_seconds() keeps the sign, so a reset time already in the past
        # clamps to zero instead of wrapping around to almost a full day.
        wait_seconds = max(0.0, (resume_time - datetime.today()).total_seconds())
        time.sleep(wait_seconds)
        return None

    # The same response never changes, so there is nothing to retry here:
    # report a non-OK response instead of spinning forever.
    return response if response.ok else None

# Load influencer names and get influencer info

In [None]:
# Load the influencer list. The context manager closes the file even when
# parsing fails.
with open('btc_influencers.json') as f:
    # returns JSON object as a list of dictionaries, each with a 'nick' entry
    data = json.load(f)

# The user lookup accepts at most 100 usernames per request; only the first
# 10 influencers are used here. Handles are stored with a leading '@'.
user_list = ','.join(entry.get('nick').replace('@', '') for entry in data[:10])

In [None]:
# Get influencers info
# Look up the comma-separated usernames in `user_list` in a single request.
bearer_token = auth()
headers = create_headers(bearer_token)
# create_user_url returns (endpoint URL, query parameters).
url = create_user_url(user_list)
json_response = connect_to_endpoint(url[0], headers, url[1])
# print(json.dumps(json_response, indent=4, sort_keys=True))

In [None]:
# Save as csv
df = pd.DataFrame(json_response['data'])
# Lift the two metrics of interest out of the nested public_metrics dicts.
followers_count = [metrics['followers_count'] for metrics in df['public_metrics']]
tweet_count = [metrics['tweet_count'] for metrics in df['public_metrics']]
df['followers_count'] = followers_count
df['tweet_count'] = tweet_count
# The nested column is no longer needed once it has been flattened.
df = df.drop(columns=['public_metrics'])
df.head()
# df.to_csv('data.csv')

# Get tweets based on username and keyword

In [None]:
# start_time = "2022-05-01T07:55:00.000Z"
# end_time = "2022-05-13T23:10:00.000Z"
# max_result = 5
# ID = '2577886615'
# url2 = create_tweet_url(ID, start_time, end_time, max_result)
# json_response2 = connect_to_endpoint(url2[0], headers, url2[1])
# print(json.dumps(json_response2, indent=4, sort_keys=True))

In [None]:
# # save as csv
# df = pd.DataFrame(json_response2['data'])
# retweet_count = []
# reply_count = []
# like_count = []
# quote_count = []
# for i in range(df.shape[0]):
#     retweet_count.append(df['public_metrics'][i]['retweet_count'])
#     reply_count.append(df['public_metrics'][i]['reply_count'])
#     like_count.append(df['public_metrics'][i]['like_count'])
#     quote_count.append(df['public_metrics'][i]['quote_count'])
# df['retweet_count'] = retweet_count
# df['reply_count'] = reply_count
# df['like_count'] = like_count
# df['quote_count'] = quote_count
# df.drop(columns=['public_metrics'], inplace = True)
# # s
# # # df.to_csv('data.csv')
# df

In [None]:
# Extract tweets with given keyword and user name
# Small single-page request used to inspect the shape of the search response.
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = 'BTC OR Bitcoin'
user_name = 'Bitcoin'
max_results = 15
# create_keyword_url returns (endpoint URL, query parameters).
url3 = create_keyword_url(keyword,user_name, max_results)
json_response3 = connect_to_endpoint(url3[0], headers, url3[1])
# print(json.dumps(json_response3, indent=4, sort_keys=True))
# Shown as the cell output; raises KeyError if the response has no 'data' key.
pd.DataFrame(json_response3['data'])

# Get replies to a specific tweet, sorted by like count

In [None]:
# Single-page request for the replies to one hard-coded conversation.
bearer_token = auth()
headers = create_headers(bearer_token)
# Value used with the `to:` operator of the reply query.
to_user = '2577886615'
conversation_id = '1525212367382007809'
max_results = 10
# create_reply_url returns (endpoint URL, query parameters).
url4 = create_reply_url(to_user, conversation_id, max_results)
json_response4 = connect_to_endpoint(url4[0], headers, url4[1])
print(json.dumps(json_response4, indent=4, sort_keys=True))

In [None]:
df = pd.DataFrame(json_response4['data'])
# Pull the like count out of each nested public_metrics dict.
likes = [metrics['like_count'] for metrics in df['public_metrics']]
df['like_count'] = likes
# Most-liked replies first; the sorted copy is the cell output.
df.sort_values(by='like_count', ascending=False)

# Extract data and store as csv

In [149]:
# Keyword alternatives for the tweet search, joined with OR.
keywords = 'btc OR BTC OR Btc OR xbt OR XBT OR bitcoin OR bitcoins OR BITCOIN OR Bitcoin'
bearer_token = auth()
headers = create_headers(bearer_token)
# CSV file the tweet and reply extraction cells write to.
output_path='twitter_data.csv'


# user info
username = 'APompliano'
# create_user_url returns (endpoint URL, query parameters).
user_url = create_user_url(username)
user_response = connect_to_endpoint(user_url[0], headers, user_url[1])
user_info = pd.DataFrame(user_response['data'])

Endpoint Response Code: 200


In [167]:
# extract tweets
# Page through the search results (500 per page) until the API stops
# returning a next_token. Records are collected in a list and turned into a
# DataFrame once, instead of concatenating a new frame per page.
tweet_records = []
count = 0
next_token = None
while True:
    tweets_url = create_keyword_url(keywords, username, 500)
    tweets_response = connect_to_endpoint(tweets_url[0], headers, tweets_url[1], next_token = next_token)
    # 'data' is absent from a page that holds no results.
    tweet_records.extend(tweets_response.get('data', []))
    count += tweets_response['meta']['result_count']
    next_token = tweets_response['meta'].get('next_token')
    if next_token is None:
        break
    time.sleep(1) # make sure not to touch rate limit between pages

print("Finish")

if not tweet_records:
    raise ValueError("The tweet search returned no results for user " + username)
tweets_df = pd.DataFrame(tweet_records)

# Flatten the nested public_metrics dicts into one column per metric. The
# order here fixes the column order of the CSV.
for metric in ('retweet_count', 'reply_count', 'like_count', 'quote_count'):
    tweets_df[metric] = [metrics[metric] for metrics in tweets_df['public_metrics']]
tweets_df = tweets_df.drop(columns=['public_metrics','id'])
tweets_df['is_reply_to_user'] = 0   # 0 means original tweets, 1 means replies
tweets_df['related_user_id'] = user_response['data'][0]['id']

# Start the output file; header row is written here only.
tweets_df.to_csv(output_path, mode='w', header = True, index = False)

Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Finish


In [188]:
# Query the v1.1 rate-limit status endpoint and wait if its headers report
# that the quota is nearly used up.
# NOTE(review): these headers presumably describe the status endpoint's own
# quota rather than the search endpoint's -- confirm.
url = create_rate_url()
status_response = requests.request("GET",url[0], headers = headers, params = url[1])
manage_rate_limits(status_response)
# Remaining-request count (a string), shown as the cell output.
status_response.headers['x-rate-limit-remaining']

'56'

In [194]:
# Extract replies for each tweet
# For every collected tweet, fetch one page of replies (up to 300), keep the
# 10 most-liked ones and append them to the CSV written by the tweet cell.
loop = tweets_df.shape[0]
for i in range(loop):
    print('Present progress: '+ str((i+1)) + ' out of ' + str(loop))

    # check rate limit
    url = create_rate_url()
    status_response = requests.request("GET",url[0], headers = headers, params = url[1])
    manage_rate_limits(status_response)

    # get tweet replies
    conversation_id = tweets_df.iloc[i]['conversation_id']
    to_user = tweets_df.iloc[i]['related_user_id']

    replies_url = create_reply_url(to_user, conversation_id, 300)
    replies_response = connect_to_endpoint(replies_url[0], headers, replies_url[1])
    if 'data' not in replies_response:
        # No replies for this tweet; still pause before the next request.
        time.sleep(1)
        continue

    replies_df = pd.DataFrame(replies_response['data'])
    replies_df['like_count'] = [metrics['like_count'] for metrics in replies_df['public_metrics']]

    # One chain that returns a new frame at every step. This avoids assigning
    # into / dropping from a slice of the sorted frame (SettingWithCopyWarning).
    replies_df = (
        replies_df
        .sort_values(by = 'like_count', ascending=False)
        .head(10)
        .drop(columns=['public_metrics','id'])
        .assign(is_reply_to_user=1)   # 0 means original tweets, 1 means replies
        .rename(columns = {'in_reply_to_user_id': 'related_user_id'})
        .reindex(columns=tweets_df.columns)   # same column order as the tweets in the CSV
    )
    replies_df.to_csv(output_path, mode='a', index = False, header = False)

    time.sleep(1) # make sure not to touch rate limit
    
    # combine it with tweets_df
    # tweets_df = pd.concat([tweets_df, replies_df], ignore_index=True)

# tweets_df['user_name'] = username



Present progress: 165 out of 1649
Endpoint Response Code: 200


# Test

In [51]:
# Show the last ten rows of the frame.
tweets_df.tail(10)

Unnamed: 0,text,created_at,conversation_id,retweet_count,reply_count,like_count,quote_count,is_reply_to_user,related_user_id
1682,@APompliano #Bitcoinhalf is taking over #Crypt...,2022-05-13T13:30:52.000Z,1525106011849347078,,,6,,1,339061487
1683,"@APompliano Hey Pomp,\n\nCryptos only seem to ...",2022-05-13T14:24:03.000Z,1525106011849347078,,,5,,1,339061487
1684,@APompliano You have the chops to be inducted ...,2022-05-13T13:34:03.000Z,1525106011849347078,,,4,,1,339061487
1685,@APompliano 💥 Do you hold?\n\n⏳ tin­y­u­­r­l­....,2022-05-13T13:30:15.000Z,1525106011849347078,,,4,,1,339061487
1686,@APompliano #omc #omchain @omChainio,2022-05-13T17:49:52.000Z,1525106011849347078,,,3,,1,339061487
1687,@APompliano Great job! outstanding consistency 👏,2022-05-13T13:30:33.000Z,1525106011849347078,,,3,,1,339061487
1688,@APompliano Who’s the ghost writer?,2022-05-13T13:31:52.000Z,1525106011849347078,,,3,,1,339061487
1689,@APompliano 🙌🏻🙌🏻 something I look forward to e...,2022-05-13T13:31:47.000Z,1525106011849347078,,,2,,1,339061487
1690,@APompliano It’s been pretty good for me too…t...,2022-05-13T13:30:22.000Z,1525106011849347078,,,2,,1,339061487
1691,@APompliano been listening to your podcast for...,2022-05-13T13:32:21.000Z,1525106011849347078,,,2,,1,339061487


In [None]:
# Flatten followers_count / tweet_count out of df['public_metrics'].
# NOTE(review): this repeats the earlier "Save as csv" cell. By this point `df`
# has been rebound to the replies frame, whose public_metrics presumably lack
# these keys -- confirm the cell is still needed.
followers_count = []
tweet_count = []
for i in range(df.shape[0]):
    followers_count.append(df['public_metrics'][i]['followers_count'])
    tweet_count.append(df['public_metrics'][i]['tweet_count'])
df['followers_count'] = followers_count
df['tweet_count'] = tweet_count
# In-place drop: re-running this cell fails because the column is gone.
df.drop(columns=['public_metrics'], inplace = True)

In [180]:
# Scratch check of the rate-limit reset header on the status endpoint.
# NOTE(review): the "?x-rate-limit-reset" query string looks unintended; the
# value is read from the response headers below -- confirm.
url = "https://api.twitter.com/1.1/application/rate_limit_status.json?x-rate-limit-reset"
r = requests.request("GET",url, headers = headers)
# Result is discarded (not the last expression of the cell).
int(r.headers['x-rate-limit-reset'])
# Reset time converted from epoch seconds to a local datetime.
s = datetime.fromtimestamp( int(r.headers["x-rate-limit-reset"]))

In [124]:
# Ask the rate-limit status endpoint how many requests remain (no waiting here).
url = create_rate_url()
status_response = requests.request("GET",url[0], headers = headers, params = url[1])
# Remaining-request count (a string), shown as the cell output.
status_response.headers['x-rate-limit-remaining']

'175'

In [161]:
# Display the collected tweets (the notebook truncates the middle rows).
tweets_df

Unnamed: 0,created_at,text,conversation_id,retweet_count,reply_count,like_count,quote_count,is_reply_to_user,related_user_id
0,2022-05-14T23:03:27.000Z,If your long term conviction of bitcoin’s valu...,1525612785375264768,374,191,2527,42,0,339061487
1,2022-05-14T14:57:17.000Z,"We have helped nearly 1,000 people get a new j...",1525490438093799424,49,79,332,4,0,339061487
2,2022-05-13T21:58:59.000Z,Bitcoin is the best first principles solution ...,1525234174046658565,210,226,1381,24,0,339061487
3,2022-05-13T13:29:42.000Z,I have officially been writing a letter on tec...,1525106011849347078,34,94,372,3,0,339061487
4,2022-05-13T02:48:56.000Z,"[NEW POST] Some Thoughts On LUNA / UST\n\n""The...",1524944753753735178,97,188,634,14,0,339061487
...,...,...,...,...,...,...,...,...,...
1647,2021-05-16T01:59:27.000Z,The anti-bitcoin crowd used to be Keynesian ec...,1393747894939815936,777,355,6107,55,0,339061487
1648,2021-05-16T00:02:12.000Z,RT @APompliano: Here is today’s video breaking...,1393718389361762314,116,0,0,0,0,339061487
1649,2021-05-15T17:52:53.000Z,Stock market closed.\n\nBanks closed.\n\nBitco...,1393625445938118660,952,551,11766,72,0,339061487
1650,2021-05-15T13:54:46.000Z,Bitcoin is the ultimate long term thinking.,1393565524911665156,404,357,5878,38,0,339061487


In [197]:
# Reload the combined tweets + replies file to check its size.
# Note: `data` was bound to the parsed influencer JSON earlier in the notebook.
data = pd.read_csv('twitter_data.csv')
# (rows, columns), shown as the cell output.
data.shape

(16720, 9)