In [45]:
import import_ipynb
from pathlib import Path
import requests
from sentence_transformers import SentenceTransformer
import os
import json
import pickle
from scipy import spatial
import datetime
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import time


# To set your environment variables in your terminal run the following line:
# export 'BEARER_TOKEN'='<your_bearer_token>'

def auth():
    """Return the Twitter API bearer token.

    The token is read from the ``BEARER_TOKEN`` environment variable so the
    secret does not have to be stored in the notebook. When the variable is
    unset or empty, the original placeholder string is returned, so existing
    behaviour is unchanged for anyone who edits the placeholder in place.

    Returns:
        The bearer token string to place in the ``Authorization`` header.
    """
    return os.environ.get("BEARER_TOKEN") or '<your_bearer_token>'

def create_url(user_id):
    """Return the Twitter API v2 tweets-timeline URL for ``user_id``.

    Args:
        user_id: Account identifier to insert into the URL path
            (e.g. ``14499829``).

    Returns:
        The endpoint URL as a string.
    """
    endpoint = f"https://api.twitter.com/2/users/{user_id}/tweets"
    return endpoint


def get_params(start_time, end_time):
    """Build the query parameters for the first page of a timeline request.

    Only ``created_at`` is requested in ``tweet.fields``. Other fields the
    API offers include: attachments, author_id, context_annotations,
    conversation_id, entities, geo, id, in_reply_to_user_id, lang,
    non_public_metrics, organic_metrics, possibly_sensitive,
    promoted_metrics, public_metrics, referenced_tweets, source, text,
    and withheld.

    Args:
        start_time: Lower bound of the time window, e.g.
            ``"2021-11-01T00:00:00Z"``.
        end_time: Upper bound of the time window, e.g.
            ``"2021-12-01T00:00:00Z"``.

    Returns:
        A dict of query parameters asking for up to 100 tweets per page.
    """
    query = {}
    query["tweet.fields"] = "created_at"
    query["max_results"] = 100
    query["start_time"] = start_time
    query["end_time"] = end_time
    return query


def get_new_params(new_token, start_time, end_time):
    """Build the query parameters for a follow-up (paginated) timeline request.

    Args:
        new_token: Pagination token taken from the previous response.
        start_time: Lower bound of the time window.
        end_time: Upper bound of the time window.

    Returns:
        A dict of query parameters identical to the first-page parameters
        plus ``pagination_token``.
    """
    query = {"tweet.fields": "created_at", "max_results": 100}
    query['pagination_token'] = new_token
    query.update(start_time=start_time, end_time=end_time)
    return query


def create_headers(bearer_token):
    """Return the HTTP headers carrying ``bearer_token`` as a Bearer credential.

    Args:
        bearer_token: Token string to send in the ``Authorization`` header.

    Returns:
        A dict with a single ``Authorization`` entry.
    """
    return {"Authorization": f"Bearer {bearer_token}"}


def connect_to_endpoint(url, headers, params):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Endpoint URL.
        headers: HTTP headers to send with the request.
        params: Query-string parameters.

    Returns:
        The response body parsed as JSON.

    Raises:
        Exception: If the HTTP status code is anything other than 200; the
            message contains the status code and the response text.
    """
    reply = requests.request("GET", url, headers=headers, params=params)
    if reply.status_code == 200:
        return reply.json()
    message = "Request returned an error: {} {}".format(
        reply.status_code, reply.text
    )
    raise Exception(message)


def get_tweet(id, start_time, end_time):
    """Fetch every tweet on a user's timeline inside a time window.

    Pages through the timeline endpoint, following ``meta.next_token`` until
    the API stops returning one, and concatenates the ``data`` entries of
    all pages.

    Args:
        id: User id whose timeline is requested.
        start_time: Lower bound of the window, as the timestamp string the
            API expects.
        end_time: Upper bound of the window, in the same format.

    Returns:
        A list with the tweet objects of all pages (empty when no page
        contained ``data``).

    Raises:
        Exception: Propagated from ``connect_to_endpoint`` on a non-200
            response.
    """
    bearer_token = auth()
    url = create_url(id)
    headers = create_headers(bearer_token)
    params = get_params(start_time, end_time)

    timelineDataList = []
    while True:
        # Pause before every request -- presumably to stay under the API
        # rate limit.
        time.sleep(5)
        json_response = connect_to_endpoint(url, headers, params)
        # extend() keeps accumulation linear in the number of tweets.
        timelineDataList.extend(json_response.get('data', []))
        # A page without 'meta' (or without a next token) ends pagination
        # instead of raising KeyError.
        meta = json_response.get('meta', {})
        if 'next_token' not in meta:
            break
        params = get_new_params(meta['next_token'], start_time, end_time)
    return timelineDataList


def filter_tweet(json_response):
    """Return the lower-cased text of non-retweets that mention a COVID keyword.

    A tweet whose text starts with ``RT`` is treated as a retweet and
    skipped. Keyword matching is a case-insensitive substring test.

    Args:
        json_response: Iterable of tweet dicts, each with a ``'text'`` key.

    Returns:
        A list with the lower-cased text of every matching tweet, in input
        order; each tweet appears at most once.
    """
    # Keywords are lower-cased because they are compared against lower-cased
    # text; otherwise an entry such as 'N95' could never match.
    keywords = tuple(
        word.lower()
        for word in 'omicron coronavirus koronavirus covid corona isolation quarantine cdc wuhancoronavirus wuhanlockdown ncov wuhan N95 kungflu epidemic outbreak sinophobia'.split()
    )
    timelineDataList = []
    for tweet in json_response:
        text = tweet['text']
        if text.startswith('RT'):
            # Retweet: skip it (the original `pass` let it fall through).
            continue
        lowered = text.lower()
        if any(keyword in lowered for keyword in keywords):
            timelineDataList.append(lowered)
    return timelineDataList


# Loaded models keyed by model name, so repeated calls reuse one instance
# instead of reloading the model every time.
_SENTENCE_MODEL_CACHE = {}


def sentence_transformer(tweets):
    """Encode each tweet with the ``bert-base-nli-mean-tokens`` model.

    The model is loaded on first use and cached for later calls. Empty
    input returns immediately without loading the model.

    Args:
        tweets: Sequence of tweet texts.

    Returns:
        A list with one encoded vector per tweet, in input order.
    """
    if len(tweets) == 0:
        return []
    model_name = 'bert-base-nli-mean-tokens'
    wv = _SENTENCE_MODEL_CACHE.get(model_name)
    if wv is None:
        wv = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = wv
    return [wv.encode(text) for text in tweets]


def ave(wordveclist):
    """Return the element-wise mean of a list of equal-length vectors.

    Args:
        wordveclist: Sequence of vectors (each an indexable sequence of
            numbers of the same length).

    Returns:
        A list holding the mean of each component, or an empty list when
        ``wordveclist`` is empty.
    """
    count = len(wordveclist)
    if count == 0:
        # Nothing to average; return an empty vector rather than raising
        # IndexError on wordveclist[0].
        return []
    # Divide by the number of vectors, not by the vector dimension.
    return [sum(component) / count for component in zip(*wordveclist)]

def similarity(vec1, vec2):
    """Return the cosine distance between ``vec1`` and ``vec2``.

    Note that, despite the function name, the value comes from
    ``scipy.spatial.distance.cosine`` and is a *distance*: 0 for vectors
    pointing the same way, larger for less similar vectors.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        The cosine distance as computed by SciPy.
    """
    return spatial.distance.cosine(vec1, vec2)



def availableTweetToVec(idlist, initial_time, num_timestep):
    """Build one averaged embedding per user and per 10-day window.

    For every user the timeline is fetched window by window; the tweets are
    keyword-filtered, encoded and averaged into a single vector.

    Args:
        idlist: User ids to process.
        initial_time: ``datetime`` marking the start of the first window.
        num_timestep: Number of consecutive 10-day windows.

    Returns:
        A list with one entry per user; each entry is a list of
        ``num_timestep`` averaged vectors.
    """
    per_user_vectors = []
    for user_id in idlist:
        time.sleep(5)
        window_vectors = []
        for step in range(num_timestep):
            window_start = initial_time + relativedelta(days=step * 10)
            window_end = window_start + relativedelta(days=10)
            raw_tweets = get_tweet(
                user_id,
                window_start.isoformat("T") + "Z",
                window_end.isoformat("T") + "Z",
            )
            matching = filter_tweet(raw_tweets)
            embeddings = sentence_transformer(matching)
            window_vectors.append(ave(embeddings))
        per_user_vectors.append(window_vectors)
    return per_user_vectors

def filteravailable(idlist, initial_time, num_timestep):
    """Keep only the users with a keyword-matching tweet in every window.

    Each user's timeline is checked in consecutive 10-day windows. As soon
    as one window yields no filtered tweet the user is rejected, the running
    number of rejected users is printed, and the remaining windows are
    skipped.

    Args:
        idlist: User ids to check.
        initial_time: ``datetime`` marking the start of the first window.
        num_timestep: Number of consecutive 10-day windows.

    Returns:
        The ids from ``idlist`` (original order) that were not rejected.
    """
    rejected = []
    for user_id in idlist:
        time.sleep(5)
        for step in range(num_timestep):
            window_start = initial_time + relativedelta(days=step * 10)
            window_end = window_start + relativedelta(days=10)
            raw_tweets = get_tweet(
                user_id,
                window_start.isoformat("T") + "Z",
                window_end.isoformat("T") + "Z",
            )
            if filter_tweet(raw_tweets):
                continue
            # Empty window: reject this user and stop checking further windows.
            rejected.append(user_id)
            print(len(rejected))
            break
    return [candidate for candidate in idlist if candidate not in rejected]

def generateAvailableFollowing(idlist):
    """Cache, per user, the followed accounts that pass ``filteravailable``.

    For each id a pickle file named ``<id>.p`` is written to the current
    directory. Ids whose file already exists are skipped, so an interrupted
    run can be resumed.

    Args:
        idlist: User ids (used as file-name stems) to process.

    Returns:
        None. Results are written to disk and their sizes are printed.
    """
    for i in idlist:
        pickle_path = Path(i).with_suffix('.p')
        if pickle_path.is_file():
            print('skipping file already exists: {}'.format(pickle_path))
            continue
        # NOTE(review): get_following is not defined in the visible part of
        # this file -- presumably provided by another notebook; confirm.
        following = get_following(i, 20)
        available_user = filteravailable(following, datetime(2021, 9, 1), 10)
        print(len(available_user))
        # Context manager guarantees the file is flushed and closed.
        with open(pickle_path, "wb") as out_file:
            pickle.dump(available_user, out_file)

In [46]:
def availableTweet(idlist, initial_time, num_timestep, len_timestep):
    """Download and cache each user's timeline, window by window.

    For every id the timeline is fetched in ``num_timestep`` consecutive
    windows of ``len_timestep`` days. The raw and keyword-filtered tweets of
    all windows are pickled to
    ``pfile/<initial_time><len_timestep>timeline<id>.p``. Ids whose file
    already exists are skipped and do not appear in the result.

    Args:
        idlist: User ids (strings) to process.
        initial_time: ``datetime`` marking the start of the first window.
        num_timestep: Number of consecutive windows.
        len_timestep: Length of each window in days.

    Returns:
        A dict mapping each processed id to ``[all_num, filter_num]``: the
        per-window counts of raw tweets and of filtered tweets.
    """
    idtonum = {}
    for user_id in idlist:
        id_file = Path('pfile/' + str(initial_time) + str(len_timestep) + 'timeline' + user_id)
        gzip_path = id_file.with_suffix('.p')
        if gzip_path.is_file():
            print('skipping file already exists: {}'.format(gzip_path))
            continue
        time.sleep(5)
        all_num = []
        filter_num = []
        all_unfilterTweet = []
        all_tweet = []
        for i in range(num_timestep):
            start_time = initial_time + relativedelta(days=i * len_timestep)
            end_time = start_time + relativedelta(days=len_timestep)
            unfilterTweet = get_tweet(
                user_id,
                start_time.isoformat("T") + "Z",
                end_time.isoformat("T") + "Z",
            )
            all_unfilterTweet.append(unfilterTweet)
            all_num.append(len(unfilterTweet))
            tweet = filter_tweet(unfilterTweet)
            all_tweet.append(tweet)
            filter_num.append(len(tweet))
        idtonum[user_id] = [all_num, filter_num]
        # Context manager guarantees the cache file is flushed and closed.
        with open(gzip_path, "wb") as out_file:
            pickle.dump([all_unfilterTweet, all_tweet], out_file)
    return idtonum

In [None]:
# Download and cache the timelines of the ids stored in pfile/checkId.p:
# 70 consecutive windows of 10 days each, starting on 2020-03-01.
start_time = datetime(2020, 3, 1)
# NOTE: pickle.load executes arbitrary code from the file; only load
# pickles that were produced locally.
with open("pfile/checkId.p", "rb") as checkid_file:
    checkId = pickle.load(checkid_file)
result = availableTweet(checkId, start_time, 70, 10)

In [34]:
# Rebuild the per-user tweet counts from the cached timeline pickles.
# idtonum maps a user id to [all_num, filter_num]: the per-window number of
# raw tweets and of filtered tweets.
# NOTE(review): availableTweet writes to
# 'pfile/<initial_time><len_timestep>timeline<id>.p', whereas this cell reads
# 'pfile/timeline<id>.p' -- confirm which cache files are meant to be read.
idtonum = {}
for user_id in checkId:  # renamed from `id` to avoid shadowing the builtin
    gzip_path = Path('pfile/timeline' + user_id).with_suffix('.p')
    if not gzip_path.is_file():
        continue
    # Context manager closes each cache file instead of leaking the handle.
    with open(gzip_path, "rb") as timeline_file:
        file = pickle.load(timeline_file)
    unfiltered, filtered = file[0], file[1]
    all_num = [len(window) for window in unfiltered]
    # Indexed by the number of raw windows, exactly as before.
    filter_num = [len(filtered[i]) for i in range(len(unfiltered))]
    idtonum[user_id] = [all_num, filter_num]

In [None]:
def numCount(idDict, num_timestep, len_timestep):    
    all_num_list = []
    filter_num_list = []
    zero_count_list = []
    filter_zero_count = []
    for id in idtonum.keys():
        for i in range(len(idtonum[id][0])):
            all_num 