# Hydrating Twitter IDs!
#### More information on the Twitter IDs and the Twarc package can be found in the Sources section

In [None]:
# Read in necessary libraries and Packages
import os
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
import geopandas as gpd
from twarc import Twarc

In [None]:
# Connect to the Twitter API through twarc.
# The four credentials are read from environment variables instead of being
# written into the notebook, so the secrets never end up in version control
# or in a shared copy of this file. Indexing os.environ raises a KeyError
# naming the variable if one of them has not been set.
# NOTE(review): the keys that used to be hard-coded here must be treated as
# leaked - revoke and regenerate them in the Twitter developer portal.
t = Twarc(os.environ['CONSUMER_KEY'],
          os.environ['CONSUMER_SECRET'],
          os.environ['ACCESS_TOKEN'],
          os.environ['ACCESS_TOKEN_SECRET'])

## <u>Option 1 | Read in JSON Files and Parse JSON</u>
#### I used the command line and twarc to hydrate one text file with about 50k Twitter IDs which resulted in a single JSON file. The code below parses through the JSON file and stores the results in a list.

In [None]:
# The hydrated file holds one JSON object (one tweet) per line, so it cannot
# be parsed with a single json.load(); each line is decoded on its own with
# json.loads() and collected in a list.
# - os.path.join keeps the path portable (the previous 'apriltweets\\...'
#   literal only resolved on Windows).
# - The encoding is passed explicitly so decoding does not depend on the
#   platform's default codec.
# - Blank lines (e.g. a trailing empty line) are skipped rather than being
#   handed to json.loads(), which would raise JSONDecodeError.
tweets = []
with open(os.path.join('apriltweets', 'some_april_tweets.json'), encoding='utf-8') as f:
    for line in f:
        if line.strip():
            tweets.append(json.loads(line))

In [None]:
# How many tweets do we have in this JSON file?
# (number of tweet objects currently held in the `tweets` list)
len(tweets)

In [None]:
# What the available data look like for one tweet: display every field of the
# tweet at index 27 (the same index is inspected again further down when
# comparing the truncated text with the retweeted status text)
tweets[27]

### Text from Tweets

In [None]:
# Print the text of the first five tweets in the file, each followed by
# blank lines for readability.
# A plain loop is used instead of a list comprehension: print() returns None,
# so the comprehension built a throwaway [None, None, None, None, None] list
# that the notebook then displayed as the cell's output.
# Slicing also copes with files that contain fewer than five tweets.
for tweet in tweets[:5]:
    print(tweet['full_text'], '\n\n')

In [None]:
# If a Tweet was retweeted, the text may be shortened. For example, in the tweet below the 'full_text'
# is actually cut short, but in the retweeted status we can see the full text.
# Both texts are printed explicitly: previously the closing parenthesis of print() sat before the
# retweeted text, so the cell evaluated to a (None, text) tuple instead of printing it.
# Raises KeyError if the tweet at index 27 is not a retweet (no 'retweeted_status' key).
example = tweets[27]
print(example['user']['location'], example['full_text'])
print(example['retweeted_status']['full_text'])

### Locations from Tweets

In [None]:
# Show the free-text profile location of the authors of the first 15 tweets
# (the earlier comment said five, but the code has always displayed 15).
# Slicing avoids an IndexError when the file holds fewer than 15 tweets.
[tweet['user']['location'] for tweet in tweets[:15]]

#### Let's see how many users have 'Seattle' as their exact location and what they tweeted.

In [None]:
# Build aligned pandas Series (one entry per tweet, same order as `tweets`):
#   text_tweets      - the tweet's own 'full_text' (may be cut short for retweets)
#   locations        - the author's free-text profile location
#   seattle          - boolean mask: profile location is exactly 'Seattle'
#   full_text_tweets - the 'full_text' of the retweeted status, or 'n/a'
#                      when the tweet has no usable retweeted status
text_tweets = pd.Series([tweet['full_text'] for tweet in tweets])
locations = pd.Series([tweet['user']['location'] for tweet in tweets])
seattle = locations == 'Seattle'

full_text_tweets = []
for tweet in tweets:
    try:
        full_text_tweets.append(tweet['retweeted_status']['full_text'])
    except (KeyError, TypeError):
        # KeyError: the key is missing (not a retweet); TypeError: the value
        # is not subscriptable (e.g. None). Anything else is a real error and
        # is no longer silently swallowed by a bare except.
        full_text_tweets.append('n/a')
full_text_tweets = pd.Series(full_text_tweets)

In [None]:
# Report how many tweets come from users whose profile location is exactly 'Seattle'
# (the boolean mask `seattle` selects the matching entries of `locations`)
seattle_locations = locations[seattle]
print("There are {} users who have 'Seattle' as their location".format(len(seattle_locations)))

In [None]:
# Let's be more flexible with the location: build a boolean mask that is True
# when the profile location exactly equals any one of a few accepted spellings
flexible = pd.Series([
    tweet['user']['location'] in ['Seattle', 'seattle', 'PNW', 'Seattle, Wa']
    for tweet in tweets
])

In [None]:
# Report how many tweets match the looser set of location spellings
flexible_locations = locations[flexible]
print("There are {} users who have 'Seattle', 'seattle', 'PNW', 'Seattle, Wa' as their location".format(len(flexible_locations)))

In [None]:
# Print the retweeted-status text recorded for every tweet whose author's
# location is exactly 'Seattle', with a blank line between entries
for retweet_text in full_text_tweets[seattle]:
    print(retweet_text, '\n')

#### Locations are not great, but some users are geo_enabled and have coordinates.

In [None]:
# Keep only the tweets that carry a geotag, i.e. whose 'coordinates' field is
# not null. `is not None` is the correct identity test for None (PEP 8);
# `!= None` goes through __ne__ and can be overridden by the compared object.
tweets_coords = [tweet for tweet in tweets if tweet['coordinates'] is not None]

In [None]:
# Number of tweets in this file that have a non-null 'coordinates' field
len(tweets_coords)

#### 130 Users in this file have a geolocation

In [None]:
# Show the coordinates + location for one tweet: the author's profile
# location next to the tweet's 'coordinates' object, for the geotagged
# tweet at index 1
tweets_coords[1]['user']['location'], tweets_coords[1]['coordinates']

In [None]:
# Get all longitudes and latitudes and store them in a geopandas df for plotting.
# Twitter's 'coordinates' object is a GeoJSON point, whose inner 'coordinates'
# list is ordered [longitude, latitude]. The two lists used to be named the
# wrong way round (index 0 was called `lats`); the points only came out right
# because the mislabelled lists were zipped in that same swapped order.
lons = [tweet['coordinates']['coordinates'][0] for tweet in tweets_coords]
lats = [tweet['coordinates']['coordinates'][1] for tweet in tweets_coords]
# shapely Points take (x, y), which for geographic data is (longitude, latitude)
geometry = [Point(lon, lat) for lon, lat in zip(lons, lats)]
geodf = gpd.GeoDataFrame(geometry=geometry)
# Check it works
geodf.head()

In [None]:
# Plot the points!
# The low-resolution Natural Earth country outlines serve as the base map and
# the geotagged tweets are drawn on the same axes as red dots.
# NOTE(review): gpd.datasets was deprecated in geopandas 0.14 and removed in
# 1.0 - confirm the installed version still ships 'naturalearth_lowres'.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

base_map = world.plot(figsize=(15, 15))
geodf.plot(ax=base_map, marker='o', color='red', markersize=10)

## <u>Option 2 | Read in .txt Files & Hydrate them in a Jupyter Notebook</u>

#### First, Bring in the .txt Files of Twitter IDs and Merge Them
#### We have 4 folders (corresponding to Jan, Feb, Mar, Apr), each of which contains approx. 400-500 .txt files, and each file holds about 50k Twitter IDs that need to be hydrated. See the example below, which reads every text file in the April folder, resulting in a list with 405 entries - one string per file, each holding thousands of Twitter IDs.

In [None]:
# Collect the names (not full paths) of all entries in the 'apriltweets'
# folder that end in '.txt'
filenames = [
    name
    for name in map(os.fsdecode, os.listdir('apriltweets'))
    if name.endswith('.txt')
]

In [None]:
# Read every ID file in full; `ids` ends up with one string per file, each
# string being that file's complete text.
# os.path.join builds the path portably - the previous 'apriltweets\\' + file
# concatenation only resolved on Windows.
ids = []
for name in filenames:
    with open(os.path.join('apriltweets', name), 'r') as tweetids:
        ids.append(tweetids.read())

In [None]:
# Peek at the first two entries of `ids` (each entry is the full text that
# was read from one ID file, so this output can be very long)
ids[0:2]

In [None]:
# Take the first three IDs of the first file as a small test batch.
# Splitting on whitespace replaces the previous fixed character offsets
# ([0:19], [20:39], [40:59]), which silently produced wrong IDs whenever an
# ID was not exactly 19 characters long.
testids = ids[0].split()[:3]
testids

#### Second, hydrate the IDs from within the Jupyter Notebook - Warning: this is much slower than using app_auth on the command line, and rate limits are reached much sooner than when using app_auth: https://github.com/DocNow/twarc

In [None]:
# Hydrate every ID of the first file and collect the returned tweet objects.
# ids[0] is one file's contents as a single string, and hydrate() consumes an
# iterable of IDs, so the string is split into individual IDs first; passing
# the raw string would feed it one character at a time.
# (twarc's README shows the alternative of passing an open file of IDs
# directly, one ID per line.)
jsontweets = list(t.hydrate(ids[0].split()))

In [None]:
# Display the hydrated tweet objects collected in `jsontweets`
jsontweets

In [None]:
# Three hard-coded tweet IDs (as strings) used as a small batch to hydrate
testids = ['1245140084313206786', '1245140084350910464', '1245140084417941505']

In [None]:
# Hydrate the small test batch and gather every returned tweet object in a list
jsontweets = list(t.hydrate(testids))

In [None]:
# Flatten the nested tweet dicts into a DataFrame; nested keys become dotted
# column names such as 'user.location'.
# json_normalize is called through the pandas namespace because the bare name
# was never imported in this notebook and raised a NameError
# (pd.json_normalize is available from pandas 1.0 onwards).
jsontweets = pd.json_normalize(jsontweets)

In [None]:
# Show only the columns of interest from the flattened tweets: timestamp,
# tweet text, geotag, author location and the retweeted status text
columns_of_interest = [
    'created_at',
    'full_text',
    'coordinates',
    'user.location',
    'retweeted_status.full_text',
]
jsontweets[columns_of_interest]

## <u>Option 3 | Read in .txt Files, merge them, write them to a new text file for hydrating in command line</u>

In [None]:
# Gather the names of all '.txt' entries found in the 'apriltweets' folder;
# anything with a different ending is skipped
filenames = []

for entry in os.listdir('apriltweets'):
    decoded = os.fsdecode(entry)
    if not decoded.endswith('.txt'):
        continue
    filenames.append(decoded)

In [None]:
# Read each ID file in full, producing one string per file in `ids`.
# The path is built with os.path.join so it works on every platform; the
# previous 'apriltweets\\' + file concatenation only resolved on Windows.
ids = []
for txt_name in filenames:
    with open(os.path.join('apriltweets', txt_name), 'r') as tweetids:
        ids.append(tweetids.read())

In [None]:
# Write the contents of the first five ID files into a single text file that
# can then be hydrated from the command line.
with open('small_merged_twitter_ids.txt', 'w') as outfile:
    for id_block in ids[0:5]:
        outfile.write(id_block)
        # If a source file does not end with a newline, its last ID would be
        # fused with the first ID of the next file into one bogus ID, so the
        # separator is added explicitly (empty files contribute nothing).
        if id_block and not id_block.endswith('\n'):
            outfile.write('\n')

### Sources
#### Twarc: https://github.com/DocNow/twarc
#### Twitter Ids: https://github.com/echen102/COVID-19-TweetIDs