# Download tweets via the request package from the twitter api

- Download tweets for training of application 2 for the maelstrom project. 
- Search based on:
    - keywords based on vocabularies related to weather (from separate files found in ../data/vocabularies/) including emojis provided by the emoji package
    - date / time
    - usually require geo spatial information
- Includes quick analysis of 
    - time distribution of tweets
    - source of tweets
    - most active users 

In [None]:
# Enable IPython's autoreload extension so that edits to imported libraries
# are picked up without having to reload the package manually.
%load_ext autoreload
%autoreload 2

In [None]:
import random
import re
import logging
from collections import Counter
import os
import glob

logging.basicConfig(level=logging.INFO)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import emoji
import a2.twitter.downloader
import a2.dataset.load_dataset

In [None]:
def print_tweets(filename="test.csv", n_sample=10, n=None):
    """Load tweets from a CSV file, print a selection of them and return the dataset.

    Parameters
    ----------
    filename : str
        CSV file to read; the first two rows of the file are skipped.
    n_sample : int
        Number of tweets drawn at random (with replacement) when ``n`` is None.
    n : int, "all" or None
        ``None`` prints ``n_sample`` random tweets, ``"all"`` prints every
        tweet, an integer prints the first ``n`` tweets (capped at the number
        of tweets available in the file).

    Returns
    -------
    The tweets converted via ``DataFrame.to_xarray()``, with ``created_at``
    converted to datetimes.
    """
    df = pd.read_csv(filename, skiprows=2).to_xarray()
    df["created_at"] = (["index"], pd.to_datetime(df.created_at).values)
    n_tweets = df["tweet_id"].shape[0]
    if n_tweets == 0:
        # random.randrange(0) raises ValueError and there is nothing to print anyway
        index_list = []
    elif n is None:
        index_list = [random.randrange(n_tweets) for _ in range(n_sample)]
    elif n == "all":
        index_list = range(n_tweets)
    else:
        # never index past the end of the file
        index_list = range(min(n, n_tweets))
    texts = df.text.values
    for i in index_list:
        print("------------------------------")
        print(texts[i])
        print("------------------------------")
    return df


def print_author_text_source(df):
    """Print author id, source and text (with newlines removed) for every tweet in df.

    ``df`` must provide the fields "author_id", "text" and "source", each
    exposing a ``.values`` array.
    """
    rows = zip(df["author_id"].values, df["text"].values, df["source"].values)
    for author, raw_text, source in rows:
        # collapse the tweet onto a single line before printing
        single_line = "".join(raw_text.split("\n"))
        print(f"from: {author}, {source}\n    text: {single_line}")

In [None]:
# Read the emoji table from the a2 source tree (path relative to this notebook)
# and display it.
df_emoji = pd.read_csv("../../src/a2/data/emoji/emoji_df.csv")
df_emoji

## Prepare vocab for search query

In [None]:
# Fetch the vocabulary through the project's downloader module.
vocab = a2.twitter.downloader.get_vocabulary()
# Remove any " (...)" part from each term ...
vocab = [re.sub(r" \(.*\)", "", term) for term in vocab]
# ... and keep only the first 64 terms.
vocab = vocab[:64]
# Single OR-separated string of all remaining terms.
vocab_string = " OR ".join(vocab)

##  Using emoji package to include emojis in search 

pick emojis with name that matches words in vocab 

In [None]:
# Ask the downloader module for emojis derived from the vocabulary,
# explicitly excluding "rainbow flag".
emoji_list = a2.twitter.downloader.get_emojis_from_vocab(vocab, exclude=["rainbow flag"])

## Run search query

We would like to connect the weather information provided in a tweet with the location of the person tweeting. Therefore, geospatial location is crucial.
The following search terms appear sensible for this project:
- `has:geo` 
    - enforces that the user either (1) activated tracking on his device such that GPS coordinates are directly available or (2) tagged his tweet from a list of possible locations provided by [foursquare](https://twittercommunity.com/t/foursquare-location-data-in-the-api/36065) with varying *levels* (state, county, city, ...)
    - additional information provided by [twitter](https://developer.twitter.com/en/docs/tutorials/advanced-filtering-for-geo-data)
- `-is:retweet`
    - is *not* a retweet as they cannot have locations attached, already filtered out by `has:geo`
- `lang:en`
    - tweets are tagged with language or marked undefined `und`
- `place_country:GB`
    - country of origin (look at GB as rather active on twitter)
- `-is:nullcast`
    - removes tweets that are for promotion only 
- `-from:3446146816`
    - exclude specific user, userid 3446146816 produces massive amounts of tweets about flood warnings, which are expected to bias the model 

See overview [listing of operators](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) provided by twitter.

In addition, we specify
- `start_dates`
- `end_dates`
- `max_results`
    - Twitter only provides a maximum of 500 tweets per request; our library tweets_downloader takes care of this. However, we limit the total number of downloads per query so that we do not accidentally exceed our quota of 10M per month due to excessive spamming by a single user. 
    - that's also why we currently limit each query to a single month

In [None]:
# Small test download. The first `keyword` (with geo and country filters) is
# immediately overridden by the second assignment, which is the query actually used.
keyword = f"sun has:geo -is:retweet (lang:en) place_country:GB -is:nullcast -from:3446146816"
keyword = "sun -is:retweet (lang:en) -is:nullcast -from:3446146816"

# NOTE(review): the day is written as "1" rather than "01" - confirm that the
# downloader / API accepts this date format.
start_dates = "2020-01-1T10:00:10.000Z"
end_dates = "2020-01-1T12:00:00.000Z"
# Overrides the end date above, leaving a window of just under five minutes.
end_dates = "2020-01-1T10:05:00.000Z"
filename = "fake_not_enought_tweets"
max_results = 600
print(keyword)
# Run the download for the window and query defined above.
a2.twitter.downloader.download_tweets(
    filepath=filename,
    keyword=keyword,
    start_dates=start_dates,
    end_dates=end_dates,
    max_results=max_results,
    sleep_time=0.1,
)

In [None]:
def format_date(year, month=1, day=1):
    """Return the timestamp string for midnight of the given date.

    The result has the form ``YYYY-MM-DDT00:00:00.000Z``; month and day are
    zero-padded to two digits and default to 1.
    """
    date_part = "%i-%02d-%02d" % (year, month, day)
    return date_part + "T00:00:00.000Z"

In [None]:
# Accounts known to be weather bots are excluded from the search by user id.
weather_bots = (
    "-from:3446146816"
    " -from:2522135204 -from:4643841555 -from:19711240 -from:2581881138 -from:26643647 -from:23366418 -from:2411260615 -from:1035516433 -from:88720351 -from:470441737"
    " -from:3029396645 -from:20788211 -from:186672208 -from:161831709"
)
# A further account excluded from the search, labelled as ads.
ads = "-from:824637752574488576"
# Words that are excluded from the search by prefixing each with "-".
bot_terms = [
    "Hum",
    "Press",
    "Barometer",
    "Pressure",
    "Humidity",
    "Baro",
    "Humid",
]
# Query: (emojis OR vocabulary) + geo/language/country filters + all exclusions.
keyword = (
    f'({" OR ".join(emoji_list)} OR {" OR ".join(vocab)}) has:geo -is:retweet (lang:en) place_country:GB -is:nullcast '
    f"{weather_bots} {ads} "
    + "".join(f"-{term} " for term in bot_terms)
)
max_results = 200_000
year = 2014
# One download per calendar month of `year`.
for month in range(1, 13):
    start_dates = format_date(year, month, 1)
    if month == 12:
        # December ends on the first of January of the following year
        end_dates = format_date(year + 1, 1, 1)
    else:
        end_dates = format_date(year, month + 1, 1)
    filename = "tweets_%i_%02d" % (year, month)
    print(keyword)
    a2.twitter.downloader.download_tweets(
        filepath=filename,
        keyword=keyword,
        start_dates=start_dates,
        end_dates=end_dates,
        max_results=max_results,
        sleep_time=0.1,
    )

In [None]:
# Scratch check: range(1, 1) is empty, so this evaluates to [].
list(range(1, 1))

In [None]:
import datetime

# User ids of known weather bots and of an ads account. They are defined here
# but currently NOT appended to the query (see the note at `keyword`).
weather_bots = (
    "-from:3446146816"
    " -from:2522135204 -from:4643841555 -from:19711240 -from:2581881138 -from:26643647 -from:23366418 -from:2411260615 -from:1035516433 -from:88720351 -from:470441737"
    " -from:3029396645 -from:20788211 -from:186672208 -from:161831709"
)
ads = "-from:824637752574488576"
# No search keywords: only the geo / retweet / language / country / promotion filters.
# To exclude the accounts above, append `weather_bots + " " + ads` to this string.
keyword = "has:geo -is:retweet (lang:en) place_country:GB -is:nullcast "
max_results = 200_000
year = 2020
month = 2
# One download per day; every window spans exactly one day.
for day in range(14, 24):
    # Real date arithmetic makes the end of the window roll over correctly at
    # month and year boundaries, so no special case for December is needed.
    window_start = datetime.date(year, month, day)
    window_end = window_start + datetime.timedelta(days=1)
    start_dates = f"{window_start.isoformat()}T00:00:00.000Z"
    end_dates = f"{window_end.isoformat()}T00:00:00.000Z"
    filename = f"tweets_no_keywords_{start_dates}_{end_dates}"
    print(keyword)
    a2.twitter.downloader.download_tweets(
        filepath=filename,
        keyword=keyword,
        start_dates=start_dates,
        end_dates=end_dates,
        max_results=max_results,
        sleep_time=0.1,
    )

## Check downloaded Tweets

In [None]:
# `filename` is whatever the most recently executed download cell left behind.
# NOTE(review): presumably the downloader writes "<filepath>.json" - confirm.
json_filename = filename + ".json"
# Load the tweets from that json file and convert them to xarray.
ds = a2.dataset.load_dataset.load_tweets_dataframe_from_jsons([json_filename]).to_xarray()

In [None]:
def print_tweet_authors(ds):
    """Print the (at most) 21 most active authors with their tweet count and texts.

    Authors are ranked by the per-author count of the "id" field, highest first.
    """
    ranking = ds.groupby("author_id").count().sortby("id", ascending=False)
    top_counts = ranking["id"].values[:21]
    top_authors = ranking["author_id"].values[:21]
    for author, n_tweets in zip(top_authors, top_counts):
        print(f"{author} --> {n_tweets}")
        # select all tweets written by this author
        is_author = ds.author_id.values == author
        print(f"sample: {ds['text'].loc[is_author].values}")


print_tweet_authors(ds)

In [None]:
# Display the dataset loaded above.
ds

## Check for occurrence of bots and most active users

In [None]:
filepath = "../data/tweets/gb_2017_rain_sun_vocab_emojis/"
path = os.path.abspath(filepath)  # use your path
all_files = glob.glob(os.path.join(path, "tweets_2017*.json"))

# To analyse all files found above instead of the single file below, use:
# ds = a2.dataset.load_dataset.load_tweets_dataframe_from_jsons(
#     all_files
# ).to_xarray()
ds = a2.dataset.load_dataset.load_tweets_dataframe_from_json("tweets_2018_01.json").to_xarray()
n_old = ds.index.shape[0]
# Sources that are kept; tweets from any other source are dropped and counted
# as 'bots' in the summary below. Defined once so that the filter here and
# later cells share the same list.
sources_non_bot = [
    "Twitter for iPhone",
    "Twitter for Android",
    "Instagram",
    "Twitter for iPad",
    "Twitter Web Client",
]
ds = ds.where(ds.source.isin(sources_non_bot), drop=True)
n_new = ds.index.shape[0]
print(f"initial size dataset: {n_old}, removed {n_old-n_new} 'bots', new size: {n_new}")

In [None]:
# Per-author counts, sorted by the count of "id" with the most active author first.
authors = ds.groupby("author_id").count().sortby("id", ascending=False)

In [None]:
def print_tweet_authors(ds):
    """Print the (at most) 21 most active authors with count, source and sample texts.

    Authors are ranked by the per-author count of the "id" field, highest
    first. For each author the first of their unique sources is shown together
    with up to five randomly chosen tweet texts (drawn without replacement).
    """
    ranking = ds.groupby("author_id").count().sortby("id", ascending=False)
    top = zip(ranking["id"].values[:21], ranking["author_id"].values[:21])
    for n_tweets, author in top:
        print(f"{author} --> {n_tweets}")
        # select all tweets written by this author
        is_author = ds.author_id.values == author
        sources_of_author = np.unique(ds["source"].loc[is_author].values)
        if len(sources_of_author):
            shown_source = sources_of_author[0]
        else:
            shown_source = sources_of_author
        print(f"source: {shown_source}")
        author_texts = ds["text"].loc[is_author].values
        sample_size = 5 if n_tweets > 5 else n_tweets
        print(f"sample: {np.random.choice(author_texts, sample_size, replace=False)}")


print_tweet_authors(ds.where(ds.text.str.contains(f"{'|'.join(bot_terms)}")))

In [None]:
# Count entries per source and list the sources, highest count of "id" first.
sources = ds.groupby("source").count().sortby("id", ascending=False)
for n, s in zip(sources["id"].values, sources["source"].values):
    print(f"{s} --> {n}")

In [None]:
# Scatter plot of the per-source counts, drawn onto an explicitly created axes.
# xarray's plot functions document `figsize` as mutually exclusive with `ax`,
# so the figure size is set only when the figure is created.
fig = plt.figure(figsize=(10, 15))
ax = plt.axes()
ds.groupby("source").count().plot.scatter(y="source", x="id", ax=ax)

In [None]:
# Most active authors in the current dataset.
print_tweet_authors(ds)

In [None]:
# Most active authors among tweets whose source matches none of `sources_non_bot`.
# NOTE(review): `ds` was already restricted to exactly these sources in the
# filtering cell above, so this selection looks empty when cells run in order - confirm.
print_tweet_authors(ds.where(~ds.source.str.contains(f"{'|'.join(sources_non_bot)}")))

# Quick analysis

## Number of tweets per user 

In [None]:
# Load test.csv, print 10 randomly chosen tweets and keep the returned dataset
# for the analysis below.
df = print_tweets(filename="test.csv", n_sample=10, n=None)

In [None]:
# Histogram (100 bins) of the number of tweets per author, with a logarithmic y axis.
df.groupby("author_id").count()["tweet_id"].plot.hist(bins=100)
ax = plt.gca()
ax.set_yscale("log")
ax.set_xlabel("occurence of author_id")
ax.set_ylabel("count");

## Source of tweets: Private weather stations usually have source: pywws / MeteoWare Plus+

In [None]:
# Scatter plot of the number of tweets per source: x is the per-source count of
# "tweet_id", y is the source name, so the axis labels are set accordingly.
df.groupby("source").count().plot.scatter(y="source", x="tweet_id", size=10)
ax = plt.gca()
ax.tick_params(axis="x", labelrotation=0)
ax.set_xlabel("count")
ax.set_ylabel("source");

## Number of tweets per day of the year

In [None]:
# Group by the day of year of `created_at` and plot the per-day count of "author_id".
df.groupby("created_at.dayofyear").count().plot.scatter(x="dayofyear", y="author_id");

## Look at most active users 

In [None]:
# Per-author counts, sorted by the count of "tweet_id" with the most active author first.
activity = df.groupby("author_id").count().sortby("tweet_id", ascending=False)

In [None]:
# Author ids in order of activity, and the matching number of tweets per author.
user_activity_sorted_by_activity = activity["author_id"]
number_of_tweets = activity["tweet_id"]

In [None]:
# Authors with more than two tweets.
user_activity_sorted_by_activity[number_of_tweets > 2]

In [None]:
# For every author with more than two tweets, look up the user and print name,
# handle, location (or "?" when the response has no "location" key) and tweet count.
# NOTE(review): this cell uses `a2.data_manipulation.twitter.downloader`, while the
# notebook imports `a2.twitter.downloader` - confirm the module path.
mask = number_of_tweets > 2
for user_id, n_tweets in zip(user_activity_sorted_by_activity[mask].values, number_of_tweets[mask].values):
    user = a2.data_manipulation.twitter.downloader.get_user_from_userid(user_id)["data"]
    print(
        f'{user["name"]}, @{user["username"]}, {user["location"] if "location" in user else "?"} --> {n_tweets} tweets'
    )

## Example of 
### - obtaining user information from user id
### - getting location from place id
### - converting tweets in json format to list 

In [None]:
# Look up a single user by numeric id and print the raw response.
# NOTE(review): `a2.data_manipulation...` differs from the `a2.twitter.downloader`
# module imported at the top of the notebook - confirm the module path.
j = a2.data_manipulation.twitter.downloader.get_user_from_userid(375106238)
print(j)

In [None]:
# Convert the user response `j` from the previous cell to a list, using the "user" mode.
a2.data_manipulation.twitter.manipulate_tweets.convert_single_tweet_to_list(j, "user")

In [None]:
# Look up a location by its place id and display the response.
tweet = a2.data_manipulation.twitter.downloader.get_location_from_placeid("78e87ea8817310a6")
tweet

In [None]:
# Convert the location response to a list in "location" mode, skipping the "country" field.
a2.data_manipulation.twitter.manipulate_tweets.convert_single_tweet_to_list(tweet, "location", skip_fields=["country"])