In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import seaborn as sns

from tweet_parser.tweet import Tweet
from tweet_parser.getter_methods.tweet_geo import get_profile_location

from twittersearch.result_stream import ResultStream
from twittersearch.utils import *

In [3]:
# Search API settings used to build the data and counts endpoints.
username = "agonzales@twitter.com"
search_api = "fullarchive"
account_name = "shendrickson"
endpoint_label = "ogformat.json"

# `pw` is not assigned in any visible cell, so a fresh kernel would fail with
# NameError below. Keep a value that already exists in the kernel, otherwise
# prompt for it so the password never has to be written into the notebook.
try:
    pw
except NameError:
    from getpass import getpass
    pw = getpass("Password for {}: ".format(username))

# The two calls differ only in the `count_endpoint` flag.
search_endpoint = gen_endpoint(search_api, account_name, endpoint_label, count_endpoint=False)
count_endpoint = gen_endpoint(search_api, account_name, endpoint_label, count_endpoint=True)

# Keyword bundles splatted into ResultStream(...) in later cells.
search_args = {"username": username, "password": pw, "url": search_endpoint }
count_args = {"username": username, "password": pw, "url": count_endpoint }

In [None]:
# Rule text shared by the counts and the search payloads.
# The OR is upper-case and parenthesised: a lower-case `or` is not the boolean
# operator, and without grouping the implicit AND binds first, which would
# match every tweet that has profile geo regardless of the phrase.
# NOTE(review): confirm against the search API operator documentation.
_rule = """
"taylor swift"
(has:geo OR has:profile_geo)
"""

# Payload for the counts endpoint, bucketed by day.
count_rule = gen_rule_payload(_rule,
                        from_date="2016-09-01",
                        to_date="2017-09-01",
                        max_results=500, 
                        count_bucket="day")

# Payload for the data endpoint over the same date window.
search_rule = gen_rule_payload(_rule,
                        from_date="2016-09-01",
                        to_date="2017-09-01",
                        max_results=500, 
                        )

# Show both payloads; the previous bare `rule` was never defined in this cell.
count_rule
search_rule

In [None]:
# Query the counts endpoint with the counts payload (`count_rule`, built with
# count_bucket="day"); the name `rule` used here before is not defined in any
# visible cell.
counts = list(ResultStream(**count_args, rule_payload=count_rule, max_tweets=1000).stream())

In [None]:
# Turn the count records into a frame, parse "timePeriod" into timestamps,
# index and order by it, then draw the default pandas plot.
_counts_df = pd.DataFrame(counts)
_counts_df = _counts_df.assign(timePeriod=pd.to_datetime(_counts_df["timePeriod"]))
_counts_df.set_index("timePeriod").sort_index().plot()

In [None]:
# Materialise the search stream (created with max_tweets=500) into a list.
tweets = [*ResultStream(**search_args, rule_payload=search_rule, max_tweets=500).stream()]

In [None]:
# Spot-check the first collected tweet. With ast_node_interactivity = "all"
# (set in the first cell) both bare expressions below are displayed.
t = tweets[0]
t.text
t.profile_location

In [None]:
def tweet_geo_collector(result_stream, tag, fields=None):
    """
    Drain a result stream into a DataFrame of tweet fields plus projected
    coordinates.

    Args:
        result_stream: object exposing ``stream()`` (iterated for tweets) and
            ``end_stream()`` (called once after iteration).
        tag: value written to the ``tag`` column of every returned row.
        fields (list of str): tweet attribute names to keep. Defaults to
            ``["id", "created_at_datetime", "text"]``.

    Returns:
        pandas.DataFrame: one row per tweet for which a coordinate could be
        read, holding the requested fields, the ``mx``/``my`` columns added by
        ``latlng_to_meters`` and the ``tag`` column. The intermediate
        ``lat``/``long`` columns are dropped.

    Tweets whose field or coordinate lookup raises ``AttributeError`` are
    reported on stdout and skipped.
    """
    if fields is None:
        fields = ["id", "created_at_datetime", "text"]
    fields = list(fields)

    rows = []
    print("collecting tweets for {}".format(tag))
    for tweet in result_stream.stream():
        try:
            _coords = get_a_geo_coordinate(tweet)
            # Field lookups stay inside the try block so that a missing
            # attribute skips the tweet instead of aborting the collection.
            row = [getattr(tweet, field) for field in fields]
            row.extend(_coords)
        except AttributeError:
            print("error in geo")
            print(tweet.id, tweet.text)
            continue
        rows.append(row)

    result_stream.end_stream()
    columns = fields + ["lat", "long"]

    df = (pd.DataFrame(rows, columns=columns)
          .pipe(latlng_to_meters, "lat", "long")
          .drop(["lat", "long"], axis=1)
          .assign(tag=tag)
         )
    return df



In [None]:
# `as_args` and `rule` are not assigned in any visible cell; use the search
# endpoint arguments and the search payload defined earlier instead.
# NOTE(review): other cells pass `max_tweets` to ResultStream while this one
# passes `max_results` - confirm which keyword the installed version accepts.
rs = ResultStream(**search_args, rule_payload=search_rule, max_results=500, )
# rs.artist = "taylor_swift"
df = tweet_geo_collector(rs, tag="taylor_swift", fields=["id"])

In [None]:
import numpy as np

In [None]:
def jitter_point(lat, long, pct=10):
    """
    Return a randomly perturbed copy of a coordinate pair.

    Each coordinate is redrawn uniformly from an interval centred on its
    original value whose half-width is ``pct`` percent of the coordinate's
    magnitude. ``pct=0`` returns the input unchanged.

    Args:
        lat (float): latitude.
        long (float): longitude.
        pct (float): maximum relative displacement, in percent.

    Returns:
        tuple: ``(lat, long)`` after jittering.
    """
    fraction = pct / 100.0
    lat_delta = abs(lat) * fraction
    long_delta = abs(long) * fraction
    return (np.random.uniform(lat - lat_delta, lat + lat_delta),
            np.random.uniform(long - long_delta, long + long_delta))


def jitter_box(lat, long):
    """
    Placeholder kept so the cell parses; the original definition had no body.

    Raises:
        NotImplementedError: always, until the intended behaviour is written.
    """
    raise NotImplementedError("jitter_box has not been implemented yet")

In [None]:
from functools import reduce

from tweet_parser.tweet_checking import is_original_format

try:
    import numpy as np

    def mean_bbox(x):
        """Return the per-axis mean of a sequence of coordinate pairs as a list."""
        return list(np.array(x).mean(axis=0))
except ImportError:
    def mean_bbox(x):
        """
        Pure-python per-axis mean of a sequence of coordinate pairs.

        The previous fallback summed the pairs with ``+``, which concatenates
        lists and then fails on the division; average column by column instead.
        """
        return [sum(axis) / len(axis) for axis in zip(*x)]

def get_profile_geo_coords(tweet):
    """
    Read the coordinate stored in a tweet's profile location.

    Args:
        tweet: object with a ``profile_location`` attribute that is either
            ``None`` or a mapping with an optional ``"geo"`` entry.

    Returns:
        tuple: ``(lat, long)``, or ``(None, None)`` when the profile location,
        its ``"geo"`` entry or the coordinates are missing.
    """
    profile_location = tweet.profile_location or {}
    geo = profile_location.get("geo") or {}
    coords = geo.get("coordinates")  # in [LONG, LAT]
    if not coords:
        # Previously this path fell through to an unbound `lat`/`long`.
        return None, None
    long, lat = coords
    return lat, long


def get_place_coords(tweet, est_center=False):
    """
    Return the bounding box of a tweet's place, or its estimated centre.

    Places are formal spots that define a bounding box around a place. The
    box is read from ``place.bounding_box.coordinates[0]`` for original-format
    tweets and from ``location.geo.coordinates[0]`` otherwise.

    NOTE(review): each point appears to be ``[long, lat]`` - the caller
    ``get_a_geo_coordinate`` unpacks the estimated centre in that order.

    Args:
        tweet: dict-like tweet payload.
        est_center (bool): when True, return the mean of the box corners
            (via ``mean_bbox``) instead of the corners themselves.

    Returns:
        The list of corner points, the averaged point when ``est_center`` is
        True, or ``None`` when the tweet carries no place.

    Raises:
        AttributeError: when a place exists but a nested entry is missing.
    """

    def get_bbox_ogformat():
        _place = tweet.get("place")
        if _place is None:
            return None

        return (_place
                .get("bounding_box")
                .get("coordinates")[0])

    def get_bbox_asformat():
        _place = tweet.get("location")
        if _place is None:
            return None
        return (_place
                .get("geo")
                .get("coordinates")[0])

    bbox = get_bbox_ogformat() if is_original_format(tweet) else get_bbox_asformat()

    # Without a place there is nothing to average; previously this reached
    # mean_bbox(None) when est_center was requested.
    if bbox is None:
        return None

    return mean_bbox(bbox) if est_center else bbox


def get_exact_geo_coords(tweet):
    """
    Return the exact point attached to a tweet, if any.

    Args:
        tweet: dict-like tweet payload.

    Returns:
        The value of ``tweet["geo"]["coordinates"]``, or ``None`` when the
        tweet has no ``"geo"`` entry.
    """
    # coordinates.coordinates is [LONG, LAT]
    # geo.coordinates is [LAT, LONG]
    # The same "geo" key was read for both payload formats, so the format
    # check and the unused `field` local were dropped.
    geo = tweet.get("geo")
    if geo is None:
        return None
    return geo.get("coordinates")


def get_a_geo_coordinate(tweet):
    """
    Return one ``(lat, long)`` pair for a tweet.

    The exact point from ``get_exact_geo_coords`` is preferred; otherwise the
    estimated centre of the tweet's place bounding box is used.

    Args:
        tweet: dict-like tweet payload.

    Returns:
        tuple: ``(lat, long)``.

    Raises:
        AttributeError: when neither an exact point nor a place centre is
            available. ``tweet_geo_collector`` catches this type and skips
            the tweet.
    """
    geo = get_exact_geo_coords(tweet)
    if geo:
        lat, long = geo
        # Compare against None so a latitude of exactly 0.0 is still used.
        if lat is not None and long is not None:
            return lat, long

    center = get_place_coords(tweet, est_center=True)
    if center is None:
        raise AttributeError("tweet has no exact geo point and no place bounding box")
    long, lat = center
    return lat, long


def latlng_to_meters(df, lat_name, lng_name):
    """
    Add projected ``mx``/``my`` columns computed from latitude/longitude columns.

    Taken and modified from the datashader notebooks.

    Args:
        df (pandas.DataFrame): frame holding the coordinate columns.
        lat_name (str): name of the latitude column.
        lng_name (str): name of the longitude column.

    Returns:
        pandas.DataFrame: a new frame with ``mx`` and ``my`` appended.
    """
    half_extent = 2 * np.pi * 6378137 / 2.0
    longitudes = df[lng_name]
    latitudes = df[lat_name]

    easting = longitudes * half_extent / 180.0
    northing_deg = np.log(np.tan((90 + latitudes) * np.pi / 360.0)) / (np.pi / 180.0)
    northing = northing_deg * half_extent / 180.0

    return df.assign(mx=easting, my=northing)


In [None]:
from functools import partial

In [None]:

from bokeh.models import WMTSTileSource
from bokeh.tile_providers import STAMEN_TONER

from bokeh.io import output_notebook, show
from bokeh.plotting import ColumnDataSource, figure
from bokeh.models import HoverTool, value

output_notebook()

# Named tile sources used as map backgrounds. The Stamen template now uses the
# same upper-case {Z}/{X}/{Y} placeholders as the other entries; the lower-case
# form it had before is not a WMTSTileSource placeholder.
tiles = {'OpenMap': WMTSTileSource(url='http://c.tile.openstreetmap.org/{Z}/{X}/{Y}.png'),
         'ESRI': WMTSTileSource(url='https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{Z}/{Y}/{X}.jpg'),
         'Wikipedia': WMTSTileSource(url='https://maps.wikimedia.org/osm-intl/{Z}/{X}/{Y}@2x.png'),
         'Stamen': WMTSTileSource(url="http://tile.stamen.com/toner-background/{Z}/{X}/{Y}.png")
         }


def plot_tweets(df, x_col="mx", y_col="my", tile="Stamen", title="title"):
    """
    Draw the rows of ``df`` as small circles on top of a map tile layer.

    Args:
        df (pandas.DataFrame): data wrapped in a bokeh ColumnDataSource.
        x_col (str): column used for the x position.
        y_col (str): column used for the y position.
        tile (str): key into the module-level ``tiles`` dict.
        title (str): figure title.

    Returns:
        The configured bokeh figure.
    """
    source = ColumnDataSource(df)

    # 800x800 canvas with pan/zoom/reset tools; the wheel zooms by default.
    fig = figure(title=title,
                 plot_width=800,
                 plot_height=800,
                 tools=('pan, wheel_zoom, box_zoom, reset'),
                 active_scroll='wheel_zoom')

    # Hover tool configured with the '@text' tooltip.
    fig.add_tools(HoverTool(tooltips='@text'))

    # One faint blue circle per row.
    glyph_style = dict(color=u'blue', line_alpha=0.1, fill_alpha=0.1,
                       size=3, hover_line_color='black')
    fig.circle(x=x_col, y=y_col, source=source, **glyph_style)

    # Strip axes, grid and outline so only the tiles and points remain.
    fig.title.text_font_size = value('12pt')
    for axis in (fig.xaxis, fig.yaxis):
        axis.visible = False
    fig.grid.grid_line_color = None
    fig.outline_line_color = None

    fig.add_tile(tiles[tile])
    return fig



In [None]:
# Display the raw rule text assigned to `_rule` in an earlier cell.
_rule

In [None]:
from functools import partial

def gen_multipart(rules):
    """
    Combine several phrases into one US-only, geo-tagged OR rule.

    Each phrase is wrapped as ``("phrase")``, the wrapped phrases are joined
    with `` OR ``, and the group is followed by ``has:geo place_country:us``.
    """
    wrapped = map('("{}")'.format, rules)
    alternatives = ' OR '.join(wrapped)
    return "({}) has:geo place_country:us".format(alternatives)

# Payload factory with the shared page size and date window pre-filled.
gen_rule = partial(gen_rule_payload,
                   max_results=500,
                   from_date="2016-09-01",
                   to_date="2017-09-01",
                   )
base_rule = """
"{}"
has:geo
place_country:us
"""

# Canonical artist order; later cells zip this list with the streams.
artists = ["taylor swift",
           "uzi vert",
           "beyonce",
           "luke bryan",
           "eminem"
          ]

# Search phrases per artist; every key also appears in `artists`.
artist_dicts = {"taylor swift": ["taylor swift",
                                 "look what you made me do",
                                 ],
                "uzi vert": ["uzi vert", "lil uzi", "lil uzi vert",
                             "XO TOUR Lif3", "money longer"],
                "beyonce": ["beyonce", "formation",
                            "queen bey", "bey", "beyoncé",
                            "halo", "crazy in love"],
                "luke bryan": ["luke bryan", "huntin fishin lovin", "play it again"],
                "eminem": ["eminem", "love the way you lie", "rap god", "lose yourself"]
               }
_tweet_collector = partial(tweet_geo_collector, fields=["id"])

# Build the payloads by walking `artists` rather than the dict, so that
# artist_rules[i] always belongs to artists[i] even where dict iteration
# order is not guaranteed.
artist_rules = [gen_rule(gen_multipart(artist_dicts[artist])) for artist in artists]

artist_rules

In [None]:
# NOTE(review): `rule` is not assigned at top level in any visible cell, so on
# a fresh kernel this line raises NameError - confirm which payload
# (e.g. `search_rule` or an entry of `artist_rules`) was meant to be shown.
rule

In [None]:
# Materialise the stream for the fourth artist payload; this rebinds `tweets`.
tweets = [*ResultStream(**search_args, rule_payload=artist_rules[3]).stream()]

In [None]:
# Print the text of the 21st collected tweet (raises IndexError if fewer exist).
print(tweets[20].text)

In [None]:
# One stream per artist payload, in the same order as `artist_rules`.
# `as_args` is not assigned in any visible cell, so the search endpoint
# arguments defined earlier are used instead.
# NOTE(review): other cells pass `max_tweets` to ResultStream while this one
# passes `max_results` - confirm which keyword the installed version accepts.
streams = [ResultStream(**search_args,
                        rule_payload=rule,
                        max_results=100000)
           for rule in artist_rules]



In [None]:
# Collect one frame per (stream, artist) pair; pairing stops at the shorter input.
results = list(map(tweet_geo_collector, streams, artists))

In [None]:
# Stack the per-artist frames into one frame; each frame keeps its own index
# values, so the combined index may contain repeats.
df = pd.concat(results)

In [None]:
# Earliest rows tagged 'beyonce'.
(df
 .query("tag == 'beyonce'")
 .sort_values("created_at_datetime")
 .head()
)

In [None]:
# Number of collected rows per tag.
df["tag"].value_counts()

In [None]:
import seaborn as sns
# Switch seaborn to its "white" style for the plots that follow.
sns.set_style("white")

In [None]:
# Daily tweet volume per tag.
# pd.Grouper(freq="D") replaces pd.TimeGrouper("D"), which newer pandas no
# longer provides. The size() result is unstacked directly; the earlier
# to_frame("tweets")["tweets"] round trip produced the same Series.
(df
 .set_index("created_at_datetime")
 .sort_index()
 .groupby([pd.Grouper(freq="D"), "tag"])
 .size()
 .unstack()
 .fillna(0)
 .plot()
)

In [None]:
def even_sample(df, cat_col, random_state=None):
    """
    Down-sample ``df`` so every category in ``cat_col`` has the same row count.

    Every category is sampled (without replacement) down to the size of the
    rarest one. Categories are emitted in order of first appearance.

    Args:
        df (pandas.DataFrame): frame to sample from.
        cat_col (str): name of the categorical column.
        random_state: forwarded to ``DataFrame.sample``; pass a fixed value to
            make the sample reproducible. Defaults to ``None`` (unseeded).

    Returns:
        pandas.DataFrame: the balanced sample. Rows whose category is missing
        are left out; an empty frame is returned when there are no categories.
    """
    counts = df[cat_col].value_counts()
    if counts.empty:
        # Nothing to balance; pd.concat([]) would raise here.
        return df.iloc[0:0]

    min_count = int(counts.min())
    # dropna(): value_counts() ignores missing labels, so they have no quota.
    cats = df[cat_col].dropna().unique()
    return pd.concat([df[df[cat_col] == cat].sample(min_count, random_state=random_state)
                      for cat in cats])

In [None]:
# Balanced copy of `df`: every tag reduced to the size of the rarest tag.
even_df = even_sample(df, "tag")

In [None]:
from bokeh.models.widgets import Panel, Tabs

In [None]:
# One map per artist in `artists`, each on its own tab. The previous version
# filtered on '2 chainz' and 'keith urban', which are not in `artists` (the
# list used to tag the collected frames), so those selections were stale.
artist_plots = [(artist, plot_tweets(df[df["tag"] == artist], title=artist))
                for artist in artists]

tabs = Tabs(tabs=[Panel(child=artist_plot, title=artist)
                  for artist, artist_plot in artist_plots])

In [None]:
from bokeh.io import output_file, reset_output, save

In [None]:
# Call bokeh.io.reset_output() before file output is configured in the
# following cells.
reset_output()
# output_file("bokeh_tabs.html")

In [None]:
# Point bokeh's output at an HTML file.
output_file("test_bokeh.html")

In [None]:
# Write the tabbed layout to the same HTML file named in output_file above.
save(tabs, filename="test_bokeh.html")

## Datashader

In [None]:
from bokeh import palettes

import datashader as ds
import datashader.transfer_functions as tf

from datashader.bokeh_ext import InteractiveImage

from cartopy import crs


import geoviews as gv

import holoviews as hv

from holoviews.operation.datashader import aggregate, shade, datashade, dynspread


# Load the HoloViews notebook extension with the 'mpl' and 'bokeh' backends.
hv.notebook_extension('mpl', 'bokeh')


def gen_col_points(categories, colormap):
    """
    Build an NdOverlay holding one coloured, labelled point per colormap entry.

    Args:
        categories: iterable of category names; every key of ``colormap``
            must be one of them (a missing key raises ``KeyError``).
        colormap (dict): category name -> colour.

    Returns:
        hv.NdOverlay: keyed by category name, each element a ``gv.Points`` at
        ``[0, 0]`` styled with that category's colour.
    """
    labels = {name: name for name in categories}

    styled_points = {}
    for key, color in colormap.items():
        label = labels[key]
        marker = gv.Points([0,0], crs=crs.PlateCarree(), label=label)
        styled_points[label] = marker(style=dict(color=color))

    return hv.NdOverlay(styled_points)

In [None]:
# Copy of `df` with `tag` stored as a pandas categorical.
plot_df = df.assign(tag=df["tag"].astype("category"))

In [None]:
# Extent of the projected coordinates, taken from the raw numpy values.
x_min, x_max = plot_df.mx.values.min(), plot_df.mx.values.max()
y_min, y_max = plot_df.my.values.min(), plot_df.my.values.max()
x_range = (x_min, x_max)
y_range = (y_min, y_max)

# One Category10 colour per artist, paired in list order.
color_key = {artist: color
             for artist, color in zip(artists, palettes.Category10[len(artists)])}

# Keyword arguments shared by the datashade call in the next cell.
shade_defaults = {"x_range": x_range,
                  "y_range": y_range,
                  "width": 1200,
                  "height": 660}

In [None]:
%%output filename="artist_datashaded_points"
%%opts Overlay [width=800 height=600 xaxis=None yaxis=None show_grid=False ] (background_alpha=0.1) 
%%opts Shape (fill_color=None line_width=1.5) [apply_ranges=False] 
%%opts Points [apply_ranges=False tools=[]]
%%opts WMTS (alpha=0.25)

# The cell magics above must stay at the very top of the cell, so all
# commentary starts here.
# Disabled alternative that passed the ranges in (max, min) order:
# shade_defaults = dict(x_range=(x_max, x_min),
                      # y_range=(y_max, y_min),
                      # width=1200,
                      # height=660)

# Datashade the (mx, my) points, counting per `tag` category and colouring
# the result with `color_key`; ranges and size come from `shade_defaults`.
shaded_points = datashade(hv.Points(gv.Dataset(plot_df,
                                               kdims=["mx", "my"],
                                               vdims=["tag"])),
                          cmap=color_key,
                          element_type=gv.Image,
                          aggregator=ds.count_cat("tag"),
                          **shade_defaults, 
                         )

# One labelled, coloured point per artist (see gen_col_points); presumably
# overlaid to provide a legend - TODO confirm.
color_points = gen_col_points(color_key.keys(), color_key)

# Compose: Stamen tiles * spread datashaded points * colour points.
map_ = gv.WMTS(tiles["Stamen"]) * dynspread(shaded_points,
                                            max_px=1,
                                            threshold=0.5) * color_points
map_