In [None]:
# this script uses data that I collect using the Twitter streaming API to analyze 
# where tweets are coming from and the sentiment of them. Preliminary figures show some 
# cool temporal dynamics and perhaps some cultural differences.
# Full dataset is ~30Gb and ~180 million tweets

In [1]:
import geopandas as gpd
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import numpy as np
import datetime
import glob
import time
import os
import numpy as np

In [2]:
# load multiple sqlite DBs w/ scraped tweets
ts = np.vectorize(datetime.datetime.fromtimestamp)
os.chdir('/media/data/twitter/')
if not os.path.isfile('/media/data/twitter/twitter_data_clean.csv'):
    dbs = glob.glob('*db')
    data = []
    for db in dbs:
        print('loading ' + db) # I split these files up so any one isn't too big
        conn = sqlite3.connect(db)
        try:
            df = pd.read_sql("select unix, sentiment, loc_source, longitude, latitude, location from sentiment", conn) #  unix, sentiment, loc_source, longitude, latitude, location
        except:
            df = pd.read_sql("select unix, sentiment, longitude, latitude from sentiment", conn) #  unix, sentiment, loc_source, longitude, latitude, location
            # if we don't have location info, or lat/long.. we can drop some rows 
            # an older version of my scraper didn't save loc_source but still ~1/50 tweets has lat/long data
            df = df.dropna()
            df['locatin'] = ''
            df['loc_source'] = ''
        dates = ts(df.unix.values/1000)
        df.index = dates
        conn.close()
        data.append(df)

    df = pd.concat(data)
    del data, dates
    df = df.sort_index()
    df.to_csv('twitter_data_clean.csv')
else:
    df = pd.read_csv('twitter_data_clean.csv')

loading 20210615-102758_twitter.db
loading twitter_fall.db
loading 20210703-133946_twitter.db
loading 20210613-104941_twitter.db
loading 20210703-122204_twitter.db
loading 20210704-115426_twitter.db
loading 20210615-172857_twitter.db
loading twitter_spring.db
loading 20210616-180158_twitter.db
loading 20210616-090825_twitter.db
loading twitter_geo3.db
loading twitter_2.db


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  df = pd.concat(data)


In [None]:
# we are going to attempt to geocode unlabelled tweets by the 'location' listed for users that share this.. fully aware that this won't be correct (or a real place) for many
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="twitter_geo")

print('started with: ', sum(df.latitude.isnull()), ' missing locations')

n = 10 # n is the number of places we will try to get lat/long coord for, sorted by most common places
locs = df['location'].value_counts()[:n].index.tolist()

lon = int(np.where(df.columns=='longitude')[0])
lat = int(np.where(df.columns=='latitude')[0])

for loc in locs:
    try:
        lo  = geolocator.geocode(loc,timeout=5) # get coords for a place
        idx = np.where(df.location==loc)
        for ind in idx[0]:
            df.iat[ind,lon] = lo.longitude
            df.iat[ind,lat] = lo.latitude   
    except:
        pass
    
print('ended with: ', sum(df.latitude.isnull()), ' missing locations')
print('working with: ', sum(df.latitude.notnull()), ' locations')

started with:  180955104  missing locations


In [None]:
# let's only keep the complete data
df = df[df['latitude'].notnull()]
print(len(df))
df.tail()

In [None]:
geometry = [Point(xy) for xy in zip(df['longitude'], df['latitude'])]
gdf = gpd.GeoDataFrame(df, geometry=geometry)   

# #this is a simple map that goes with geopandas
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
gdf.plot(ax=world.plot(figsize=(20, 10),edgecolor='k',color='w'), marker='o', cmap='bwr', markersize=5,vmin=-1,vmax=1)

In [None]:
# make a geoDataFrame using lat/long of tweets
places = gdf
places["geometry"] = gdf.apply(lambda row: Point(row["longitude"], row["latitude"]), axis=1)
places = gpd.GeoDataFrame(places, geometry="geometry")
places.crs = {"init": "epsg:4326"}

# Load the countries polygons
country_shapes = world[['geometry', 'iso_a3']]
country_names = world[['name', 'iso_a3']]
countries = world[['geometry', 'name']]
countries = countries.rename(columns={'name':'country'})

# join tweet lat/long with country shapes (i.e. borders)
result = gpd.tools.sjoin(places, countries, how="left")

In [None]:
# get the overall median and hourly average sentiment values for each country, as heatmaps
sentAvg = []
sentLastHr = []

for country in world['name']:
    sentAvg.append(result.loc[result['country']==country]['sentiment'].median())
    sentLastHr.append((result.loc[result['country']==country]['sentiment'].last('4h').mean()-result.loc[result['country']==country]['sentiment'].mean())/result.loc[result['country']==country]['sentiment'].std())
    
world['sentiment'] = sentAvg
world['sentiment_lastHr'] = sentLastHr

fig, axes = plt.subplots(nrows=2, ncols=1,figsize=(30,12))

world.dropna().plot(ax=axes[0],column='sentiment_lastHr',cmap='coolwarm',legend=False,vmin=-.25,vmax=.25) 
axes[0].set_title('last hour')

world.dropna().plot(ax=axes[1],column='sentiment',cmap='coolwarm',legend=False,vmin=-.35,vmax=.35) 
axes[1].set_title('total')

In [None]:
# let's look at temporal dynamics
result[result['country']=='United States of America']['sentiment'].rolling('200S',min_periods=10).mean().plot(marker='.')
plt.title(('United States of America',len(result.loc[result['country']=='United States of America'])))
# plt.xlim((datetime.datetime(2021,7,3,5,48),datetime.datetime.now()))


In [None]:
result[result['country']=='United States of America']['sentiment'].rolling('200S',min_periods=10).mean().plot(marker='.')
plt.title(('United States of America',len(result.loc[result['country']=='United States of America'])))
# plt.xlim((datetime.datetime(2021,7,3,5,48),datetime.datetime.now()))
plt.xlim((datetime.datetime(2020,6,4,1,48),datetime.datetime(2020,6,6,5,48)))

In [None]:
plt.plot(result['latitude'].dropna().index)

In [None]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))