This notebook is for visualizing large-scale properties of the dataset

In [1]:
import pandas as pd

#Plotting 
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")


# Change into the project directory through the IPython bookmark 'twitterproject'
%cd twitterproject
# inject config value (on command line would've been --config=data-analysis)
import sys
# Alternative configuration, kept for quick switching:
# args = ['--config', 'data-analysis']
args = ['--config', 'laptop-mining']
# Keep a handle on the original argv; the cells shown here never restore it
old_sys_argv = sys.argv
# argv becomes: original program name + the injected flags. This must run
# before the import below -- presumably `environment` parses sys.argv when
# it is imported (TODO confirm).
sys.argv = [old_sys_argv[0]] + args
import environment

from TwitterDatabase.Repositories import DataRepositories as DR
from TwitterDatabase.DatabaseAccessObjects import DataConnections as DC
from TwitterDatabase.Models.WordORM import Word
from TwitterDatabase.Models.TweetORM import Users as User
from TwitterDatabase.Models.TweetORM import Tweet
from DataAnalysis.SearchTools.WordMaps import get_adjacent_word_counts, get_adjacent_words, get_user_ids_for_word

# Spreadsheet paths built from folders exposed by `environment`.
# Neither constant is referenced by the cells shown in this notebook.
EXP_TERMS_FILEPATH = '%s/experimental-terms.xlsx' % environment.EXPERIMENTS_FOLDER
IDS_FILEPATH = "%s/temp_output/user-ids.xlsx" % environment.LOG_FOLDER_PATH


  return f(*args, **kwds)
  return f(*args, **kwds)


(bookmark:twitterproject) -> /Users/adam/Dropbox/PainNarrativesLab/TwitterProject
/Users/adam/Dropbox/PainNarrativesLab/TwitterProject
Reading configuration from /Users/adam/Dropbox/PainNarrativesLab/TwitterProject/configurations/testing.config.ini


In [None]:
# Connection object whose `.engine` is handed to every pd.read_sql_query call
# below; presumably the credentials are read from environment.CREDENTIAL_FILE
# (TODO confirm against DataConnections).
dao = DC.MySqlConnection(environment.CREDENTIAL_FILE)

# Number of distinct users whose tweets we have captured

In [None]:
# Only the two id columns are fetched; the per-user counts need nothing else
data = pd.read_sql_query(sql="select tweetID, userID from tweets", con=dao.engine)
print("Loaded %s tweets" % len(data))

In [None]:
# Count distinct user ids directly rather than building the full
# {userID: row labels} mapping just to take its length
userCount = data['userID'].nunique()
print("Captured tweets from %s distinct users" % userCount)

# Number of tweets per user

## All results

In [None]:
# One row per captured tweet, so the group size is that user's tweet count
tweetsPerUser = data.groupby(by='userID').size()
tweetsPerUser.describe()

In [None]:
# Distribution over all users. Drawn on an explicit figure/axes with a title,
# like the trimmed plot further down, so the figure stands alone when skimmed.
fig, axes = plt.subplots()
sns.distplot(tweetsPerUser, ax=axes)
axes.set_title("Tweets per user (all users)"); fig.tight_layout()

## Trimmed results

In [None]:
# Keep users whose tweet count lies in [1, MAX_PER_USER], both ends inclusive
MAX_PER_USER = 20
trimmed = tweetsPerUser.loc[tweetsPerUser.between(left=1, right=MAX_PER_USER)]

In [None]:
# Summary statistics of the per-user counts after trimming
trimmed.describe()

In [None]:
# Distribution of the trimmed per-user counts on its own 5x4 figure
fig, axes = plt.subplots(figsize=(5, 4))
sns.distplot(trimmed, ax=axes)
axes.set_title("Tweets per user (trimmed at %s)" % MAX_PER_USER)
fig.tight_layout()

In [None]:
# Two views of the trimmed counts: violin plot (left) and box plot (right)
fig, axes = plt.subplots(ncols=2, figsize=(10, 4))
violin_ax, box_ax = axes
sns.violinplot(trimmed, ax=violin_ax)
sns.boxplot(trimmed, ax=box_ax)
violin_ax.set_title("Tweets per user (trimmed at %s)" % MAX_PER_USER)
fig.tight_layout()

# Temporal distribution of captured tweets

In [None]:
# Creation time is the only column the temporal plots need
timeData = pd.read_sql_query(sql="select created_at from tweets", con=dao.engine)
print("Loaded %s tweets" % len(timeData))

In [None]:
# Convert to timestamps with one vectorised call over the column; the earlier
# row-by-row pd.Timestamp(...) apply was the slow step in this notebook
timeData['created_at'] = pd.to_datetime(timeData['created_at'])
# Constant column: summing it over a time bucket gives the number of tweets
timeData['tweet'] = 1

In [None]:
# Peek at the first five rows to check the conversion
timeData.head(5)

## Monthly tweet count summary

In [None]:
# Index by creation time, then total the constant 'tweet' column per '1M' bucket
by_creation_time = timeData.set_index('created_at')
monthly = by_creation_time.resample('1M').sum()
# weekly variant, kept for reference:
# tt = timeData.set_index('created_at').resample('1W').sum()

In [None]:
# Summary statistics of the per-month tweet totals
monthly.describe()

In [None]:
# Bar chart of tweets per month. Plots `monthly`, the resampled frame built
# above; the name `tt` used here before exists only in a commented-out line,
# so the cell raised NameError on a fresh kernel.
fig, axes = plt.subplots(figsize=(15,5))
monthly.plot(kind='bar', ax=axes)
axes.set_title('Tweet creation dates by month'); axes.set_ylabel("# tweets")
fig.tight_layout()

In [None]:
from bokeh.palettes import Spectral6, Category20, magma, inferno, viridis

def color_generator(num_colors, palette_function=viridis):
    """Yield, one at a time and in palette order, each entry of
    ``palette_function(num_colors)``.

    num_colors: passed straight through to the palette function
    palette_function: callable taking the number of colors and returning
        an iterable of colors; defaults to viridis
    """
    yield from palette_function(num_colors)


In [None]:
def ticker():
    """Replaces the numeric y axis label with the correct term.

    Looks the tick value up in a hardcoded {number: term} table and
    returns the result formatted as a string.

    NOTE(review): `tick` is not defined in this function or in any cell
    shown here, so a direct Python call raises NameError unless a global
    `tick` exists. The only visible use is the commented-out
    ``FuncTickFormatter.from_py_func(ticker)`` line in
    plot_tweet_distributions; presumably bokeh supplies `tick` as the raw
    tick value when it translates this function -- TODO confirm against
    the installed bokeh version.

    The dict seems to need to be hardcoded since bokeh
    messes with any args or values which seem like they should be
    in scope.
    """
    dd = {
        1: 'crps',
        2: 'migraine',
        3: 'fibromyalgia',
        4: 'spoonie',
        5: 'vulvodynia',
        6: 'endometriosis',
        7: 'neuropathy',
        8: 'arthritis',
        9: 'rhem_arthritis',
        10: 'shingles',
        11: 'backpain',
        12: 'headache'
    }

    # .get returns None for a tick outside 1..12 instead of raising KeyError
    term = dd.get( tick )
    return "{}".format( term )


In [None]:
from bokeh.io import show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
# NOTE(review): this rebinds the notebook-level name `data`, which the earlier
# cells use for the tweetID/userID DataFrame. Nothing shown in this notebook
# uses the bokeh sample data, and re-running the per-user count cells after
# this one (without reloading) would operate on the wrong object.
from bokeh.sampledata.commits import data
from bokeh.transform import jitter

# NOTE(review): output_notebook() is not imported by name anywhere shown here;
# it appears to reach plot_tweet_distributions through this star import.
from bokeh.plotting import *
from bokeh.models import FuncTickFormatter
from bokeh.models.tickers import FixedTicker


def plot_tweet_distributions(frame, terms, title='tweet frequencies'):
    """Show a jittered bokeh scatter of tweets over time inside the notebook.

    frame: data handed to ColumnDataSource; the glyph reads its
        'created_at' column for x and its 'tweet' column for y
    terms: sequence whose length decides how many y ticks are shown
        (at 1..len(terms)); the values themselves are not used
    title: text placed above the plot
    Nothing is returned; the figure is displayed via show().
    """
    # A palette of size one: every point gets the same fill color
    color = next(color_generator(1))

    # Route bokeh output to the notebook
    output_notebook()

    # Time on the x axis, fixed 800x500 canvas
    figure_options = dict(
        title=title,
        x_axis_type="datetime",
        plot_width=800,
        plot_height=500,
        x_axis_label='timestamp',
        y_axis_label='tweet',
    )
    p = figure(**figure_options)

    source = ColumnDataSource(frame)
    # Vertical jitter spreads points that share the same 'tweet' value
    jittered_y = jitter('tweet', width=0.5, range=p.y_range)
    p.circle(x='created_at', y=jittered_y, fill_color=color, source=source, alpha=0.6)

    p.x_range.range_padding = 0
    p.ygrid.grid_line_color = None
    p.legend.orientation = "horizontal"

    # Show ticks only at 1..len(terms), one per entry in `terms`
    tick_locations = list(range(1, len(terms) + 1))
    p.yaxis.ticker = FixedTicker(ticks=tick_locations)

    # Swapping the numbers for term labels is currently disabled:
#     p.yaxis.formatter = FuncTickFormatter.from_py_func(ticker)

    # Render the plot
    show(p)

In [None]:
# A single-entry terms list, so exactly one y tick is drawn
plot_tweet_distributions(frame=timeData, terms=['created_at'])

# Users

## Users with tons of followers

These will often be celebrities

In [None]:
# Accounts ranked by follower count, largest first, capped at LIMIT rows
LIMIT = 25
query = """
SELECT screen_name, followers_count, friends_count, statuses_count 
FROM users 
ORDER BY followers_count DESC 
LIMIT %s""" % LIMIT
popular = pd.read_sql_query(sql=query, con=dao.engine, index_col='screen_name')

In [None]:
# Display the follower-ranked table (at most LIMIT rows, indexed by screen_name)
popular

## By status count

In [None]:
# Accounts ranked by lifetime status count, largest first, capped at LIMIT rows
LIMIT = 25
query = """
SELECT screen_name, followers_count, friends_count, statuses_count 
FROM users 
ORDER BY statuses_count DESC 
LIMIT %s""" % LIMIT
freq = pd.read_sql_query(sql=query, con=dao.engine, index_col='screen_name')

In [None]:
# Display the status-count-ranked table (at most LIMIT rows, indexed by screen_name)
freq