# Get descriptive stats for Twitter data

Get descriptive stats for all Twitter data.

In [1]:
import pandas as pd
import numpy as np

## Load data
After processing data [here](collect_validate_NEs_in_tweets.py), we can load it.

In [2]:
combined_data = pd.read_csv('../../data/mined_tweets/combined_tweet_tag_data_NE_flat_valid.gz', sep='\t', index_col=False, compression='gzip')

## Total tweets, NEs per data source

In [7]:
combined_data.loc[:, 'NE_type'].value_counts()

LOCATION                     505773
PERSON                       323781
other                        117692
company                       54120
band                          48118
movie                         45056
O                             38079
facility                      31647
ORGANIZATION                  23229
product                       20781
tvshow                        19054
NUMBER                        18694
CAUSE_OF_DEATH                16082
sportsteam                    12198
COUNTRY                        7122
CITY                           5970
NONE                           5614
MISC                           5584
DATE                           4163
TITLE                          3402
STATE_OR_PROVINCE              3252
PERCENT                         595
MONEY                           573
NATIONALITY                     556
IDEOLOGY                        196
CRIMINAL_CHARGE                  98
RELIGION                         73
ENTITY                      

In [8]:
group_var = 'data_name_fixed'
LOC_TYPES = set(['LOCATION', 'COUNTRY', 'CITY'])
tweet_counts = combined_data.groupby(group_var).apply(lambda x: x.loc[:, 'id'].nunique())
NE_counts = combined_data.groupby(group_var).apply(lambda x: x[(x.loc[:, 'NE_fixed']!='') & (x.loc[:, 'NE_type'].isin(LOC_TYPES))].loc[:, 'NE_fixed'].shape[0])
valid_NE_counts = combined_data.groupby(group_var).apply(lambda x: x[x.loc[:, 'valid_loc']==1].loc[:, 'NE_fixed'].shape[0])
display(tweet_counts)
display(NE_counts)
display(valid_NE_counts)

data_name_fixed
florence     66595
harvey      679400
irma        809423
maria       313088
michael      52506
dtype: int64

data_name_fixed
florence     28670
harvey      181636
irma        229315
maria        57237
michael      22007
dtype: int64

data_name_fixed
florence     9971
harvey      71500
irma        67298
maria        7185
michael      8509
dtype: int64

## Get top NEs per data source

In [18]:
pd.set_option('display.max_colwidth',100)
top_k = 10
combined_data_dedup = combined_data.drop_duplicates('id', inplace=False)
top_NEs = combined_data.groupby(group_var).apply(lambda x: x[x.loc[:, 'NE_type'].isin(LOC_TYPES)].loc[:, 'NE_fixed'].value_counts()[:top_k].index.tolist())
top_valid_NEs = combined_data.groupby(group_var).apply(lambda x: x[x.loc[:, 'valid_loc']==1].loc[:, 'NE_fixed'].value_counts()[:top_k].index.tolist())
display(top_NEs)
display(top_valid_NEs)

data_name_fixed
florence    [north_carolina, nc, #florence, wilmington, south_carolina, sc, florence, carolina, new_bern, ca...
harvey             [texas, houston, tx, #houston, corpus_christi, louisiana, rockport, austin, mexico, america]
irma                         [florida, miami, fl, cuba, puerto_rico, tampa, naples, georgia, orlando, key_west]
maria                [puerto_rico, san_juan, florida, america, dominica, mexico, virgin_islands, texas, fl, tx]
michael     [florida, panama_city, mexico_beach, fl, georgia, panhandle, tallahassee, panama_city_beach, bay...
dtype: object

data_name_fixed
florence    [wilmington, florence, carolina, new_bern, myrtle_beach, raleigh, fayetteville, charlotte, lumbe...
harvey      [houston, corpus_christi, rockport, austin, dallas, san_antonio, galveston, port_aransas, new_or...
irma        [miami, tampa, naples, orlando, marco_island, atlanta, jacksonville, miami_beach, houston, charl...
maria             [san_juan, vieques, ponce, carolina, caguas, utuado, guaynabo, yabucoa, aguadilla, las_vegas]
michael     [panama_city, mexico_beach, panhandle, tallahassee, panama_city_beach, bay_county, lynn_haven, g...
dtype: object

## Get stats for power user data
Let's do the same thing but for the power users.

In [1]:
import pandas as pd

In [2]:
import re
power_user_data_full = pd.read_csv('../../data/mined_tweets/combined_data_power_user_NE_flat_valid.gz', sep='\t', index_col=False, compression='gzip')
# limit full data to event-related tweets
data_name_var = 'data_name_fixed'
data_name_matchers = {
    name : re.compile(name) for name in power_user_data_full.loc[:, data_name_var].unique()
}
id_var = 'id'
txt_var = 'txt'
event_related_var = 'is_event_related'
event_related = power_user_data_full.groupby(id_var).apply(lambda x: data_name_matchers[x.loc[:, data_name_var].iloc[0]].search(x.loc[:, 'txt'].iloc[0].lower()) is not None).reset_index().rename(columns={0 : event_related_var})
power_user_data_full = pd.merge(power_user_data_full, event_related, on=id_var)
# restrict to full data to LOCATION NEs
LOC_TYPES = set(['LOCATION', 'COUNTRY', 'CITY'])
power_user_data = power_user_data_full[(power_user_data_full.loc[:, 'NE_type'].isin(LOC_TYPES)) & (power_user_data_full.loc[:, 'valid_loc']==1)]
display(power_user_data.head())

Unnamed: 0,id,txt,data_name_fixed,username,date,lang,NE,NE_type,NE_LOC,valid_loc,NE_fixed,is_event_related
21,1042132780015120384,Latest on the @santeecooper coal ash pits in C...,florence,AndyShain,2018-09-18 15:26:31,en,Conway,LOCATION,True,True,conway,True
23,1042132780015120384,Latest on the @santeecooper coal ash pits in C...,irma,AndyShain,2018-09-18 15:26:31,en,Conway,LOCATION,True,True,conway,True
25,1042132780015120384,Latest on the @santeecooper coal ash pits in C...,michael,AndyShain,2018-09-18 15:26:31,en,Conway,LOCATION,True,True,conway,True
35,1042126288805421063,SC #Florence briefing: Waccamaw River expected...,florence,AndyShain,2018-09-18 15:00:43,en,crest,LOCATION,True,True,crest,True
36,1042126288805421063,SC #Florence briefing: Waccamaw River expected...,florence,AndyShain,2018-09-18 15:00:43,en,Conway,LOCATION,True,True,conway,True


In [4]:
# limit to event-related
event_related_power_user_data_full = power_user_data_full[power_user_data_full.loc[:, event_related_var]==1]

In [6]:
# tweets
data_name_var = 'data_name_fixed'
id_var = 'id'
display(event_related_power_user_data_full.groupby(data_name_var).apply(lambda x: x.loc[:, id_var].nunique()))

data_name_fixed
florence    17624
harvey      31563
irma        45913
maria       11332
michael      8828
dtype: int64

In [9]:
# authors
## NOTE! fewer than 200 authors because we subsetted by percentile: [95,100]
author_var = 'username'
display(event_related_power_user_data_full.groupby(data_name_var).apply(lambda x: x.loc[:, author_var].nunique()))

data_name_fixed
florence    185
harvey      164
irma        178
maria       115
michael     136
dtype: int64

In [7]:
# LOC NEs
data_name_var = 'data_name_fixed'
display(event_related_power_user_data_full.loc[:, data_name_var].value_counts())

irma        77114
harvey      50050
florence    29066
maria       18204
michael     14655
Name: data_name_fixed, dtype: int64

In [20]:
# example NEs
NE_var = 'NE_fixed'
top_k = 10
display(power_user_data.groupby(data_name_var).apply(lambda x: x.loc[:, NE_var].value_counts()[:top_k]))

data_name_fixed                
florence         florence           828
                 wilmington         771
                 houston            638
                 fayetteville       481
                 charleston         477
                 myrtle_beach       377
                 alert              313
                 lumberton          271
                 new_bern           262
                 charlotte          251
harvey           houston           6166
                 barcelona         1055
                 miami              799
                 corpus_christi     701
                 rockport           624
                 atlanta            617
                 austin             546
                 harvey             489
                 dallas             432
                 naples             416
irma             houston           5263
                 miami             2717
                 white_house       1033
                 naples             992
        