# Count mined tweets
Can we get an estimate of how many tweets we're dealing with? 

We've mined data from the following hurricanes:

- Hurricane Harvey
- Hurricane Irma
- Hurricane Maria
- Hurricane Nate

In [25]:
import os
import gzip
import re

In [27]:
data_dir = '../../data/mined_tweets/'
# Materialize the lower-cased names as a list: the collection is scanned once
# per hurricane, and a lazy map() iterator would be empty after the first scan.
tweet_files = [name.lower() for name in os.listdir(data_dir)]
hurricane_names = ['harvey', 'irma', 'maria', 'nate']
for n in hurricane_names:
    print("testing hurricane %s"%(n))
    # Plain substring test: every file name that mentions the hurricane is listed.
    matching_files = [x for x in tweet_files if n in x]
    print('\n'.join(matching_files))

testing hurricane harvey
archive_#irma,#hurricaneirma,#harvey,#hurricaneharvey_aug-19-17_aug-19-17.gz
archive_#irma,#hurricaneirma,#harvey,#hurricaneharvey_aug-27-17_aug-27-17.gz
archive_#irma,#hurricaneirma,#harvey,#hurricaneharvey_aug-31-17_aug-31-17.gz
example_ambiguous_tweets_hurricaneharvey.tsv
hurricane_harvey.csv
hurricaneharvey_ids.txt.gz
#irma,#hurricaneirma,#houstonstrong,#harvey,#hurricaneharvey_houston2017-08-17_2017-09-14.gz
harvey_users
#irma,#hurricaneirma,#harvey,#hurricaneharvey_2017-08-17_2017-08-27.gz
archive_#irma,#hurricaneirma,#harvey,#hurricaneharvey_aug-17-17_aug-17-17.gz
archive_#irma,#hurricaneirma,#harvey,#hurricaneharvey_aug-20-17_aug-20-17.gz
archive_#irma,#hurricaneirma,#harvey,#hurricaneharvey_aug-18-17_aug-18-17.gz
archive_#irma,#hurricaneirma,#harvey,#hurricaneharvey_aug-24-17_aug-24-17.gz
archive_#irma,#hurricaneirma,#harvey,#hurricaneharvey_aug-28-17_aug-28-17.gz
archive_#irma,#hurricaneirma,#harvey,#hurricaneharvey_aug-22-17_aug-22-17.gz
archive_#irm

In [55]:
def count_lines(file_list):
    """
    Count the total number of lines across a collection of gzip files.

    file_list: iterable of paths to gzip-compressed files.
    Returns the summed line count of all files (0 for an empty collection).
    """
    ctr = 0
    for f in file_list:
        # The context manager closes each archive as soon as it has been read
        # instead of leaving the handle open until garbage collection.
        with gzip.open(f, 'r') as tweet_file:
            ctr += sum(1 for _ in tweet_file)
    return ctr
def count_matching_lines(file_list, line_matcher):
    """
    Count the lines, across a collection of gzip files, that match a regex.

    file_list: iterable of paths to gzip-compressed files.
    line_matcher: compiled regular expression (text or bytes pattern); a line
        is counted when the pattern is found anywhere in it.
    Returns the number of matching lines (0 for an empty collection).
    """
    pattern_type = type(line_matcher.pattern)
    ctr = 0
    for f in file_list:
        # Close each archive as soon as it has been scanned.
        with gzip.open(f, 'r') as tweet_file:
            for l in tweet_file:
                # gzip yields byte strings; when the pattern is of a different
                # string type (text pattern on Python 3), decode the line so
                # the search does not raise TypeError.
                if not isinstance(l, pattern_type):
                    l = l.decode('utf-8', 'replace')
                # search() stops at the first hit; no need to collect them all.
                if line_matcher.search(l):
                    ctr += 1
    return ctr

## Harvey

In [39]:
tweet_files = os.listdir(data_dir)
# Raw string avoids the invalid '\#' escape; the dot before "gz" is escaped and
# the pattern is anchored so only names that really end in ".gz" are kept.
corpus_matcher = re.compile(r'.*#[Hh]arvey.*\.gz$')
# Build a real list: it is printed here and then reused when counting, which a
# lazy map() iterator would not survive.
historical_files = [os.path.join(data_dir, x) for x in tweet_files if corpus_matcher.match(x)]
print('\n'.join(historical_files))

../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-19-17_Aug-19-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-27-17_Aug-27-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-31-17_Aug-31-17.gz
../../data/mined_tweets/#Irma,#HurricaneIrma,#HoustonStrong,#Harvey,#HurricaneHarvey_Houston2017-08-17_2017-09-14.gz
../../data/mined_tweets/#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_2017-08-17_2017-08-27.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-17-17_Aug-17-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-20-17_Aug-20-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-18-17_Aug-18-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-24-17_Aug-24-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug

In [40]:
# Path to the 'HurricaneHarvey_ids_rehydrated_clean.txt.gz' archive in data_dir.
rehydrated_tweet_file = os.path.join(
    data_dir,
    'HurricaneHarvey_ids_rehydrated_clean.txt.gz'
)

In [47]:
# list() keeps the concatenation valid even when historical_files is a lazy
# iterable (e.g. a map object) rather than a list.
all_files = list(historical_files) + [rehydrated_tweet_file]
line_matcher = re.compile('[Hh]arvey')
# Count only the lines that mention Harvey across all selected archives.
harvey_ctr = count_matching_lines(all_files, line_matcher)

In [48]:
# Report the number of lines that matched the Harvey pattern.
print('%d total harvey lines' % harvey_ctr)

1862036 total harvey lines


## Irma

In [52]:
# The dot before "gz" is escaped and the pattern anchored so only names that
# really end in ".gz" are selected.
corpus_matcher = re.compile(r'.*[Ii]rma.*\.gz$')
# Build a real list: it is printed here and then reused when counting, which a
# lazy map() iterator would not survive.
historical_files = [os.path.join(data_dir, x) for x in tweet_files if corpus_matcher.match(x)]
print('\n'.join(historical_files))

../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-19-17_Aug-19-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-27-17_Aug-27-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-31-17_Aug-31-17.gz
../../data/mined_tweets/#Irma,#HurricaneIrma,#HoustonStrong,#Harvey,#HurricaneHarvey_Houston2017-08-17_2017-09-14.gz
../../data/mined_tweets/#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_2017-08-17_2017-08-27.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-17-17_Aug-17-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-20-17_Aug-20-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-18-17_Aug-18-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug-24-17_Aug-24-17.gz
../../data/mined_tweets/archive_#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_Aug

In [49]:
# Count every line mentioning Irma in the currently selected archives.
line_matcher = re.compile('[Ii]rma')
irma_ctr = count_matching_lines(file_list=historical_files, line_matcher=line_matcher)
print('%d Irma tweets' % irma_ctr)

764993 Irma tweets


## Maria

In [51]:
# The dot before "gz" is escaped and the pattern anchored so only names that
# really end in ".gz" are selected.
corpus_matcher = re.compile(r'.*[Mm]aria.*\.gz$')
# Build a real list: it is printed here and then reused when counting, which a
# lazy map() iterator would not survive.
historical_files = [os.path.join(data_dir, x) for x in tweet_files if corpus_matcher.match(x)]
print('\n'.join(historical_files))

../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-09-29_2017-09-30.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-08-30_2017-08-31.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-09-01_2017-09-02.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-09-02_2017-09-03.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-09-03_2017-09-04.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-08-31_2017-09-01.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-09-14_2017-09-15.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-09-15_2017-09-16.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-09-17_2017-09-18.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-09-18_2017-09-19.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-09-23_2017-09-24.gz
../../data/mined_tweets/#Maria,#HurricaneMaria,#PuertoRico_2017-09-22_2017-0

In [57]:
line_matcher = re.compile('[Mm]aria')
# FIXME: count_matching_lines(historical_files, line_matcher) did not work for
# these archives (cause not yet diagnosed), so every line is counted instead.
# The figure below is therefore a total line count, not a keyword-filtered one.
maria_ctr = count_lines(file_list=historical_files)
print('%d Maria tweets' % maria_ctr)

1855821 Maria tweets


## Nate

In [58]:
# The dot before "gz" is escaped and the pattern anchored so only names that
# really end in ".gz" are selected.
corpus_matcher = re.compile(r'.*[Nn]ate.*\.gz$')
# Build a real list: it is printed here and then reused when counting, which a
# lazy map() iterator would not survive.
historical_files = [os.path.join(data_dir, x) for x in tweet_files if corpus_matcher.match(x)]
print('\n'.join(historical_files))

../../data/mined_tweets/#Nate,#HurricaneNate,#nateupdates,#NateHurricane_2017-10-07_2017-10-08.gz
../../data/mined_tweets/#Nate,#HurricaneNate,#nateupdates,#NateHurricane_2017-10-11_2017-10-12.gz
../../data/mined_tweets/#Nate,#HurricaneNate,#nateupdates,#NateHurricane_2017-10-04_2017-10-05.gz
../../data/mined_tweets/#Nate,#HurricaneNate,#nateupdates,#NateHurricane_2017-10-10_2017-10-11.gz
../../data/mined_tweets/#Nate,#HurricaneNate,#nateupdates,#NateHurricane_2017-10-09_2017-10-10.gz
../../data/mined_tweets/#Nate,#HurricaneNate,#nateupdates,#NateHurricane_2017-10-06_2017-10-07.gz
../../data/mined_tweets/#Nate,#HurricaneNate,#nateupdates,#NateHurricane_2017-10-05_2017-10-06.gz
../../data/mined_tweets/#Nate,#HurricaneNate,#nateupdates,#NateHurricane_2017-10-08_2017-10-09.gz


In [59]:
# Count every line mentioning Nate in the currently selected archives.
line_matcher = re.compile('[Nn]ate')
nate_ctr = count_matching_lines(file_list=historical_files, line_matcher=line_matcher)
print('%d Nate tweets' % nate_ctr)

49911 Nate tweets


## Summary

Name  | Count
------------- | -------------
Harvey  | 1862036
Irma  | 764993
Maria  | 1855821 (total lines; keyword matcher not applied)
Nate  | 49911

We might be able to get cleaner data by collecting prior data on the top 1,000 local users who tweeted during each hurricane (TBD).