## Identify and store sibling mentions, using the criteria determined by text analyses.

In [1]:
import json
import requests
import logging,sys
from getpass import getpass 
import os

In [2]:
# Regular files in the current working directory. os.listdir() returns entries
# in arbitrary order, so sort them to keep the processing order (and therefore
# the order of the collected tweets) reproducible between runs.
files = sorted(f for f in os.listdir('.') if os.path.isfile(f))
# The data dumps are the files whose name contains 'April'.
datfiles = [f for f in files if 'April' in f]
datfiles

['April10_1a',
 'April10_1b',
 'April10_1c',
 'April10_2.json',
 'April10_3a.txt',
 'April10_3b.json',
 'April10_4a',
 'April10_4b',
 'April10_4c',
 'April10_5a',
 'April10_5b',
 'April10_5c',
 'April10_5d',
 'April10_5e',
 'April10_5f',
 'April10_5g',
 'April10_5h',
 'April10_5i',
 'April11.json',
 'April9.json']

In [4]:
# Hashtags that mark a tweet as off-topic. The filtering cell compares these
# against the lower-cased tweet text, so every entry must be lower-case.
# NOTE(review): '#lifecourbeeasier' may be a misspelling of another hashtag -
# confirm against the collected tweets before changing it.
irrel_hashtags = [
    '#equalpayday', '#tuesdaythoughts', '#lifecourbeeasier', '#zuckerberg',
    '#brochure', '#rack', '#flyer', '#roll',
    '#cbx_bloomingdays', '#felizmartes', '#temblor', '#mondaymotivation',
    '#americanidol', '#foodasitcom', '#michaelcohen', '#fcbsfc', '#fft18',
    '#onlychild',
]
# Plain keywords that mark a tweet as off-topic. They are matched as substrings
# of the tweet text, so 'market' also matches 'marketing', 'digit' matches 'digital', etc.
irrel_keywords = ['research', 'keyword', 'market', 'digit']
# Every term whose presence in a tweet causes the tweet to be discarded.
irrel = irrel_hashtags + irrel_keywords

In [5]:
def _tweet_text(a_tweet):
    """Return the tweet's full text, or None if the object has no text field.

    When the tweet has an 'extended_tweet' entry its 'full_text' is used
    (the complete text of a truncated tweet); otherwise the top-level
    'text' field is returned if present.
    """
    if 'extended_tweet' in a_tweet:
        return a_tweet["extended_tweet"]["full_text"]
    return a_tweet.get("text")


# Collect, across all data files, the tweets that survive every filter below:
# English (or no language tag), not a retweet, at least one user mention,
# and no irrelevant hashtag/keyword in the text.
tweets_c = []
for fname in datfiles:
    print("Now the file being processed is", fname)
    with open(fname, 'r') as f:
        my_tweets = json.load(f)
    # Drop a trailing None entry. Checking for emptiness first avoids an
    # IndexError when a file contains an empty list.
    if my_tweets and my_tweets[-1] is None:
        del my_tweets[-1]
    for a_tweet in my_tweets:
        # Defensive: skip null / non-object entries wherever they occur,
        # not only at the end of the list.
        if not isinstance(a_tweet, dict):
            continue
        # Filter out non-English tweets; tweets without a 'lang' field are kept.
        if 'lang' in a_tweet and a_tweet['lang'] != 'en':
            continue
        # Filter out retweets.
        if 'retweeted_status' in a_tweet:
            continue
        if 'entities' not in a_tweet:
            continue
        # Filter out tweets with no mentions (a missing or empty list both count).
        if not a_tweet['entities'].get('user_mentions'):
            continue
        text = _tweet_text(a_tweet)
        # Without this check a tweet lacking both text fields would be tested
        # against the previous tweet's text (stale variable) or raise NameError.
        if text is None:
            continue
        # Lower-case once instead of once per irrelevant term.
        lowered = text.lower()
        # Filter out tweets containing irrelevant hashtags or keywords.
        if any(word in lowered for word in irrel):
            continue
        tweets_c.append(a_tweet)
    print("Now there are", len(tweets_c), "tweets.")

Now the file being processed is April10_1a
Now there are 2699 tweets.
Now the file being processed is April10_1b
Now there are 4993 tweets.
Now the file being processed is April10_1c
Now there are 6901 tweets.
Now the file being processed is April10_2.json
Now there are 9358 tweets.
Now the file being processed is April10_3a.txt
Now there are 9358 tweets.
Now the file being processed is April10_3b.json
Now there are 9358 tweets.
Now the file being processed is April10_4a
Now there are 10494 tweets.
Now the file being processed is April10_4b
Now there are 11699 tweets.
Now the file being processed is April10_4c
Now there are 12630 tweets.
Now the file being processed is April10_5a
Now there are 13725 tweets.
Now the file being processed is April10_5b
Now there are 14852 tweets.
Now the file being processed is April10_5c
Now there are 16121 tweets.
Now the file being processed is April10_5d
Now there are 17389 tweets.
Now the file being processed is April10_5e
Now there are 18651 tweets.

From this step I got 24,031 tweets.

In [8]:
# Persist the filtered tweet objects, unmodified, as one JSON array so that
# later steps can reload them without re-running the filter.
with open('sibling_tweets_raw.txt', mode='w') as raw_out:
    json.dump(tweets_c, raw_out)

Next, extract the tweet id, the username, the tweet text, and the user mentions from these tweet objects, for ease of processing in R.

In [68]:
# Build one compact record per filtered tweet, holding only the id, the
# author's screen name, the text, and the screen names of mentioned users.
tweet_obj_c = []
for a_tweet in tweets_c:
    a_tweet_dict = {
        'id_str': a_tweet['id_str'],
        'screen_name': a_tweet['user']['screen_name'],
    }
    # Prefer the extended tweet's full text when it is present; the 'text'
    # key is simply left out if the tweet has neither field.
    if 'extended_tweet' in a_tweet:
        a_tweet_dict['text'] = a_tweet["extended_tweet"]["full_text"]
    elif 'text' in a_tweet:
        a_tweet_dict['text'] = a_tweet["text"]
    # A tweet can mention several users, so this is always a list.
    a_tweet_dict['user_mentions'] = [
        mention['screen_name'] for mention in a_tweet['entities']['user_mentions']
    ]
    tweet_obj_c.append(a_tweet_dict)

In [71]:
# Display the first shortened record to sanity-check its fields.
tweet_obj_c[0]

{'id_str': '983557792014393344',
 'screen_name': 'CarlHLam',
 'text': '@WGRZ It is so rare that we’re all together and yes, we’ve established that I’m the tallest. #NationalSiblingsDay #BeOn2 https://t.co/aC9nXcvmgj',
 'user_mentions': ['WGRZ']}

In [72]:
# Persist the shortened records as one JSON array.
with open('sibling_tweets_short.txt', mode='w') as short_out:
    json.dump(tweet_obj_c, short_out)

In [73]:
# Report how many shortened records were built, so the number can be
# compared with the record count obtained when the file is read in R.
short_count = len(tweet_obj_c)
print(short_count)

24031
