In [156]:
from datetime import datetime, timedelta
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
import time
import tweepy

In [157]:
# Read in the consumer key, consumer secret, access token, and access token secret.
consumer_key = open("../../auth/twitter/consumer_key.txt").read()[:-1]
consumer_secret = open("../../auth/twitter/consumer_secret.txt").read()[:-1]
access_token = open("../../auth/twitter/access_token.txt").read()[:-1]
access_token_secret = open("../../auth/twitter/access_token_secret.txt").read()[:-1]

auth = tweepy.OAuthHandler(consumer_key = consumer_key,
                           consumer_secret = consumer_secret)
auth.set_access_token(access_token,
                      access_token_secret)

api = tweepy.API(auth)

current_time = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
cutoff_time = (datetime.utcnow() - timedelta(days = 1)).strftime("%Y-%m-%d %H:%M:%S")

In [158]:
# Set the users of interest.
users = ['nytimes',
        'thesun',
        'thetimes',
        'ap',
        'cnn',
        'bbcnews',
        'cnet',
        'msnuk',
        'telegraph',
        'usatoday',
        'wsj',
        'washingtonpost',
        'bostonglobe',
        'newscomauhq',
        'skynews',
        'sfgate',
        'ajenglish',
        'independent',
        'guardian',
        'latimes',
        'reutersagency',
        'abc',
        'business',
        'bw',
        'time']

In [164]:
def query_tweets(api, users):

    # Initialize a list to hold the tweets as JSON files.
    tweets_data = []

    # Loop over the users. (Hard to vectorize since I want to keep all tweets
    # on the same hierarchical level. I.e. I don't want a list of lists.)
    for user in range(len(users)):

        # Find the desired user.
        this_user = api.get_user(users[user])

        # Get their timeline.
        this_user_recent_tweets = api.user_timeline(user_id = this_user.id)

        # For each of their recent tweets, convert to JSON and store in a list.
        recent_tweets_json = list(map(get_tweet_json, this_user_recent_tweets))
        #recent_tweets_json = list(map(get_tweet_json, list(range(len(this_user_recent_tweets)))))

        # Append that list to the overall list of tweets.
        tweets_data += recent_tweets_json

    return(tweets_data)

# For a given user, convert a single tweet to JSON.
def get_tweet_json(tweet):
    json_str = json.dumps(tweet._json)
    tweet = json.loads(json_str)
    return(tweet)

print("Getting tweets...")

tweets_data = query_tweets(api = api, users = users)

print("Done.")

Getting tweets...
Done.


In [151]:
#for i in range(500):
#    print(tweets_data[i]['text'])
tweets_data[13]['retweeted_status']

KeyError: 'retweeted_status'

In [165]:
# Convert the new tweets from JSON to a tidy pandas dataframe.
def wrangle_new_tweets(new_tweets):

    # Initialize a dataframe to hold the tweets.
    tweets = pd.DataFrame()

    # Get when the tweet was created.
    tweets['created_at'] = list(map(lambda tweet: tweet['created_at'], tweets_data))

    # Get the UTC offset, so that times can be correctly compared.
    tweets['utc_offset'] = list(map(lambda tweet: tweet['user']['utc_offset'], tweets_data))

    # Get the text in the tweet.
    tweets['text'] = list(map(lambda tweet: tweet['text'], tweets_data))

    # Get the url of the tweet itself.
    tweets['tweet_url'] = list(map(get_url, tweets_data))

    # Get the user's screen name.
    tweets['screen_name'] = list(map(lambda tweet: tweet['user']['screen_name'], tweets_data))

    # Get the user's username.
    tweets['name'] = list(map(lambda tweet: tweet['user']['name'], tweets_data))

    # Get the number of times the tweet was retweeted.
    tweets['retweet_count'] = list(map(lambda tweet: tweet['retweet_count'], tweets_data))

    # Get the number of times the tweet was favorited.
    tweets['favorite_count'] = list(map(lambda tweet: tweet['favorite_count'], tweets_data))
    
    # Get the retweet status.
    tweets['is_retweet'] = list(map(check_is_retweet, tweets_data))
    
    # Get the URL to the linked page, if there is one.
    #tweets['url_ext'] = list(map(lambda tweet: tweet['entities']['urls'][0]['url'], tweets_data))

    return(tweets)

# Get the URL of a tweet even if it's a retweet.
def get_url(tweet):
    return("https://twitter.com/" + tweet['user']['screen_name'] + "/status/" + tweet['id_str'])

# Check if a tweet is a retweet.
def check_is_retweet(tweet):
    try:
        exists = tweet['retweeted_status']
        return True
    except:
        return False

tidy_new_tweets = wrangle_new_tweets(new_tweets = tweets_data)

In [166]:
# Read in the previously stored tweets.
old_tweets = pd.read_csv('../data/candidate_tweets.csv')

# Combine the new data with the old data, and remove duplicate tweets.
all_tweets = (tidy_new_tweets.append(other = old_tweets)
                             .drop_duplicates(subset = 'tweet_url')
                             .reset_index(drop = True))

In [167]:
def remove_old_tweets(all_tweets, cutoff):

    # Convert the created_at variable to a timestamp.
    all_tweets['created_at_stamp'] = all_tweets.apply(tweet_time_to_timestamp, 1)

    # Remove all tweets that are older than the cutoff. (Default 24 hours.)
    all_tweets = all_tweets[all_tweets['created_at_stamp'] > cutoff]

    # Reorder the tweets by their creation time.
    all_tweets = all_tweets.sort_values(by = 'created_at_stamp', ascending = False)

    # Reset the index.
    all_tweets = all_tweets.reset_index(drop = True)

    return(all_tweets)

# Convert the time of a tweet to a UTC timestamp.
def tweet_time_to_timestamp(tweet):
    newtime = datetime.strptime(
        tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'
    )
    return(newtime)

# Remove any tweets that are older than 24 hours.
recent_tweets = remove_old_tweets(all_tweets, cutoff = cutoff_time)

In [168]:
# Save the updated, pruned dataset to csv.
recent_tweets.to_csv('../data/candidate_tweets.csv', index = False)

In [182]:
vectorizer = CountVectorizer(max_df=100,
                             max_features=200,
                             min_df=1,
                             stop_words='english')

In [183]:
tweet_texts = recent_tweets['text']

In [184]:
X = vectorizer.fit_transform(tweet_texts)

In [185]:
km = KMeans(n_clusters=2,
            init='k-means++',
            max_iter=100,
            n_init=10,
            verbose=False)

In [186]:
km.fit(X)

Initialization complete
Iteration  0, inertia 1290.000
Iteration  1, inertia 1237.909
Iteration  2, inertia 1225.970
Iteration  3, inertia 1217.078
Iteration  4, inertia 1212.692
Iteration  5, inertia 1208.849
Iteration  6, inertia 1204.772
Iteration  7, inertia 1204.437
Converged at iteration 7: center shift 0.000000e+00 within tolerance 1.330728e-06
Initialization complete
Iteration  0, inertia 1278.000
Iteration  1, inertia 1124.914
Iteration  2, inertia 1122.843
Converged at iteration 2: center shift 0.000000e+00 within tolerance 1.330728e-06
Initialization complete
Iteration  0, inertia 1292.000
Iteration  1, inertia 1224.598
Iteration  2, inertia 1196.126
Iteration  3, inertia 1168.883
Iteration  4, inertia 1159.423
Iteration  5, inertia 1157.834
Converged at iteration 5: center shift 0.000000e+00 within tolerance 1.330728e-06
Initialization complete
Iteration  0, inertia 1271.000
Iteration  1, inertia 1207.209
Iteration  2, inertia 1161.567
Iteration  3, inertia 1148.284
Iterati

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [195]:
for i in np.where(km.labels_ == 4):
    print(recent_tweets.loc[i]['text'])

65     Theresa May under pressure to say who will pay...
71     Tuesday's Sun: "Blood On His Hands" (via @Alli...
86     Tuesday's i: "'Dementia tax' U-turn" (via @All...
94     Tuesday's Mirror: "How can we ever trust Mrs U...
95     Tuesday's Mail: "Facebook Lets Teens See Porn"...
97     Tuesday's Times: "Pensioners to pay for May's ...
101    Tuesday's Guardian: "May's manifesto meltdown:...
103    Tuesday's Independent: "May's campaign in chao...
115    Tuesday's Telegraph: "Care cost chaos after Ma...
128    Tomorrow's front page: Pensioners to pay for M...
161    Tuesday's @Telegraph front page #tomorrowspape...
163    DAILY MIRROR FRONT PAGE: 'How can we ever trus...
166    GUARDIAN FRONT PAGE: 'May's manifesto meltdown...
171    May’s 'dementia tax'  U-turn https://t.co/ELir...
175    THE TIMES FRONT PAGE: 'Pensioners to pay for M...
192    Tuesday's Express: "Dementia runs in the famil...
195    Tuesday's Star: "Blooming Kell it's a scorcher...
203    Theresa May refuses to a

In [181]:
np.where(km.labels_ != 0)
#km.labels_
print(recent_tweets.loc[17])
print(recent_tweets.loc[30])
print(recent_tweets.loc[113])
print(recent_tweets.loc[152])

created_at                             Mon May 22 22:17:18 +0000 2017
utc_offset                                                      36000
text                Lounge room quite the fitting room https://t.c...
tweet_url           https://twitter.com/newscomauHQ/status/8667800...
screen_name                                               newscomauHQ
name                                                      news.com.au
retweet_count                                                       1
favorite_count                                                      1
is_retweet                                                      False
created_at_stamp                                  2017-05-22 22:17:18
notif_score                                                  -1.96667
Name: 17, dtype: object
created_at                             Mon May 22 22:12:06 +0000 2017
utc_offset                                                     -14400
text                RT @NYTNational: Ayisha Gomez's daughter wasn’

In [171]:
recent_tweets

Unnamed: 0,created_at,utc_offset,text,tweet_url,screen_name,name,retweet_count,favorite_count,is_retweet,created_at_stamp
0,Mon May 22 22:22:02 +0000 2017,3600,A hilarious series of pictures prove why elder...,https://twitter.com/TheSun/status/866781223535...,TheSun,The Sun,0,0,False,2017-05-22 22:22:02
1,Mon May 22 22:21:50 +0000 2017,3600,Theresa May says only poll that counts is on J...,https://twitter.com/msnuk/status/8667811747015...,msnuk,MSN UK,0,0,False,2017-05-22 22:21:50
2,Mon May 22 22:21:22 +0000 2017,36000,Disick home burglary ‘an inside job’ https://t...,https://twitter.com/newscomauHQ/status/8667810...,newscomauHQ,news.com.au,0,0,False,2017-05-22 22:21:22
3,Mon May 22 22:21:07 +0000 2017,-14400,The one ingredient that can make your summer d...,https://twitter.com/washingtonpost/status/8667...,washingtonpost,Washington Post,2,7,False,2017-05-22 22:21:07
4,Mon May 22 22:21:02 +0000 2017,-14400,President Trump told then-FBI Director James C...,https://twitter.com/CNN/status/866780972494319616,CNN,CNN,18,15,False,2017-05-22 22:21:02
5,Mon May 22 22:20:06 +0000 2017,-25200,Column: Trump's budget plan continues his dece...,https://twitter.com/latimes/status/86678073555...,latimes,Los Angeles Times,25,19,False,2017-05-22 22:20:06
6,Mon May 22 22:20:05 +0000 2017,-14400,Nepal has issued a record 371 permits this yea...,https://twitter.com/USATODAY/status/8667807327...,USATODAY,USA TODAY,2,8,False,2017-05-22 22:20:05
7,Mon May 22 22:20:01 +0000 2017,3600,First animals born from sperm stored on Intern...,https://twitter.com/Telegraph/status/866780715...,Telegraph,The Telegraph,3,0,False,2017-05-22 22:20:01
8,Mon May 22 22:20:01 +0000 2017,3600,The Church of Satan is distancing itself from ...,https://twitter.com/Independent/status/8667807...,Independent,The Independent,9,11,False,2017-05-22 22:20:01
9,Mon May 22 22:19:14 +0000 2017,3600,RT @BBCBreaking: Police warn people to avoid a...,https://twitter.com/BBCNews/status/86678051784...,BBCNews,BBC News (UK),1016,0,True,2017-05-22 22:19:14


In [173]:
def compute_notif_score(tweet):
    d = datetime.utcnow() - datetime.strptime(str(recent_tweets.loc[0]['created_at_stamp']), "%Y-%m-%d %H:%M:%S")
    minutes_passed = d.seconds/60
    score = np.mean(int(tweet['favorite_count']) + int(tweet['retweet_count'])) - minutes_passed - 10000*int(tweet['is_retweet'])
    return(score)

recent_tweets['notif_score'] = recent_tweets.apply(compute_notif_score, 1)

np.max(recent_tweets['notif_score'])

5963.0166666666664

In [174]:
recent_tweets.sort_values(by = "notif_score", ascending = False)

Unnamed: 0,created_at,utc_offset,text,tweet_url,screen_name,name,retweet_count,favorite_count,is_retweet,created_at_stamp,notif_score
227,Mon May 22 21:03:14 +0000 2017,-14400,BREAKING: Top House Democrat: Documents show F...,https://twitter.com/AP/status/866761392371773442,AP,The Associated Press,2761,3206,False,2017-05-22 21:03:14,5963.016667
278,Mon May 22 20:38:59 +0000 2017,-14400,NEW: James Comey will consult with special cou...,https://twitter.com/ABC/status/866755288761847811,ABC,ABC News,1071,1694,False,2017-05-22 20:38:59,2761.016667
217,Mon May 22 21:06:35 +0000 2017,-14400,Breaking News: Michael Flynn misled Pentagon o...,https://twitter.com/nytimes/status/86676223707...,nytimes,The New York Times,1208,1366,False,2017-05-22 21:06:35,2570.016667
267,Mon May 22 20:45:03 +0000 2017,-14400,Former US President Barack Obama is set to joi...,https://twitter.com/CNN/status/866756818957201408,CNN,CNN,330,1476,False,2017-05-22 20:45:03,1802.016667
425,Mon May 22 18:08:47 +0000 2017,-14400,BREAKING: Flynn's letter to Senate committee c...,https://twitter.com/AP/status/866717493003419649,AP,The Associated Press,793,860,False,2017-05-22 18:08:47,1649.016667
325,Mon May 22 20:14:09 +0000 2017,-14400,One reader's response to Michael Flynn's refus...,https://twitter.com/nytimes/status/86674904084...,nytimes,The New York Times,485,1157,False,2017-05-22 20:14:09,1638.016667
396,Mon May 22 19:12:02 +0000 2017,-14400,Michael Flynn's decision to invoke the Fifth A...,https://twitter.com/nytimes/status/86673340912...,nytimes,The New York Times,539,958,False,2017-05-22 19:12:02,1493.016667
416,Mon May 22 18:37:35 +0000 2017,3600,"""Why should we believe you a third time?"" Andr...",https://twitter.com/BBCNews/status/86672473920...,BBCNews,BBC News (UK),579,663,False,2017-05-22 18:37:35,1238.016667
405,Mon May 22 19:00:52 +0000 2017,-14400,Breaking News: Michael Flynn refused to hand o...,https://twitter.com/nytimes/status/86673059861...,nytimes,The New York Times,581,592,False,2017-05-22 19:00:52,1169.016667
413,Mon May 22 18:45:08 +0000 2017,-14400,The Trump administration is trying to block ef...,https://twitter.com/nytimes/status/86672663917...,nytimes,The New York Times,601,504,False,2017-05-22 18:45:08,1101.016667


In [163]:
tidy_new_tweets.to_csv('../data/candidate_tweets.csv', index = False)

old_tweets.to_csv("")

time_of_most_recent_notification.to_txt("")

new_notifications.to_csv("")