In [91]:
import csv
import json
import os
from datetime import datetime
from pprint import pprint

import psycopg2
from dateutil import parser

In [92]:
# connect to the tweets database
# The password is taken from the PGPASSWORD environment variable so that no
# credential is stored in (or shared with) the notebook itself.
db_password = os.environ.get("PGPASSWORD")
if db_password is None:
    raise RuntimeError("Set the PGPASSWORD environment variable "
                       "before running this notebook")

conn = psycopg2.connect(database="postgres",
                        user="postgres",
                        password=db_password,
                        host="localhost")

# define the cursor to be able to query the database
cur = conn.cursor()

In [93]:
# query the database and collect the `tweet` column of every row of the
# twitter table into a list (later cells index each entry like a dict)
cur.execute("SELECT tweet FROM twitter")
tweets_all = []
for record in cur:
    tweets_all.append(record[0])

In [94]:
# number of tweet records loaded from the twitter table
len(tweets_all)

124672

In [95]:
# classify tweets as retweet/mention/tweet and store the label under the
# TWEET_TYPE key: 'retweet' when the tweet has a retweeted_status entry,
# otherwise 'mention' when it lists at least one user mention, otherwise
# a plain 'tweet'
for status in tweets_all:
    if 'retweeted_status' in status:
        tweet_type = 'retweet'
    elif len(status['entities']['user_mentions']) > 0:
        tweet_type = 'mention'
    else:
        tweet_type = 'tweet'
    status['TWEET_TYPE'] = tweet_type

In [96]:
# extract relevant content from tweets
def _to_row(status):
    """Reduce one raw tweet dict to the fields used by the analysis.

    The text is stored as a unicode_escape-encoded byte string, so any
    non-ASCII character in it appears as a backslash escape sequence.
    """
    return {'user' : status['user']['screen_name'],
            'text' : status['text'].encode('unicode_escape'),
            'datetime' : parser.parse(status['created_at']),
            'type' : status['TWEET_TYPE'],
            'id' : status['id_str']}

tweets = [_to_row(status) for status in tweets_all]

In [97]:
# display the first extracted tweet as a sample of the reduced record
tweets[0]

{'datetime': datetime.datetime(2016, 12, 17, 21, 13, 29, tzinfo=tzutc()),
 'id': u'810231467683016704',
 'text': 'The OA on Netflix goes crazy, as does "3%"',
 'type': 'tweet',
 'user': u'Alivingtribunal'}

In [98]:
for line in tweets[:10]:
    print line['text']

The OA on Netflix goes crazy, as does "3%"
RT @reaganbabyxx: seriously love just laying around watching Netflix and not having to worry about tests papers or projects
If you love science fiction &amp; stuff on alternate dimensions, then check out The OA on Netflix !
Forget The Get Down, if you want to learn the true story, you need to watch Hip Hop Evolution on Netflix, excellent documentary series
RT @PelisResumidas: The Crown: Serie de Netflix que nos muestra que la reina Isabel II no siempre fue una vieja gru\xf1ona. https://t.co/pP9fM\u2026
Ent\xe9rate de todos los detalles de #Sense8Season2, y del especial de #navidad que se viene \U0001f385\n#Sense8 #series #netflix https://t.co/4Jp9elAkSJ
Netflix trekt stekker uit serie Marco Polo https://t.co/PUuusukhZ4 via @Easy_Branches @auteurjanjansen #easybranches https://t.co/IeZwv3gNNo
Netflix trekt stekker uit serie Marco Polo https://t.co/PUuusukhZ4 via @Easy_Branches @auteurjanjansen #easybranches https://t.co/IeZwv3gNNo
Love Actually

In [99]:
# load Netflix content: query every column of the tvshow table and keep
# all returned rows in a list (the rows are indexed by position later on)
cur.execute("SELECT * FROM tvshow")
netflix = list(cur)

In [100]:
# get a unique set of titles to look for matches.  To minimize frivolous
# matches, only keep titles longer than 5 characters.
# NOTE(review): the title is read from column index 6 of each tvshow row;
# this depends on the column order returned by SELECT * -- confirm against
# the table schema.
netflix_titles = set(show[6] for show in netflix if len(show[6]) > 5)

In [101]:
# re-display the first extracted tweet to recall the record layout
tweets[0]

{'datetime': datetime.datetime(2016, 12, 17, 21, 13, 29, tzinfo=tzutc()),
 'id': u'810231467683016704',
 'text': 'The OA on Netflix goes crazy, as does "3%"',
 'type': 'tweet',
 'user': u'Alivingtribunal'}

In [102]:
# look through all of the tweets to find matches with the Netflix titles.
# A tweet matches a title when its text mentions 'netflix' and contains the
# title; both comparisons are case-insensitive.  Every tweet gets a
# 'Netflix_match' list (possibly empty) and tweets with at least one match
# are collected in netflix_tweets.

# lower-case each title once instead of once per tweet
lowered_titles = [(title, title.lower()) for title in netflix_titles]

netflix_tweets = []
for line in tweets:
    line['Netflix_match'] = []
    text_lower = line['text'].lower()
    # a tweet that never mentions Netflix cannot match any title
    if 'netflix' not in text_lower:
        continue
    for title, title_lower in lowered_titles:
        if title_lower in text_lower:
            line['Netflix_match'].append(title)
    # add the tweet once, no matter how many titles it matched
    if line['Netflix_match']:
        netflix_tweets.append(line)

In [103]:
# display the first matched tweet, including its 'Netflix_match' list
netflix_tweets[0]

{'Netflix_match': ['The Get Down'],
 'datetime': datetime.datetime(2016, 12, 17, 21, 14, 16, tzinfo=tzutc()),
 'id': u'810231661354958849',
 'text': 'Forget The Get Down, if you want to learn the true story, you need to watch Hip Hop Evolution on Netflix, excellent documentary series',
 'type': 'tweet',
 'user': u'mildoo72'}

In [104]:
# rebuild netflix_tweets so it holds exactly the tweets whose
# 'Netflix_match' list is non-empty
netflix_tweets = [tweet for tweet in tweets
                  if len(tweet['Netflix_match']) > 0]

In [105]:
# when a tweet matched several titles, keep only the first entry of its
# 'Netflix_match' list as the tweet's 'Title'
for matched_tweet in netflix_tweets:
    first_title = matched_tweet['Netflix_match'][0]
    matched_tweet['Title'] = first_title

In [106]:
# how many tweets remain?
print "Number of tweets originally: ", len(tweets)
print "Number of Netflix tweets: ", len(netflix_tweets)

Number of tweets originally:  124672
Number of Netflix tweets:  83118


In [107]:
# spot-check the first five matched tweets: the matched titles, then the
# tweet text, then a blank separator
sample_matches = netflix_tweets[:5]
for matched_tweet in sample_matches:
    matched_titles = matched_tweet['Netflix_match']
    tweet_text = matched_tweet['text']
    print(matched_titles)
    print(tweet_text)
    print("\n")

['The Get Down']
Forget The Get Down, if you want to learn the true story, you need to watch Hip Hop Evolution on Netflix, excellent documentary series


['The Crown']
RT @PelisResumidas: The Crown: Serie de Netflix que nos muestra que la reina Isabel II no siempre fue una vieja gru\xf1ona. https://t.co/pP9fM\u2026


['Sense8']
Ent\xe9rate de todos los detalles de #Sense8Season2, y del especial de #navidad que se viene \U0001f385\n#Sense8 #series #netflix https://t.co/4Jp9elAkSJ


['Marco Polo']
Netflix trekt stekker uit serie Marco Polo https://t.co/PUuusukhZ4 via @Easy_Branches @auteurjanjansen #easybranches https://t.co/IeZwv3gNNo


['Marco Polo']
Netflix trekt stekker uit serie Marco Polo https://t.co/PUuusukhZ4 via @Easy_Branches @auteurjanjansen #easybranches https://t.co/IeZwv3gNNo




In [108]:
# write Netflix tweets to a CSV file, using the keys of the first matched
# tweet as the column names.
# The file is opened in binary mode because the Python 2 csv module
# requires it; in text mode every row gets an extra carriage return on
# platforms that translate line endings (e.g. Windows).
with open('Netflix_tweets_2.csv', 'wb') as f:
    writer = csv.DictWriter(f, netflix_tweets[0].keys())
    writer.writeheader()
    writer.writerows(netflix_tweets)