## Exploring Congressional Candidate Tweets

Let's classify congressional candidates' tweets by party using a Naive Bayes (NB) classifier.

In [None]:
import nltk
import random
from string import punctuation
from pprint import pprint
import sqlite3
from collections import defaultdict, Counter

In [None]:
# Open the congressional-candidate SQLite database and keep a cursor around
# for the queries in the cells below.
# NOTE(review): this is an absolute path on one particular machine; the
# notebook will only run elsewhere after the location is edited.
db_dir = "C:\\Users\\jchan\\Dropbox\\Teaching\\AppliedDataAnalytics\\ada-master\\congressional-candidates\\"
db = sqlite3.connect(db_dir + "congressional_data.db")
cur = db.cursor()

In [None]:
# Fetch (candidate, party, tweet text) rows; the results are read from `cur`
# in the next cell.
# NOTE(review): the join key is `district` alone. If candidate_data holds more
# than one row per district, every tweet would be paired with each of those
# rows' parties -- confirm against the schema.
tweet_party_query = """SELECT t.candidate, c.party, t.tweet_text 
               FROM tweets t
               INNER JOIN candidate_data c on c.district=t.district"""
cur.execute(tweet_party_query)

In [None]:
# Group the query results as d[candidate][party] -> list of tweet texts.
d = defaultdict(lambda: defaultdict(list))

rows = cur.fetchall()
for cand, party, text in rows :
    d[cand][party].append(text)

# One row per tweet, so the row count is the tweet count.
num_tweets = len(rows)
print("We have {} tweets.".format(num_tweets))
    

In [None]:
def tweet_features(the_tweet):
    """ Input: The text of a tweet (a string)
        Output: A dictionary with one key per distinct token in the
                tweet, every value being True, e.g.
                    {"example": True,
                     "word":    True}

        Cleaning steps, in order:
          * delete every character in string.punctuation, plus the curly
            double quotes, but keep "#" and "@" so hashtags and mentions
            stay recognisable;
          * lower-case what is left;
          * split on whitespace.
        """
    dropped_chars = (set(punctuation) - {"#", "@"}) | {"“", "”"}

    # Filter-then-join idiom from
    # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
    cleaned = "".join(ch.lower() for ch in the_tweet if ch not in dropped_chars)

    # Keys follow the order in which tokens first appear in the tweet.
    return dict.fromkeys(cleaned.split(), True)

In [None]:
# build dataset for NB: one (feature dict, party label) pair per tweet
MAX_FEATURESETS = 300000                       # stop once the count exceeds this
MAJOR_PARTIES = ("Democratic", "Republican")   # only these labels are modelled


def _labeled_tweets(tweets_by_candidate, parties) :
    """Yield (tweet, party) for every stored tweet whose party is in `parties`."""
    for tweets_by_party in tweets_by_candidate.values() :
        for party, tweets in tweets_by_party.items() :
            if party in parties :
                for tweet in tweets :
                    yield tweet, party


featuresets = []
counter = 0
for tweet, party in _labeled_tweets(d, MAJOR_PARTIES) :
    # sqlite3 returns bytes or str depending on how the column was stored and
    # on the connection's text_factory; only bytes need decoding.
    text = tweet.decode("utf-8") if isinstance(tweet, bytes) else tweet
    featuresets.append((tweet_features(text), party))

    counter += 1
    if counter > MAX_FEATURESETS :
        # The iteration is a single flat loop, so this ends the whole build.
        # (Inside the original triple-nested loops, `break` only left the
        # innermost one and the build carried on with the next group.)
        break
        

In [None]:
# Hold out 10% of the tweets for testing.
# featuresets is built candidate by candidate, so slicing it directly would
# make the test set "the first few candidates". Shuffle a copy first, with a
# fixed seed so the split is the same on every run.
SPLIT_SEED = 42
shuffled_featuresets = list(featuresets)
random.Random(SPLIT_SEED).shuffle(shuffled_featuresets)

test_set_size = int(0.1*len(shuffled_featuresets))

train_set, test_set = (shuffled_featuresets[test_set_size:],
                       shuffled_featuresets[:test_set_size])
classifier = nltk.NaiveBayesClassifier.train(train_set)

How'd we do?

In [None]:
# Share of held-out tweets whose party label the classifier gets right.
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)

In [None]:
# List the 40 features the trained classifier ranks as most informative.
classifier.show_most_informative_features(n=40)

---

In [None]:
# Now I'll try to make a better classifier, dropping @ and stopwords.

# NLTK's English stopword list (the "stopwords" corpus must be available locally).
_nltk_stopwords = nltk.corpus.stopwords.words("english")
sw = set(_nltk_stopwords)

# The feature functions delete punctuation *before* the stopword check, so a
# contraction such as "don't" reaches the check as "dont". Add the
# punctuation-free spelling of every stopword so those tokens are caught too.
_stripped_stopwords = ("".join(ch for ch in w if ch not in punctuation)
                       for w in _nltk_stopwords)
sw.update(w for w in _stripped_stopwords if w)

In [None]:
def tweet_features_2(the_tweet, stopwords=None, drop_hashtags=True) :
    """ Input: The text of a tweet (a string)
               stopwords: collection of tokens to ignore; when None
                          (the default) the notebook-level set `sw`
                          is used.
               drop_hashtags: when True (the default) tokens containing
                          "#" are discarded as well.
        Output: A dictionary with one key per remaining token, every
                value being True, e.g.
                    {"example": True,
                     "word":    True}

        Cleaning steps, in order:
          * delete every character in string.punctuation, plus the curly
            double quotes, but keep "#" and "@" so mentions and hashtags
            can still be recognised;
          * lower-case what is left and split on whitespace;
          * discard tokens containing "@", tokens found in `stopwords`,
            and (if drop_hashtags) tokens containing "#".
        """
    if stopwords is None :
        # Looked up at call time so the default tracks the notebook's set.
        stopwords = sw

    exclude = (set(punctuation) - {"#", "@"}) | {"“", "”"}

    # Found this at https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
    the_tweet = ''.join(ch.lower() for ch in the_tweet if ch not in exclude)

    ret_val = {}

    for word in the_tweet.split() :
        if "@" in word or word in stopwords :
            continue
        if drop_hashtags and "#" in word :
            continue
        ret_val[word] = True

    return ret_val

In [None]:
# Rebuild the dataset with the stricter feature extractor (no size cap here).
featuresets = []
for candidate in d :
    for party, tweets in d[candidate].items() :
        # Only the two major parties are modelled; skip other labels up front
        # instead of re-checking the party for every tweet.
        if party not in ("Democratic", "Republican") :
            continue
        for tweet in tweets :
            # sqlite3 returns bytes or str depending on how the column was
            # stored and on the connection's text_factory; only bytes need
            # decoding.
            text = tweet.decode("utf-8") if isinstance(tweet, bytes) else tweet
            featuresets.append((tweet_features_2(text), party))

# Number of labelled tweets collected.
counter = len(featuresets)


In [None]:
# Hold out 10% of the tweets for testing.
# featuresets is built candidate by candidate, so slicing it directly would
# make the test set "the first few candidates". Shuffle a copy first, with a
# fixed seed so the split is the same on every run.
SPLIT_SEED = 42
shuffled_featuresets = list(featuresets)
random.Random(SPLIT_SEED).shuffle(shuffled_featuresets)

test_set_size = int(0.1*len(shuffled_featuresets))

train_set, test_set = (shuffled_featuresets[test_set_size:],
                       shuffled_featuresets[:test_set_size])
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Share of held-out tweets whose party label the classifier gets right.
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)

In [None]:
# How many feature sets carry each party label (class balance check).
Counter(label for _features, label in featuresets)

In [None]:
# List the 40 features the trained classifier ranks as most informative.
classifier.show_most_informative_features(n=40)