## Comparing Convention Language

In this workbook we'll set ourselves up to work with the convention data we've scraped.

In [None]:
import sqlite3
import nltk
import random
import numpy as np
from collections import Counter, defaultdict
from string import punctuation

In [None]:
from nltk.corpus import stopwords

In [None]:
# English stop-word list from the NLTK stopwords corpus.
sw = stopwords.words("english")

In [None]:
# Open the SQLite database of convention speeches and create a cursor
# through which the queries below are executed.
convention_db = sqlite3.connect("ConventionSpeeches.db")
convention_cur = convention_db.cursor()

We'll convert `punctuation` to a set and add the typographic (curly) apostrophe `’` that appears in the data, since `string.punctuation` only contains the straight one.

In [None]:
# Rebind `punctuation` from the string of punctuation characters to a set of
# them, including the typographic apostrophe that appears in the data.
punctuation = {*punctuation, "’"}

Let's read in the convention data from the DB so that we can work with it. 

In [None]:
# Select the text and party of every speech whose speaker was identified.
# The speaker value is bound as a parameter instead of being written as
# "Unknown": in SQL, double quotes denote identifiers, and SQLite only treats
# such a token as a string through a legacy fallback that can be disabled.
# NOTE: the result is a cursor, which is consumed by iterating over it; re-run
# this cell before re-running a cell that loops over query_results.
query_results = convention_cur.execute(
                            '''
                                SELECT text, party
                                FROM conventions
                                WHERE speaker != ?
                            ''',
                            ("Unknown",))

And now we'll store all the text from every identified speaker in a dictionary that has just two keys, "Democratic" and "Republican".

In [None]:
# Maps each party to one long string holding all of its cleaned tokens,
# separated by single spaces.
convention_data = defaultdict(str)

for text, party in query_results :
    # Strip every punctuation character before tokenizing on whitespace.
    cleaned = "".join(filter(lambda ch: ch not in punctuation, text))

    # Keep only the purely alphabetic tokens, lowercased.
    text = list(map(str.lower, filter(str.isalpha, cleaned.split())))

    # The trailing space keeps consecutive speeches from running together.
    convention_data[party] = convention_data[party] + " ".join(text) + " "

In [None]:
# Twenty most frequent tokens in the combined Democratic text.
dem_freq = nltk.FreqDist(convention_data['Democratic'].split())
dem_freq.most_common(20)

In [None]:
# Twenty most frequent tokens in the combined Republican text.
rep_freq = nltk.FreqDist(convention_data['Republican'].split())
rep_freq.most_common(20)

In [None]:
# Rate at which Democratic speakers say "trump": occurrences divided by the
# measured number of Democratic tokens rather than a hard-coded estimate.
# Falls back to 0.0 when there is no Democratic text at all.
dem_tokens = convention_data['Democratic'].split()
a = nltk.FreqDist(dem_tokens)['trump'] / len(dem_tokens) if dem_tokens else 0.0

In [None]:
# Number of times "obama" occurs in the Democratic text.
convention_data['Democratic'].split().count('obama')

In [None]:
# Number of times "trump" occurs in the Republican text.
convention_data['Republican'].split().count('trump')

In [None]:
# Number of times "obama" occurs in the Republican text.
convention_data['Republican'].split().count('obama')

In [None]:
# Rate at which Republican speakers say "biden": occurrences divided by the
# measured number of Republican tokens rather than a hard-coded estimate.
# Falls back to 0.0 when there is no Republican text at all.
rep_tokens = convention_data['Republican'].split()
b = nltk.FreqDist(rep_tokens)['biden'] / len(rep_tokens) if rep_tokens else 0.0

In [None]:
# Number of times "biden" occurs in the Democratic text.
convention_data['Democratic'].split().count('biden')

In [None]:
# Ratio of b (the Republican "biden" rate) to a (the Democratic "trump" rate).
b/a

In [None]:
# Tokenize each party's combined text on whitespace.
d_split, r_split = (
    convention_data[party_key].split()
    for party_key in ('Democratic', 'Republican')
)

In [None]:
# Total number of tokens spoken by each party.
dem_len, rep_len = map(len, (d_split, r_split))

In [None]:
# Vocabulary: every distinct token used by either party.
all_words = set(d_split).union(r_split)

In [None]:
# Per-word metrics store: each word maps to a list that later cells fill in.
metrics = defaultdict(list)

In [None]:
# list will have D count, R count, D frac, R frac, D ratio, R ratio
# This cell fills in the first four entries; the two ratios are added later.

# Tally each party's tokens once, instead of rescanning both token lists
# several times for every word in the vocabulary.
dem_counts = Counter(d_split)
rep_counts = Counter(r_split)

for word in all_words :
    d_count = dem_counts[word]
    r_count = rep_counts[word]
    # Assign rather than append so re-running the cell does not grow the lists.
    # A party with no tokens gets a fraction of 0.0 instead of dividing by zero.
    metrics[word] = [
        d_count,
        r_count,
        d_count / dem_len if dem_len else 0.0,
        r_count / rep_len if rep_len else 0.0,
    ]
    

In [None]:
# Complete each word's list with the two cross-party ratios:
#   index 4 = D frac / R frac, index 5 = R frac / D frac.
# Both are None when the word is missing from either party's text.
for nums in metrics.values() :
    # Drop ratios left by an earlier run so re-running the cell is idempotent.
    del nums[4:]
    dem_frac, rep_frac = nums[2], nums[3]
    # Test each fraction directly rather than their product.
    if dem_frac > 0 and rep_frac > 0 :
        nums.extend([dem_frac / rep_frac, rep_frac / dem_frac])
    else :
        nums.extend([None, None])


In [None]:
# Spot-check the metrics recorded for a single word.
metrics["eagle"]