In this notebook, we'll try to figure out which words a candidate uses much more than the other candidates do. 

What are our criteria? To start with, let's say the word has to be in the top 100 (excluding stopwords) for a candidate and has to represent a "much" higher percentage of their words than it does for the other candidates. To make life easy, we'll just look for those words that are *not* in the top 100 for "most" of the other candidates. We'll also add a filter that the candidate must use the word 5 times. 

In [None]:
import sqlite3
import re 
from collections import Counter, defaultdict
import nltk

In [None]:
def clean_candidate_text(text_list, stopwords=None):
    """Turn a list of raw text strings into one flat list of clean words.

    Each text is split on whitespace and lower-cased. A token is kept only
    if it is purely alphabetic (so tokens carrying punctuation or digits,
    such as "fox," or "42", are dropped) and is not a stop word.

    Args:
        text_list: iterable of raw text strings. Empty or ``None`` entries
            are skipped.
        stopwords: optional iterable of words to exclude. When omitted, the
            notebook-level ``sw`` stop word list is used.

    Returns:
        A list of the surviving words, in their original order.
    """
    if stopwords is None:
        # Looked up only when needed, so the function also works when the
        # caller supplies its own stop words before ``sw`` exists.
        stopwords = sw

    # Build the set once per call: membership is tested for every token.
    stop_set = set(stopwords)

    words = []
    for text in text_list:
        if not text:
            # Nothing to split (empty string or a missing/NULL text).
            continue
        lowered = (token.lower() for token in text.split())
        words.extend(
            token for token in lowered
            if token.isalpha() and token not in stop_set
        )

    return words

def count_term(term, words, verbose=False):
    """Return the number of times ``term`` appears in ``words``.

    Args:
        term: the single-word term to look for.
        words: the collection of words to search.
        verbose: if true and the term is present, also print a one-line
            summary of the count relative to the total number of words.

    Returns:
        The count of ``term``, or 0 when it does not appear.
    """
    tallies = Counter(words)
    n_words = len(words)

    # Guard clause: an absent term is reported as zero and never printed.
    if term not in tallies:
        return 0

    occurrences = tallies[term]
    if verbose:
        summary = "Out of {} words, {} were '{}'."
        print(summary.format(n_words, occurrences, term))
    return occurrences

In [None]:
# English stop word list from NLTK; clean_candidate_text() drops any word
# found in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to have been
# downloaded already (e.g. nltk.download("stopwords")) -- confirm on a fresh
# environment.
sw = nltk.corpus.stopwords.words("english")

# Folder containing the SQLite database of scraped candidate websites. This
# is an absolute path on one particular machine, so it will need editing
# before the notebook runs anywhere else.
db_path = "C://users//jchan//dropbox//teaching//2019//textmining//text-mining-2019//web-scraping//candidate//"
db = sqlite3.connect(db_path + "candidate_websites.db") # feel free to change this to something you like. 
# Cursor used by the next cell to query the site text.
cur = db.cursor()

In [None]:
# Pull every distinct (base_url, text) pair and group the texts by base_url;
# each base_url is treated as one candidate.
rows = cur.execute("SELECT DISTINCT base_url, text FROM site_text")

# defaultdict(list) starts every unseen candidate with an empty list, so we
# can append without first checking whether the key exists.
candi_text = defaultdict(list)

for candidate, text in rows :
    candi_text[candidate].append(text)

# All the text is in memory now, so the connection is no longer needed.
db.close()

Now we'll get clean words.

In [None]:
# candidate -> flat list of that candidate's cleaned words (lower-cased,
# alphabetic only, stop words removed).
candi_words = defaultdict(
    list,
    {
        name: clean_candidate_text(raw_texts)
        for name, raw_texts in candi_text.items()
    },
)

Now let's go candidate by candidate, find each one's top 100 words, and see whether those words are also in the top 100 for the other candidates. 

In [None]:
# Criteria for calling a word "unique" to a candidate.
TOP_N = 100             # size of each candidate's most-common-word list
MIN_USES = 5            # the candidate must use the word at least this often
# NOTE(review): 9 presumably stands in for "most" of the other candidates;
# confirm it against the number of candidates actually in the database.
MIN_OTHERS_MISSING = 9  # word must be absent from this many other top lists

# candidate -> word -> number of other candidates whose top list lacks it
candidate_unique_words = defaultdict(lambda: defaultdict(int)) #going to have several levels.

# Rank every candidate's words once up front. Doing this inside the pairwise
# comparison would rebuild the same Counter for every pair of candidates.
top_ranked = {
    candi: Counter(words).most_common(TOP_N)
    for candi, words in candi_words.items()
}
top_words = {
    candi: {word for word, _ in ranked}
    for candi, ranked in top_ranked.items()
}

for candidate, ranked in top_ranked.items() :
    for word, cnt in ranked :
        if cnt < MIN_USES :
            # most_common() is sorted by descending count, so every
            # remaining word is also below the usage threshold.
            break

        # How many *other* candidates do not have this word in their top list?
        num_missing = sum(
            1
            for candi_2, top_100_2 in top_words.items()
            if candi_2 != candidate and word not in top_100_2
        )

        if num_missing >= MIN_OTHERS_MISSING :
            candidate_unique_words[candidate][word] = num_missing

Now let's look at the results.

In [None]:
# Report every (candidate, word) pair that passed the uniqueness filter.
# Only the words matter here; the stored counts are not printed.
for candi, unique_words in candidate_unique_words.items() :
    for word in unique_words :
        print("{} uniquely uses the word '{}'.".format(candi,word))