### munging/parsing/processing json data

Other ideas:  
use date of debate as another feature  

In [4]:
import os
import glob
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
LOCAL_DATA_PATH = 'C:\Users\JoAnna\political_history\data'
LOCAL_SAVE_PATH = 'C:\Users\JoAnna\political_history\processed_data'

#### general processing for all approaches

In [6]:
#import all json files, concatenate into pandas dataframe
all_files = glob.glob(LOCAL_DATA_PATH + '/*.json')

df = pd.concat((pd.read_json(f, orient='index') for f in all_files))

#concatenating resulted in non-unique index, re-index
#df.index.is_unique

df['index'] = np.arange(len(df))
df = df.set_index('index')

df.index.is_unique

True

In [7]:
#from speaker: strip spaces and special characters, make lowercase
df.speaker = df.speaker.str.strip().str.lower().str.replace(' ','').str.replace('.', '').str.replace(':','')

#get list of unique values for speaker
unique_speaker = pd.unique(df.speaker.ravel())
#print len(unique_speaker)
#print unique_speaker

In [8]:
#map list of candidates to political party
speaker_to_party = {'trump': 'Republican',
                    'clinton': 'Democrat',
                    'pence': 'Republican',
                    'kaine': 'Democrat',
                    'republicanpresidentialnomineewmittromney': 'Republican',
                    'govromney': 'Republican',
                    'thepresident\u2014': 'Republican',
                    'govromney\u2014': 'Republican',
                    'govromney\u2014\u2014': 'Republican',
                    'thepresident': 'Democrat',
                    'representativepaulryan': 'Republican',
                    'ryan': 'Republican',
                    'vicepresidentjosephbiden': 'Democrat',
                    'biden': 'Democrat',
                    'mccain': 'Republican',
                    'obama': 'Democrat',
                    'palin': 'Republican',
                    'presidentbush': 'Republican',
                    'senatorjohnfkerry': 'Democrat',
                    'senatorkerry': 'Democrat',
                    'cheney': 'Republican',
                    'edwards': 'Democrat',
                    'bush': 'Republican',
                    'gore': 'Democrat',
                    'lieberman': 'Democrat'}

#make new column in dataframe for affiliation
df['affiliation'] = df['speaker']
df['affiliation'].replace(speaker_to_party, inplace=True)

In [9]:
#create two new dataframes, one for republicans, one for democrats
republican_df = df.loc[df['affiliation'] == 'Republican']
democrat_df = df.loc[df['affiliation'] == 'Democrat']

#create combined data frame - better for train/test split (sort of...)
candidates_df = df.loc[df['affiliation'].isin(['Republican','Democrat'])]

In [6]:
#export new dataframe for others to use
directory_name = LOCAL_SAVE_PATH
base_filename = 'candidates'
suffix = '.pkl'
save_path = os.path.join(directory_name, base_filename + suffix)
candidates_df.to_pickle(save_path)

In [7]:
#read pickled data
#test_df = pd.read_pickle('C:\Users\JoAnna\political_history\processed_data\candidates.pkl')
#print test_df.head()

In [10]:
#label data - 0 for democrat, 1 for republican
candidates_df['affiliation'].replace({'Democrat':0, 'Republican':1}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


#### processing the text column for bag_of_words

In [11]:
#make new lists just with labels and text
labels = candidates_df['affiliation']
paragraph_text = candidates_df['text']

In [10]:
#define function to tokenize and stem
def clean_text(text):
    """
    Removes punctuation, converts all characters to lowercase, removes stop words, stems
    
    Args:
        a single string of text 
        
    Returns:
        processed text string
        
    """
    tokens = RegexpTokenizer(r'\w+')
    stops = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    
    token = tokens.tokenize(text)
    filtered_words = [word for word in token if word not in stops]
    stems = [stemmer.stem(t) for t in filtered_words]
    return( " ".join(stems)) 

num_paragraphs= len(paragraph_text)
cleaned_paragraphs = []
#loop over paragraph_text to clean
for paragraph in paragraph_text:
    cleaned_text = clean_text(paragraph)
    cleaned_paragraphs.append(cleaned_text)

#print len(cleaned_paragraphs)
#print cleaned_paragraphs[6]

In [11]:
#export labels and cleaned paragraphs
os.chdir(LOCAL_SAVE_PATH)
pickle.dump(labels, open("bow_labels.pkl", "w"))
pickle.dump(cleaned_paragraphs, open("bow_processed_text.pkl", "w"))

In [12]:
#define new function to tokenize and stem, keep stopwords
def clean_text_nostop(text):
    """
    Removes punctuation, converts all characters to lowercase, stems
    
    Args:
        a single string of text 
        
    Returns:
        processed text string
        
    """
    tokens = RegexpTokenizer(r'\w+')
    #stops = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    
    token = tokens.tokenize(text)
    filtered_words = [word for word in token]
    stems = [stemmer.stem(t) for t in filtered_words]
    return( " ".join(stems)) 

num_paragraphs= len(paragraph_text)
cleaned_paragraphs_nostop = []
#loop over paragraph_text to clean
for paragraph in paragraph_text:
    cleaned_text_nostop = clean_text_nostop(paragraph)
    cleaned_paragraphs_nostop.append(cleaned_text_nostop)


In [13]:
#export labels and cleaned paragraphs, no stop words
os.chdir(LOCAL_SAVE_PATH)
#pickle.dump(labels, open("bow_labels.pkl", "w"))
pickle.dump(cleaned_paragraphs_nostop, open("bow_processed_text_nostop.pkl", "w"))

#### Tokenize, Stem text and make each paragraph an ordered list of words

In [12]:
def clean_text_list(text):
    """
    Removes punctuation, converts all characters to lowercase, stems
    
    Args:
        a single string of text 
        
    Returns:
        processed text string
        
    """
    tokens = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer('english')
    
    token = tokens.tokenize(text)
    filtered_words = [word for word in token]
    stems = [stemmer.stem(t) for t in filtered_words]
    #stemmed_text = " ".join(stems)
    return stems

num_paragraphs= len(paragraph_text)
cleaned_paragraphs_list = []
#loop over paragraph_text to clean
for paragraph in paragraph_text:
    cleaned_text_list = clean_text_list(paragraph)
    cleaned_paragraphs_list.append(cleaned_text_list)


In [14]:
#print cleaned_paragraphs_list[123]

[u'let', u'me', u'mention', u'anoth', u'regul', u'in', u'dodd', u'frank', u'you', u'say', u'we', u'were', u'give', u'mortgag', u'to', u'peopl', u'who', u'weren', u't', u'qualifi', u'that', u's', u'exact', u'right', u'it', u's', u'one', u'of', u'the', u'reason', u'for', u'the', u'great', u'financi', u'calam', u'we', u'had', u'and', u'so', u'dodd', u'frank', u'correct', u'say', u'we', u'need', u'to', u'have', u'qualifi', u'mortgag', u'and', u'if', u'you', u'give', u'a', u'mortgag', u'that', u's', u'not', u'qualifi', u'there', u'are', u'big', u'penalti', u'except', u'they', u'didn', u't', u'ever', u'go', u'on', u'to', u'defin', u'what', u'a', u'qualifi', u'mortgag', u'was']


In [15]:
#export labels and cleaned paragraphs, no stop words, text in a list
os.chdir(LOCAL_SAVE_PATH)
#pickle.dump(labels, open("bow_labels.pkl", "w"))
pickle.dump(cleaned_paragraphs_list, open("bow_processed_text_list.pkl", "w"))

#### Just export text without tokenizing and stemming

#### Processing for word2vec

In [19]:
#paragraph processing for word2vec
#https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors

#start with paragraph text

def paragraph_to_wordlist(paragraph, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    paragraph_clean = re.sub("[^a-zA-Z]"," ", paragraph)
    words = paragraph_clean.lower().split()
    #Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return(words)

In [20]:
#split paragraph into sentences, then sentences into words

import nltk.data
nltk.download()

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Define a function to split a review into parsed sentences
def paragraph_to_sentences( paragraph, tokenizer, remove_stopwords=False ):
    raw_sentences = tokenizer.tokenize(paragraph.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(paragraph_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [21]:
sentences = []  # Initialize an empty list of sentences

print "Parsing sentences from training set"
for paragraph in paragraph_text:
    sentences += paragraph_to_sentences(paragraph, tokenizer)
    
print len(sentences)

Parsing sentences from training set
18711


In [22]:
sentences[1]

[u'well', u'good']

In [24]:
#this processing does not keep the same index for the sentences and labels from the paragraph. Fix this, or don't use it.