### munging/parsing/processing json data

Other ideas:  
use date of debate as another feature  

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
LOCAL_DATA_PATH = '/Users/willsankey/political_history/data'
LOCAL_SAVE_PATH = '/Users/willsankey/political_history/processed_data'

#### general processing for all approaches

In [2]:
#import all json files, concatenate into pandas dataframe
all_files = glob.glob(LOCAL_DATA_PATH + '/*.json')

df = pd.concat((pd.read_json(f, orient='index') for f in all_files))

#concatenating resulted in non-unique index, re-index
#df.index.is_unique

df['index'] = np.arange(len(df))
df = df.set_index('index')

df.index.is_unique

True

In [4]:
df.head()

Unnamed: 0_level_0,date,speaker,text,title
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2012-10-03,National Economy,This debate and the next three—two Presidentia...,"Barack Obama: Presidential Debate in Denver, C..."
1,2012-10-03,National Economy,Thousands of people offered suggestions on seg...,"Barack Obama: Presidential Debate in Denver, C..."
2,2012-10-03,Laughter,But we all know that we've still got a lot of ...,"Barack Obama: Presidential Debate in Denver, C..."
3,2012-10-03,Gov. Romney.,You bet.,"Barack Obama: Presidential Debate in Denver, C..."
4,2012-10-03,Deficit and National Debt/Spending Cuts,,"Barack Obama: Presidential Debate in Denver, C..."


In [5]:
#from speaker: strip spaces and special characters, make lowercase
df.speaker = df.speaker.str.strip().str.lower().str.replace(' ','').str.replace('.', '').str.replace(':','')

#get list of unique values for speaker
unique_speaker = pd.unique(df.speaker.ravel())
#print len(unique_speaker)
#print unique_speaker

In [6]:
#map list of candidates to political party
speaker_to_party = {'trump': 'Republican',
                    'clinton': 'Democrat',
                    'pence': 'Republican',
                    'kaine': 'Democrat',
                    'republicanpresidentialnomineewmittromney': 'Republican',
                    'govromney': 'Republican',
                    'thepresident\u2014': 'Republican',
                    'govromney\u2014': 'Republican',
                    'govromney\u2014\u2014': 'Republican',
                    'thepresident': 'Democrat',
                    'representativepaulryan': 'Republican',
                    'ryan': 'Republican',
                    'vicepresidentjosephbiden': 'Democrat',
                    'biden': 'Democrat',
                    'mccain': 'Republican',
                    'obama': 'Democrat',
                    'palin': 'Republican',
                    'presidentbush': 'Republican',
                    'senatorjohnfkerry': 'Democrat',
                    'senatorkerry': 'Democrat',
                    'cheney': 'Republican',
                    'edwards': 'Democrat',
                    'bush': 'Republican',
                    'gore': 'Democrat',
                    'lieberman': 'Democrat'}

#make new column in dataframe for affiliation
df['affiliation'] = df['speaker']
df['affiliation'].replace(speaker_to_party, inplace=True)

In [7]:
#create two new dataframes, one for republicans, one for democrats
republican_df = df.loc[df['affiliation'] == 'Republican']
democrat_df = df.loc[df['affiliation'] == 'Democrat']

#create combined data frame - better for train/test split (sort of...)
candidates_df = df.loc[df['affiliation'].isin(['Republican','Democrat'])]

In [8]:
#export new dataframe for others to use
directory_name = LOCAL_SAVE_PATH
base_filename = 'candidates'
suffix = '.pkl'
save_path = os.path.join(directory_name, base_filename + suffix)
candidates_df.to_pickle(save_path)

In [9]:
#read pickled data
#test_df = pd.read_pickle('C:\Users\JoAnna\political_history\processed_data\candidates.pkl')
#print test_df.head()

In [10]:
#label data - 0 for democrat, 1 for republican
candidates_df['affiliation'].replace({'Democrat':0, 'Republican':1}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


#### processing the text column for bigrams

In [11]:
#make new lists just with labels and text
labels = candidates_df['affiliation']
paragraph_text = candidates_df['text']

In [12]:
#define function to tokenize and stem
def clean_text(text):
    """
    Removes punctuation, converts all characters to lowercase, removes stop words, stems
    
    Args:
        a single string of text 
        
    Returns:
        processed text string
        
    """
    tokens = RegexpTokenizer(r'\w+')
    stops = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    
    token = tokens.tokenize(text)
    filtered_words = [word for word in token if word not in stops]
    stems = [stemmer.stem(t) for t in filtered_words]
    return( " ".join(stems)) 

num_paragraphs= len(paragraph_text)
cleaned_paragraphs = []
#loop over paragraph_text to clean
for paragraph in paragraph_text:
    cleaned_text = clean_text(paragraph)
    cleaned_paragraphs.append(cleaned_text)

#print len(cleaned_paragraphs)
#print cleaned_paragraphs[6]

In [13]:
#export labels and cleaned paragraphs
os.chdir(LOCAL_SAVE_PATH)
pickle.dump(labels, open("bow_labels.pkl", "w"))
pickle.dump(cleaned_paragraphs, open("bow_processed_text.pkl", "w"))