# Setup

In [833]:
import pandas as pd
import numpy as np
import regex as re
from fuzzywuzzy import fuzz

In [834]:
def cleaner(text):
    clean_text = re.sub(r'[^\w]', ' ', text)
    clean_text = re.sub(r'[0-9]', '' , clean_text).strip()
    return clean_text


def party_extract(text):
    start = text.index('(') + 1
    end = text.index(')')
    party = text[start:end][0]
    return(party)


def trimmer(text):
    items = text.split(" ")
#     print(items)
    items.pop(-1);
    items.pop(-1);
#     print(items)
    trimmed_text = ' '.join(word for word in items)
    return trimmed_text

    
def contains(main, item, threshold=85):
    if fuzz.partial_ratio(main, item)>threshold:
        return True
    else:
        return False
        

# Cleaning

In [835]:
govs = pd.read_csv('./data/governors_races.csv')
govs.drop(['rival', 'inc_results', 'inc_party.1', 'inc_R'], axis = 1, inplace = True)

sens = pd.read_csv('./data/senate_races.csv')
sens.drop(['winner', 'inc_R'], axis = 1, inplace=True)

In [836]:
govs.head()

Unnamed: 0,race_id,candidates,inc,inc_party,state,abbrev,year
0,AL_gov_1978,"['Fob James (Democratic) 72.6', 'H. Guy Hunt (...",George Wallace\r\n,D,Alabama,AL,1978
1,AL_gov_1982,"['George Wallace (Democratic) 57.6', 'Emory Fo...",Fob James\r\n,D,Alabama,AL,1982
2,AL_gov_1986,"['H. Guy Hunt (Republican) 56.4', 'Bill Baxley...",George Wallace\r\n,D,Alabama,AL,1986
3,AL_gov_1990,"['Paul Hubbert (Democratic) 47.9', '\n']",H. Guy Hunt\r\n,R,Alabama,AL,1990
4,AL_gov_1994,"['Fob James (Republican) 50.3', '\n']",Jim Folsom\r\n,D,Alabama,AL,1994


In [837]:
sens.head()

Unnamed: 0,race_id,candidates,inc,inc_party,state,abbrev,year
0,NY_sen_1976,"['√ Daniel P. Moynihan (Democratic) 54.2', 'Ja...",James L. Buckley\r,I,New_York,NY,1976
1,MN_sen_1996,"['√ Paul Wellstone (Democratic (DFL)) 50.3', '...",Paul Wellstone\r\n,I,Minnesota,MN,1996
2,AL_sen_1978,"['√ Howell Heflin (Democratic) 94.0', 'Jerome ...",John Sparkman\r\n,D,Alabama,AL,1978
3,AL_sen_1980,"['√ Jeremiah Denton (Republican) 50.2', 'Jim F...",Donald W. Stewart\r\n,D,Alabama,AL,1980
4,AL_sen_1984,"['√ Howell Heflin (Democratic) 62.7', 'Albert ...",Howell Heflin\r\n,D,Alabama,AL,1984


In [838]:
def first_clean(df):
    #initialize
    n_rows = df.shape[0]
    df['GOP_win'] = 0
     
        
    #extract winner party
    for i in range(n_rows):
        win_party = party_extract(df.loc[i, 'candidates'].split(',')[0])
        df.loc[i, 'GOP_win'] = np.where(win_party == 'R', 1, 0)

        
    # extract and clean winner
    for i in range(n_rows):
        df.loc[i, 'winner'] = trimmer(cleaner(df.loc[i, 'candidates'].split(',')[0]))

        
    #extract and clean runner up
    for i in range(n_rows):
        try:
            rival = cleaner(df.loc[i, 'candidates'].split(',')[1])
            if len(rival) > 6:
                df.loc[i, 'rival'] = trimmer(rival)
            else:
                df.loc[i, 'rival'] = cleaner(df.loc[i, 'candidates'].split(',')[2])
        except:
            df.loc[i, 'rival'] = 'Unopposed'   
    
    
    #clean candidates col
    for i in range(n_rows):
        df.loc[i, 'candidates'] = cleaner(df.loc[i, 'candidates'])
    
    
    #clean incumbent
    for i in range(n_rows):
        df.loc[i, 'inc'] = cleaner(df.loc[i, 'inc'])
    
    # binary for predecessor party
    df['pred_GOP'] = np.where(df['inc_party'] == 'R', 1, 0)
    df['pred_DEM'] = np.where(df['inc_party'] == 'D', 1, 0)
    
    
    # unopposed binary
    df['unopposed'] = np.where(df['rival'] == 'Unopposed', 1, 0)  

    
    return df

In [839]:
govs = first_clean(govs)
sens = first_clean(sens)

In [840]:
govs.head(10)

Unnamed: 0,race_id,candidates,inc,inc_party,state,abbrev,year,GOP_win,winner,rival,pred_GOP,pred_DEM,unopposed
0,AL_gov_1978,Fob James Democratic H Guy Hunt Repub...,George Wallace,D,Alabama,AL,1978,0,Fob James,H Guy Hunt,0,1,0
1,AL_gov_1982,George Wallace Democratic Emory Folmar ...,Fob James,D,Alabama,AL,1982,0,George Wallace,Emory Folmar,0,1,0
2,AL_gov_1986,H Guy Hunt Republican Bill Baxley Dem...,George Wallace,D,Alabama,AL,1986,1,H Guy Hunt,Bill Baxley,0,1,0
3,AL_gov_1990,Paul Hubbert Democratic n,H Guy Hunt,R,Alabama,AL,1990,0,Paul Hubbert,Unopposed,1,0,1
4,AL_gov_1994,Fob James Republican n,Jim Folsom,D,Alabama,AL,1994,1,Fob James,Unopposed,0,1,1
5,AL_gov_1998,Don Siegelman Democratic n,Fob James,R,Alabama,AL,1998,0,Don Siegelman,Unopposed,1,0,1
6,AL_gov_2002,Bob Riley Republican John Sophocleus L...,Don Siegelman,D,Alabama,AL,2002,1,Bob Riley,John Sophocleus,0,1,0
7,AL_gov_2006,Bob Riley R Lucy Baxley D n,Bob Riley,R,Alabama,AL,2006,1,Bob Riley,Lucy Baxley,1,0,0
8,AL_gov_2010,Robert Bentley R nRon Sparks D ...,Bob Riley,R,Alabama,AL,2010,1,Robert Bentley,nRon Sparks D,1,0,0
9,AL_gov_2014,Robert Bentley R Parker Griffith D ...,Robert Bentley,R,Alabama,AL,2014,1,Robert Bentley,Parker Griffith D,1,0,0


In [841]:
def second_clean(df):
    
    #initialize
    n_rows = df.shape[0]
    df['inc_running'] = 0
    df['inc_GOP_running'] = 0
    df['inc_DEM_running'] = 0
    
    #check incumbent running
    for i in range(n_rows):
        inc = df.loc[i, 'inc']
        candidates = df.loc[i, 'candidates']
        if (inc in candidates):
            df.loc[i, 'inc_running'] = 1
        else:
            df.loc[i, 'inc_running'] = 0
    
    
    # inc parties
    for i in range(n_rows):
        if (df.loc[i, 'inc_running'] == 1):
            if (df.loc[i, 'inc_party']=='R'):
                df.loc[i, 'inc_GOP_running'] = 1
            else:
                df.loc[i, 'inc_DEM_running'] = 1
    
    df.drop(['candidates', 'inc', 'inc_party', 'inc_running'], axis = 1, inplace=True)
    
    
    return df

In [842]:
govs = second_clean(govs)
sens = second_clean(sens)

In [844]:
sens.head()

Unnamed: 0,race_id,state,abbrev,year,GOP_win,winner,rival,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running
0,NY_sen_1976,New_York,NY,1976,0,Daniel P Moynihan,James L Buckley,0,0,0,0,1
1,MN_sen_1996,Minnesota,MN,1996,0,Paul Wellstone Democratic,Rudy Boschwitz,0,0,0,0,1
2,AL_sen_1978,Alabama,AL,1978,0,Howell Heflin,Jerome B Couch,0,1,0,0,0
3,AL_sen_1980,Alabama,AL,1980,1,Jeremiah Denton,Jim Folsom Jr,0,1,0,0,0
4,AL_sen_1984,Alabama,AL,1984,0,Howell Heflin,Albert L Smith Jr,0,1,0,0,1


In [845]:
govs.head()

Unnamed: 0,race_id,state,abbrev,year,GOP_win,winner,rival,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running
0,AL_gov_1978,Alabama,AL,1978,0,Fob James,H Guy Hunt,0,1,0,0,0
1,AL_gov_1982,Alabama,AL,1982,0,George Wallace,Emory Folmar,0,1,0,0,0
2,AL_gov_1986,Alabama,AL,1986,1,H Guy Hunt,Bill Baxley,0,1,0,0,0
3,AL_gov_1990,Alabama,AL,1990,0,Paul Hubbert,Unopposed,1,0,1,0,0
4,AL_gov_1994,Alabama,AL,1994,1,Fob James,Unopposed,0,1,1,0,0


In [847]:
govs.tail()

Unnamed: 0,race_id,state,abbrev,year,GOP_win,winner,rival,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running
542,WY_gov_1998,Wyoming,WY,1998,0,John Vinich,Dave Dawson,1,0,0,0,0
543,WY_gov_2002,Wyoming,WY,2002,0,Dave Freudenthal,Eli Bebout,1,0,0,0,0
544,WY_gov_2006,Wyoming,WY,2006,0,Dave Freudenthal,Ray Hunkins,0,1,0,0,1
545,WY_gov_2010,Wyoming,WY,2010,1,Matt Mead,nLeslie Petersen D,0,1,0,0,0
546,WY_gov_2014,Wyoming,WY,2014,1,Matt Mead,Pete Gosar D,1,0,0,1,0


In [848]:
govs.to_csv('./governors_races_cleaned.csv')
sens.to_csv('./senate_races_cleaned.csv')