In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stat

In [53]:
# Build a per-district, per-party vote table from the raw House returns:
#   1. load the CSV and keep regular (non-special) elections,
#   2. collapse party labels onto DEMOCRAT / REPUBLICAN where the label contains them,
#   3. merge fusion-ticket rows into one row per candidate,
#   4. sum votes per party and keep the two largest parties in each district.

# Read in dataset and choose relevant columns
house_df = pd.read_csv('1976-2022-house.csv')
# Drop special elections (the `~` mask requires 'special' to be parsed as a boolean column)
house_df = house_df[~house_df['special']]
house_df = house_df[['year', 'state_po', 'district', 'candidate', 'party', 'candidatevotes', 'totalvotes']]

# Drop the WRITEIN / SCATTERING pseudo-candidates. The trailing .copy() yields an independent
# frame so the column assignments below do not raise SettingWithCopyWarning.
house_df = house_df[(house_df['candidate'] != 'WRITEIN') & (house_df['candidate'] != 'SCATTERING')].copy()
# Missing party labels become the literal string 'nan', which keeps the substring tests below safe
house_df['party'] = house_df['party'].astype(str)

# Make some simplifications about party affiliation:
# any label containing DEMOCRAT or LIBERAL counts as DEMOCRAT, any label containing
# REPUBLICAN or CONSERVATIVE counts as REPUBLICAN, everything else is left as-is.
house_df['party'] = house_df['party'].apply(lambda x :
    'DEMOCRAT' if (('DEMOCRAT' in x) or ('LIBERAL' in x)) else
    ('REPUBLICAN' if (('REPUBLICAN' in x) or ('CONSERVATIVE' in x)) else x)
)

# Rename the state_po (postal code) column to just state; column order is unchanged
house_df = house_df.rename(columns={'state_po': 'state'})

# Find primary party of candidates in fusion tickets:
# a candidate listed under several parties keeps the row (and party) with the most votes.
# NOTE: groupby drops rows whose key (e.g. candidate) is NaN, so such rows leave the table here.
candidate_keys = ['year', 'state', 'district', 'candidate']
grouped = house_df.groupby(candidate_keys)
house_df = house_df.loc[grouped['candidatevotes'].idxmax()]

# Find total votes for fusion tickets: replace the single-line vote count by the sum over
# all of the candidate's ballot lines (`grouped` still refers to the pre-filter frame).
house_df = house_df[['year', 'state', 'district', 'candidate', 'party', 'totalvotes']]
sums = grouped['candidatevotes'].sum().reset_index()
house_df = pd.merge(house_df, sums, 'inner', on = candidate_keys)

# Find total votes cast per party
house_df = house_df.groupby(['year', 'state', 'district', 'party', 'totalvotes'])['candidatevotes'].sum().reset_index()

# Aptly rename candidatevotes, which no longer refers to individual candidates
house_df = house_df.rename(columns={'candidatevotes': 'votes'})
house_df = house_df[['year', 'state', 'district', 'party', 'votes', 'totalvotes']]

# Filter only top two parties.
# Rows are selected by their per-district rank rather than by merging back on the vote
# count: joining on `votes` duplicates rows whenever two parties in a district have the
# same number of votes. method='first' breaks ties by row order, so at most two rows per
# district survive.
district_keys = ['year', 'state', 'district']
grouped = house_df.groupby(district_keys)
vote_rank = grouped['votes'].rank(method='first', ascending=False)
house_df = house_df[vote_rank <= 2].reset_index(drop=True)

# Calculate two party vote percentage (share of the combined top-two vote)
house_df['two_party_pct'] = house_df['votes'] / house_df.groupby(district_keys)['votes'].transform('sum')
two_party_votes = house_df[district_keys + ['votes', 'two_party_pct']]

# Further simplification: The top two parties are always Democrat and Republican
party_index = list(house_df.columns).index('party')
def simplify(df):
    """Relabel a district's two rows so that they read DEMOCRAT vs REPUBLICAN.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one (year, state, district) group; must contain a 'party' column.

    Returns
    -------
    pandas.DataFrame
        `df` itself when it does not have exactly two rows or already holds one
        DEMOCRAT and one REPUBLICAN row; otherwise a relabelled copy (the input
        is never modified).

    Relabelling rule: a row whose party is neither DEMOCRAT nor REPUBLICAN gets
    DEMOCRAT if the other row is REPUBLICAN, and REPUBLICAN otherwise. Row 0 is
    processed first, so two minor parties become REPUBLICAN (row 0) and
    DEMOCRAT (row 1).
    """
    if len(df) != 2:
        return df

    major_parties = {'DEMOCRAT', 'REPUBLICAN'}
    if set(df['party']) == major_parties:
        return df

    copy = df.copy()
    # Look the column position up on the frame actually received: depending on how the
    # caller groups the data, its column layout may differ from the full table's.
    party_col = list(copy.columns).index('party')

    for row, other in ((0, 1), (1, 0)):
        if copy.iat[row, party_col] not in major_parties:
            copy.iat[row, party_col] = (
                'DEMOCRAT' if copy.iat[other, party_col] == 'REPUBLICAN' else 'REPUBLICAN'
            )

    # Only reachable when both rows carry the same major party, which cannot be
    # relabelled sensibly; show the full rows so the district can be identified.
    if set(copy['party']) != major_parties:
        print(copy)

    return copy

# Relabel every district's top-two rows as DEMOCRAT vs REPUBLICAN.
# The groups are iterated explicitly instead of using groupby.apply: whether apply prepends
# the group keys to the index (and whether it passes the key columns to the function)
# differs between pandas versions, and with the keys in both the index and the columns
# reset_index() raises "cannot insert year, already exists". Iteration always yields the
# full sub-frame with its original row labels.
grouped = house_df.groupby(['year', 'state', 'district'])
house_df = pd.concat([simplify(district_rows) for _, district_rows in grouped])
# Keeps the previous row label as an 'index' column, matching the layout of earlier exports
house_df = house_df.reset_index()

# Calculate total vote percentage
house_df['total_pct'] = house_df['votes']/house_df['totalvotes']

# Exports per district dataset
house_df.to_csv(path_or_buf='1976-2022-house-two-party-vote-by-district.csv')

# Simplification done by assuming that districts within a state:
# a) Have the same population
# b) Have the same proportion of voter (by-party) turnout
# Necessary assumptions because lack of data regarding uncontested seats
# NOTE(review): a party with no row in a district is left out of that party's mean rather
# than counted as 0, so the two state-level shares need not add up to 1 - confirm intended.
house_df = house_df.groupby(['year', 'state', 'party'])['two_party_pct'].mean()
house_df.to_csv(path_or_buf='1976-2022-house-two-party-vote-by-state.csv')

3