In [1]:
import pandas as pd

In [2]:
# Load the raw results from the CSV under House_dataset/ into `df`.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('House_dataset/1976-2020-house.csv')

In [3]:
# (rows, columns) of the frame exactly as loaded, before any cleaning
df.shape

(31101, 20)

In [4]:
# Remove the columns this analysis does not use, then discard every row that
# still contains a missing value.
unused_columns = ['mode', 'version', 'office', 'fusion_ticket', 'unofficial', 'writein',
                  'special', 'runoff', 'stage', 'state_cen', 'state_ic']
df = (
    df.drop(columns=unused_columns)
      .dropna()
      .reset_index()  # renumbers rows from 0; the old labels are kept in an 'index' column
)
df.shape

(27472, 10)

In [5]:
# keep only the rows whose total vote count is non-zero
has_votes = df['totalvotes'] != 0
df = df.loc[has_votes]
df.shape

(27471, 10)

In [6]:
# Share of the contest's total votes won by each candidate row, rounded to
# 4 decimal places. Computed column-wise instead of looping over iterrows().
# Rows with totalvotes == 0 were filtered out earlier, so the division is safe.
df['percentage'] = (df['candidatevotes'] / df['totalvotes']).round(4)

In [7]:
# Party labels that keep their own name; everything else is folded into 'OTHER'.
valid_parties = ['DEMOCRAT', 'REPUBLICAN', 'LIBERTARIAN', 'INDEPENDENT', 'GREEN', 'OTHER']

# One boolean-mask assignment relabels every non-listed party at once,
# replacing the per-row iterrows() loop with individual .loc writes.
df.loc[~df['party'].isin(valid_parties), 'party'] = 'OTHER'
    

In [8]:
# Tally how many candidate rows carry each party label; keys appear in the
# order the labels are first seen in the frame.
counter = {}
for party in df['party']:
    counter[party] = counter.get(party, 0) + 1

print(counter)

{'DEMOCRAT': 9501, 'REPUBLICAN': 9264, 'OTHER': 4349, 'INDEPENDENT': 1166, 'LIBERTARIAN': 2683, 'GREEN': 508}


In [9]:
# preview the first 10 rows of df, including the new 'percentage' column
df.head(10)

Unnamed: 0,index,year,state,state_po,state_fips,district,candidate,party,candidatevotes,totalvotes,percentage
0,0,1976,ALABAMA,AL,1,1,BILL DAVENPORT,DEMOCRAT,58906,157170,0.3748
1,1,1976,ALABAMA,AL,1,1,JACK EDWARDS,REPUBLICAN,98257,157170,0.6252
2,3,1976,ALABAMA,AL,1,2,J. CAROLE KEAHEY,DEMOCRAT,66288,156362,0.4239
3,5,1976,ALABAMA,AL,1,2,"WILLIAM L. """"BILL"""" DICKINSON",REPUBLICAN,90069,156362,0.576
4,6,1976,ALABAMA,AL,1,3,BILL NICHOLS,DEMOCRAT,106935,108048,0.9897
5,8,1976,ALABAMA,AL,1,3,OGBURN GARDNER,OTHER,1111,108048,0.0103
6,9,1976,ALABAMA,AL,1,4,TOM BEVILL,DEMOCRAT,141490,176022,0.8038
7,10,1976,ALABAMA,AL,1,4,LEONARD WILSON,REPUBLICAN,34531,176022,0.1962
8,12,1976,ALABAMA,AL,1,5,RONNIE G. FLIPPO,DEMOCRAT,113553,113560,0.9999
9,14,1976,ALABAMA,AL,1,6,BILLY E. DORSEY,OTHER,1021,162518,0.0063


In [10]:
# Build new_df: one record per contest (year, state, district) holding the
# vote count and vote share of each party group.
#
# Rows are grouped by adjacency, so every contest's rows must be contiguous
# in df. The year is part of the key so that two consecutive contests for the
# same state/district in different years are never merged.

# First letter of the party label -> column prefix. Any other first letter
# (e.g. INDEPENDENT, OTHER) is reported under 'oth'.
party_prefixes = {'d': 'dem', 'r': 'rep', 'l': 'lib', 'g': 'grn'}

district_rows = []   # finished per-contest records
temp_dict = {}       # record of the contest currently being read
curr_dist = None     # key of that contest; None until the first row is seen

for index, row in df.iterrows():
    row_dist = (row['year'], row['state_po'], row['district'])

    if row_dist != curr_dist:
        # A new contest begins: store the finished one. Nothing is stored on
        # the very first row, so no empty record can end up in the output.
        if temp_dict:
            district_rows.append(temp_dict)
        temp_dict = {
            'year': row['year'],
            'state': row['state_po'],
            'state_fips': int(row['state_fips']),
            'district': row['district'],
            'total_no': row['totalvotes'],
        }
        curr_dist = row_dist

    prefix = party_prefixes.get(row['party'][0].lower(), 'oth')
    # NOTE(review): when a contest has several rows for the same party group,
    # the later row overwrites the earlier one instead of adding to it.
    # This matches the previous behaviour; confirm that it is intended.
    temp_dict[prefix + '_no'] = row['candidatevotes']
    temp_dict[prefix + '_pct'] = row['percentage']

# A contest is only stored when the next one starts, so the final contest
# has to be stored explicitly once the loop ends.
if temp_dict:
    district_rows.append(temp_dict)

new_df = pd.DataFrame(district_rows)

In [11]:
def get_winning_party(row):
    """Return the one-letter code of the party group with the most votes.

    ``row`` is any mapping-like object indexable by the column names
    'dem_no', 'rep_no', 'grn_no', 'lib_no' and 'oth_no'.

    Returns 'd', 'r', 'g', 'l' or 'o' for the largest count, or '#' when no
    count is greater than zero. NaN counts never win because a comparison
    with NaN is false.
    """
    # Checked in this order; only a strictly larger count replaces the
    # current leader, so a tie is resolved in favour of the earlier entry.
    contenders = (
        ('dem_no', 'd'),
        ('rep_no', 'r'),
        ('grn_no', 'g'),
        ('lib_no', 'l'),
        ('oth_no', 'o'),
    )

    best_code, best_votes = '#', 0
    for column, code in contenders:
        votes = row[column]
        if votes > best_votes:
            best_code, best_votes = code, votes
    return best_code

In [12]:
# preview the first rows of the per-district frame; a party group with no
# row in a contest shows up as NaN
new_df.head()

Unnamed: 0,year,state,state_fips,district,total_no,dem_no,dem_pct,rep_no,rep_pct,oth_no,oth_pct,lib_no,lib_pct,grn_no,grn_pct
0,1976,AL,1,1,157170,58906.0,0.3748,98257.0,0.6252,,,,,,
1,1976,AL,1,2,156362,66288.0,0.4239,90069.0,0.576,,,,,,
2,1976,AL,1,3,108048,106935.0,0.9897,,,1111.0,0.0103,,,,
3,1976,AL,1,4,176022,141490.0,0.8038,34531.0,0.1962,,,,,,
4,1976,AL,1,5,113560,113553.0,0.9999,,,,,,,,


In [18]:
# Build final_df: one record per (year, state) with the number of districts
# won by each party group and that group's share of the districts won.
#
# Rows are grouped by adjacency, so every state's districts for a given year
# must be contiguous in new_df. The year is part of the key so the same state
# in two consecutive elections is never merged.

# get_winning_party() code -> seat-count column. The '#' code (no winner
# found) has no column and is therefore not counted.
seat_columns = {'d': 'dem_no', 'r': 'rep_no', 'l': 'lib_no', 'g': 'grn_no', 'o': 'oth_no'}

# (count column, share column) pairs, in the order they are added to a record.
share_columns = [('dem_no', 'dem_pct'), ('rep_no', 'rep_pct'), ('lib_no', 'lib_pct'),
                 ('grn_no', 'grn_pct'), ('oth_no', 'oth_pct')]


def _finish_state(record):
    """Add missing seat counts (as 0), the seat total and each seat share to ``record``."""
    total = 0
    for count_col, _ in share_columns:
        total += record.setdefault(count_col, 0)
    record['total_no'] = total

    for count_col, pct_col in share_columns:
        # If no district produced a winner the total is 0; report a share of
        # 0.0 rather than raising ZeroDivisionError.
        record[pct_col] = round(record[count_col] / total, 4) if total else 0.0
    return record


state_rows = []      # finished per-state records
temp_dict = {}       # record of the state currently being read
curr_state = None    # key of that state; None until the first row is seen

for index, row in new_df.iterrows():
    row_state = (row['year'], row['state'])

    if row_state != curr_state:
        # A new state (or year) begins: finish and store the previous one.
        if temp_dict:
            state_rows.append(_finish_state(temp_dict))
        temp_dict = {
            'year': row['year'],
            'state': row['state'],
            'state_fips': int(row['state_fips']),
        }
        curr_state = row_state

    seat_col = seat_columns.get(get_winning_party(row))
    if seat_col is not None:
        temp_dict[seat_col] = temp_dict.get(seat_col, 0) + 1

# A state is only stored when the next one starts, so the final state has to
# be finished and stored explicitly once the loop ends.
if temp_dict:
    state_rows.append(_finish_state(temp_dict))

final_df = pd.DataFrame(state_rows)

In [19]:
# show up to the first 55 rows of the per-state frame
final_df.head(55)

Unnamed: 0,year,state,state_fips,rep_no,dem_no,lib_no,grn_no,oth_no,total_no,dem_pct,rep_pct,lib_pct,grn_pct,oth_pct
0,1976,AL,1,3,4,0,0,0,7,0.5714,0.4286,0.0,0.0,0.0
1,1976,AK,2,1,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0
2,1976,AZ,4,2,2,0,0,0,4,0.5,0.5,0.0,0.0,0.0
3,1976,AR,5,1,3,0,0,0,4,0.75,0.25,0.0,0.0,0.0
4,1976,CA,6,14,29,0,0,0,43,0.6744,0.3256,0.0,0.0,0.0
5,1976,CO,8,2,3,0,0,0,5,0.6,0.4,0.0,0.0,0.0
6,1976,CT,9,2,4,0,0,0,6,0.6667,0.3333,0.0,0.0,0.0
7,1976,DE,10,1,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0
8,1976,FL,12,5,10,0,0,0,15,0.6667,0.3333,0.0,0.0,0.0
9,1976,GA,13,0,10,0,0,0,10,1.0,0.0,0.0,0.0,0.0


In [16]:
# Number of rows in final_df.
# NOTE(review): this cell's execution count (16) is lower than that of the
# cells above it (18, 19), so the recorded output may predate the current
# final_df — re-run it after rebuilding the frame.
print(len(final_df))

456
