In [93]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

In [7]:
#Loading the four tab-delimited source tables.
#Using the `pd` alias here: the imports cell only binds `pd`, so the bare name `pandas` raises NameError on a fresh kernel.
demographics = pd.read_table(r"./CCL_2023_assessment_data/data/demographics.txt")
projections = pd.read_table(r"./CCL_2023_assessment_data/data/projections.txt")
results = pd.read_table(r"./CCL_2023_assessment_data/data/results.txt")
#NOTE(review): geoid_key is not referenced by any of the cells visible here - confirm it is used further down
geoid_key = pd.read_table(r"./CCL_2023_assessment_data/data/geoid_key.txt")

In [24]:
#Left-joining on geoid so every demographics row is kept, even when it has no matching projection or result
comb1 = demographics.merge(projections, how='left', on='geoid')
comb2 = comb1.merge(results, how='left', on='geoid')

In [26]:
#For each measure: off = actual yes share (yes / (yes + no)) minus projected yes share (proj yes / proj_votes)
for yes_col, no_col, proj_yes_col, off_col in (
    ('mw_yes', 'mw_no', 'proj_mw_yes', 'mw_off'),
    ('bg_yes', 'bg_no', 'proj_bg_yes', 'bg_off'),
    ('rcv_yes', 'rcv_no', 'proj_rcv_yes', 'rcv_off'),
):
    actual_yes_share = comb2[yes_col] / (comb2[yes_col] + comb2[no_col])
    projected_yes_share = comb2[proj_yes_col] / comb2['proj_votes']
    comb2[off_col] = actual_yes_share - projected_yes_share

In [33]:
#Removing the raw vote counts and projected yes counts that the *_off columns were computed from
demos_off = comb2.drop(
    columns=['proj_rcv_yes', 'proj_bg_yes', 'proj_mw_yes',
             'rcv_yes', 'bg_yes', 'mw_yes',
             'rcv_no', 'bg_no', 'mw_no'],
)

In [83]:
np.random.seed(13) #arbitrary seed, fixed so the default (global-state) shuffle below is repeatable

def split_train_test(data, test_ratio, random_state=None):
    """Randomly split a DataFrame into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. Rows are selected by position with ``.iloc``.
    test_ratio : float
        Fraction of rows to hold out, between 0 and 1 inclusive. The test set
        gets ``int(len(data) * test_ratio)`` rows, i.e. the size is rounded down.
    random_state : int, numpy.random.Generator or None, optional
        When None (the default) the shuffle draws from NumPy's global random
        state, so it is controlled by ``np.random.seed``. When given, a
        dedicated generator is built from it and the global state is untouched.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, in that order.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within [0, 1].
    """
    #A negative ratio or one above 1 would slice the shuffled indices in a meaningless way, so reject it up front
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1, got {!r}".format(test_ratio))

    n_rows = len(data)
    #shuffling the row positions to prevent learning any patterns from the data's original order
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    #the first test_set_size shuffled positions are the test set, everything after them is the training set
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

#Holding out 20% of the rows as the test set
train_set, test_set = split_train_test(data=demos_off, test_ratio=.2)
#A few offs are missing for counties where results aren't present.
#Keeping only training rows where all three offs are present, rather than filling with medians like I do for hhincome, b/c the result is so crucial to our analysis
train_set = train_set[train_set[['mw_off', 'bg_off', 'rcv_off']].notna().all(axis=1)]

In [91]:
#Separating predictors from the three targets.
#geoid goes too: it is an identifier, and we don't want it as a predictor
train_set_no_targets = train_set.drop(columns=['mw_off', 'bg_off', 'rcv_off', 'geoid'])
train_set_mw_labels = train_set.loc[:, 'mw_off'].copy()
train_set_bg_labels = train_set.loc[:, 'bg_off'].copy()
train_set_rcv_labels = train_set.loc[:, 'rcv_off'].copy()

#Working on a copy so the imputation step leaves train_set_no_targets untouched
train_copy = train_set_no_targets.copy()
#Share of missing values per column
train_copy.isna().sum() / train_copy.shape[0]
#avg_hhincome has a few missing values, so those get imputed next

n_registered           0.000000
share_dem              0.000000
share_rep              0.000000
share_white            0.000000
share_afam             0.000000
share_female           0.000000
avg_hhincome           0.002545
avg_popdens            0.000000
avg_partyscore         0.000000
avg_collegescore       0.000000
avg_gunownscore        0.000000
avg_gvpscore           0.000000
avg_churchscore        0.000000
avg_marijuanascore     0.000000
avg_fiscalprogscore    0.000000
avg_choicescore        0.000000
avg_enviroscore        0.000000
proj_votes             0.000000
dtype: float64

In [86]:
#Median imputation for the handful of NaNs in household income.
#Every column here is int/float, so there is no need to narrow the frame before imputing.
#Filling w/ the median rather than dropping b/c this dataset isn't huge - I don't want to lose rows
imputer_median = SimpleImputer(strategy="median")
X = imputer_median.fit_transform(train_copy)

#Sanity check: the medians the imputer learned should match pandas' own column medians
print(imputer_median.statistics_)
print(train_copy.median().values)

#Wrapping the imputed values back into a DataFrame with the original row index and column names
train_final = pd.DataFrame(
    X,
    index=train_copy.index,
    columns=train_copy.columns,
)

#Checking if our cleaning worked
ifmissing = train_final.isnull().values.any()
prctmissing = train_final.isnull().sum() / len(train_final)
print("If this prints as zeroes, we replaced the missing values:\n{}".format(prctmissing))

[9.60000e+02 2.71000e-01 3.03000e-01 9.33000e-01 3.00000e-03 5.02000e-01
 4.11585e+01 1.60893e+02 4.64950e+01 4.13340e+01 6.25320e+01 3.18270e+01
 2.34560e+01 6.28930e+01 3.10760e+01 5.35940e+01 2.66730e+01 7.12000e+02]
[9.60000e+02 2.71000e-01 3.03000e-01 9.33000e-01 3.00000e-03 5.02000e-01
 4.11585e+01 1.60893e+02 4.64950e+01 4.13340e+01 6.25320e+01 3.18270e+01
 2.34560e+01 6.28930e+01 3.10760e+01 5.35940e+01 2.66730e+01 7.12000e+02]
If this prints as zeroes, we replaced the missing values:
n_registered           0.0
share_dem              0.0
share_rep              0.0
share_white            0.0
share_afam             0.0
share_female           0.0
avg_hhincome           0.0
avg_popdens            0.0
avg_partyscore         0.0
avg_collegescore       0.0
avg_gunownscore        0.0
avg_gvpscore           0.0
avg_churchscore        0.0
avg_marijuanascore     0.0
avg_fiscalprogscore    0.0
avg_choicescore        0.0
avg_enviroscore        0.0
proj_votes             0.0
dtype: float64


In [87]:
#Labels can be missing for counties where there aren't results.
#Keeping only the non-missing ones rather than imputing, b/c filling in a median error for a county is too big a leap.
#NOTE: train_set already had rows with a missing off dropped right after the split, so these filters should not remove
#anything; if they ever did, the labels would no longer line up row-for-row with train_final.
train_set_mw_labels_clean = train_set_mw_labels[train_set_mw_labels.notna()]
train_set_bg_labels_clean = train_set_bg_labels[train_set_bg_labels.notna()]
train_set_rcv_labels_clean = train_set_rcv_labels[train_set_rcv_labels.notna()]

In [90]:
#Ordinary least squares on the imputed training features, with mw_off as the target
lin_reg = LinearRegression().fit(train_final, train_set_mw_labels_clean)
#Predicting back on the same rows the model was fit on (in-sample predictions)
lin_predict = lin_reg.predict(train_final)

In [99]:
#Evaluating linear regression predictions.
#lin_predict was made on train_final, so these are training-set (in-sample) scores, not held-out performance.
#Scoring against the same label series the model was fit on (train_set_mw_labels_clean).
lin_model_mse = mean_squared_error(train_set_mw_labels_clean, lin_predict)
#RMSE taken as sqrt(MSE): the `squared=` argument is deprecated in scikit-learn 1.4 and removed in 1.6
lin_model_rmse = np.sqrt(lin_model_mse)
lin_model_mae = mean_absolute_error(train_set_mw_labels_clean, lin_predict)
lin_model_r2 = r2_score(train_set_mw_labels_clean, lin_predict)
print(lin_model_mse, lin_model_rmse, lin_model_mae, lin_model_r2)

0.004257395329098849 0.06524871898435132 0.045070951801944766 0.3172051523617958


In [100]:
#Pairing each feature name (column 0) with its fitted linear-regression coefficient (column 1)
feature_coef_pairs = list(zip(train_final.columns, lin_reg.coef_))
pd.DataFrame(feature_coef_pairs)


Unnamed: 0,0,1
0,n_registered,-1.3e-05
1,share_dem,0.328905
2,share_rep,-0.410329
3,share_white,0.022755
4,share_afam,0.876112
5,share_female,-0.234267
6,avg_hhincome,0.000143
7,avg_popdens,1e-05
8,avg_partyscore,-0.009931
9,avg_collegescore,0.000304


In [95]:
#Degree-2 polynomial regression on the same imputed training features.
#(PolynomialFeatures is imported in the imports cell at the top of the notebook.)
#Tried going over degree=2 for this, but it requires ~2x the RAM I have on my system, so only degree 2 is fit here.
#Tried grid search as well, but was having errors. Entirely possible that was also a RAM issue.

#degree=2 expands the inputs to a bias column, every original feature, every squared feature,
#and every pairwise product of two features (interaction terms) - not just the squares
poly_features = PolynomialFeatures(degree=2)

#Making the regression
poly_model = LinearRegression()

#Fitting
train_cleaned_poly = poly_features.fit_transform(train_final)
poly_model.fit(train_cleaned_poly, train_set_mw_labels)

#Making the predictions (on the training rows themselves, so these are in-sample)
poly_predict = poly_model.predict(train_cleaned_poly)



In [97]:
#Evaluating polynomial regression predictions.
#poly_predict was made on the training features, so these are training-set (in-sample) scores;
#a higher R2 than the linear model here may just reflect the extra terms fitting the training data more closely.
poly_model_mse = mean_squared_error(train_set_mw_labels, poly_predict)
#RMSE taken as sqrt(MSE): the `squared=` argument is deprecated in scikit-learn 1.4 and removed in 1.6
poly_model_rmse = np.sqrt(poly_model_mse)
poly_model_mae = mean_absolute_error(train_set_mw_labels, poly_predict)
poly_model_r2 = r2_score(train_set_mw_labels, poly_predict)
print(poly_model_mse, poly_model_rmse, poly_model_mae, poly_model_r2)

0.002236687487591149 0.04729363051819081 0.03439876998675301 0.6412833260125423
