# Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV

# Get Data

In [2]:
# Import voting record data
# The CSV's "Unnamed: 0" column becomes the index; later cells rely on a "year" column being present.
vote_by_year = pd.read_csv("datasets/historic_voting.csv", index_col = "Unnamed: 0")

In [3]:
# Import policy search trend data
# header = [1] takes the column labels from the file's second row; the first column becomes the index.
perc_trend = pd.read_csv("datasets/rep_dem_policy_dist.csv", header = [1], index_col = 0)
# Replace the labels with short names (assumes exactly five data columns in this order — TODO confirm against the CSV)
perc_trend.columns = ["economy", "abortion", "immigration", "climate_change", "health_care"]
# Order the rows by "Region"
perc_trend = perc_trend.sort_values("Region")

In [4]:
# Remove % sign from data
# Each column is rewritten as a whole with the vectorised string accessor.
# The previous element-wise form (perc_trend[col][i] = ...) used chained
# indexing, which can write to a temporary copy instead of perc_trend, and it
# assumed exactly 51 rows addressed by integer position.
# rstrip("%") only removes a trailing percent sign, so a value that has no
# "%" suffix keeps its last digit.
for col in perc_trend.columns:
    perc_trend[col] = perc_trend[col].str.rstrip("%")

In [5]:
# Add a column for number of elections voted Democrat in the last 9 elections
# Sum every non-"year" column of the voting record in one pass. The totals are
# assigned by position, so the voting-record columns must be in the same order
# as the rows of perc_trend.
dem_vote_totals = vote_by_year.drop(columns = "year").sum()
perc_trend["votes_dem"] = dem_vote_totals.tolist()

In [6]:
# Label a row 1 (likely to vote dem) when votes_dem is at least 4, otherwise 0
perc_trend["label"] = (perc_trend["votes_dem"] >= 4).astype(int)

In [7]:
# Ensure that data are integers
# Casts every column of the frame; the percentage columns still hold strings
# at this point, so this raises if any value is not a clean integer string.
perc_trend = perc_trend.astype(int)

## Model Building and Testing

In [8]:
# Build the feature matrix and target from the relative popularity data,
# then hold out a test split (default test size, fixed seed for repeatability)
cols = ["economy", "abortion", "immigration", "climate_change", "health_care"]

X, y = perc_trend[cols], perc_trend["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 1776,
)

In [9]:
# Fit a 5-fold cross-validated Logistic Regression without an intercept term,
# then report its score on the training and testing splits
lr = LogisticRegressionCV(cv = 5, fit_intercept = False, random_state = 1776).fit(X_train, y_train)

lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)

print(f"Model Training Score: {lr_train_score}")
print(f"Model Testing Score:{lr_test_score}")

Model Training Score: 0.868421052631579
Model Testing Score:0.8461538461538461


In [10]:
# Set up a grid search over LogisticRegressionCV hyperparameters
# (3 iteration limits x 2 intercept settings, each scored with 5-fold CV)
param_grid = dict(
    max_iter = [100, 200, 300],
    fit_intercept = [True, False],
)

lr_grid = GridSearchCV(
    estimator = LogisticRegressionCV(cv = 5, random_state = 1776),
    param_grid = param_grid,
    cv = 5,
)

In [11]:
# Run the Logistic Regression grid search on the training split only
lr_grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                            dual=False, fit_intercept=True,
                                            intercept_scaling=1.0,
                                            l1_ratios=None, max_iter=100,
                                            multi_class='warn', n_jobs=None,
                                            penalty='l2', random_state=1776,
                                            refit=True, scoring=None,
                                            solver='lbfgs', tol=0.0001,
                                            verbose=0),
             iid='warn', n_jobs=None,
             param_grid={'fit_intercept': [True, False],
                         'max_iter': [100, 200, 300]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [12]:
# Keep the estimator that the grid search refit with its best parameter combination
lr_2 = lr_grid.best_estimator_

In [13]:
# Report the tuned Logistic Regression's score on both splits
lr_2_train_score = lr_2.score(X_train, y_train)
lr_2_test_score = lr_2.score(X_test, y_test)

print(f"Model Training Score: {lr_2_train_score}")
print(f"Model Testing Score:{lr_2_test_score}")

Model Training Score: 0.868421052631579
Model Testing Score:0.8461538461538461


In [14]:
# Predict a label for every row of X (training and testing rows alike).
# The result is a plain array here; it is attached to perc_trend as a column in a later cell.
lr_pred = lr_2.predict(X)

In [15]:
# Fit a baseline RandomForestClassifier with 100 trees and report its score
# on the training and testing splits
rf = RandomForestClassifier(n_estimators = 100, random_state=1776).fit(X_train, y_train)

rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)

print(f"Model Training Score: {rf_train_score}")
print(f"Model Testing Score:{rf_test_score}")

Model Training Score: 1.0
Model Testing Score:0.8461538461538461


In [16]:
# Set up a grid search over RandomForestClassifier hyperparameters
# (3 x 2 x 2 x 3 = 36 combinations, each scored with 5-fold CV)
param_grid = dict(
    n_estimators = [150, 200, 250],
    max_depth = [2, 3],
    min_samples_split = [2, 3],
    min_samples_leaf = [1, 2, 3],
)

rf_grid = GridSearchCV(
    estimator = RandomForestClassifier(random_state = 1776),
    param_grid = param_grid,
    cv = 5,
)

In [17]:
# Run the Random Forest grid search on the training split only
rf_grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=1776, verbose=0,
                                              warm_start=False),
             iid

In [18]:
# Show the hyperparameter combination the grid search selected
rf_grid.best_params_

{'max_depth': 2,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 200}

In [19]:
# Keep the Random Forest that the grid search refit with its best parameter combination
rf_2 = rf_grid.best_estimator_

In [20]:
# Report the tuned Random Forest's score on both splits
rf_2_train_score = rf_2.score(X_train, y_train)
rf_2_test_score = rf_2.score(X_test, y_test)

print(f"Model Training Score: {rf_2_train_score}")
print(f"Model Testing Score:{rf_2_test_score}")

Model Training Score: 0.868421052631579
Model Testing Score:0.8461538461538461


In [21]:
# Predict a label for every row of X (training and testing rows alike).
# The result is a plain array here; it is attached to perc_trend as a column in a later cell.
rf_pred = rf_2.predict(X)

### Review Model Predictions

In [22]:
# Attach the Random Forest and Logistic Regression predictions as new columns
perc_trend = perc_trend.assign(rf_pred = rf_pred, lr_pred = lr_pred)

In [23]:
# Pull the 2016 election row from the voting record to use as the yardstick
# for choosing between the two models
past_vote = pd.read_csv("datasets/historic_voting.csv", index_col = "Unnamed: 0")
is_2016 = past_vote["year"] == 2016
votes_2016 = past_vote.loc[is_2016].drop(columns = "year").values
# The first matching row is assigned by position, so the voting-record columns
# must be in the same order as the rows of perc_trend.
perc_trend["2016"] = votes_2016[0]

In [24]:
# Flag (1/0) each row where the Logistic Regression prediction differs from the 2016 result, then count the flags
lr_misses = perc_trend["2016"].ne(perc_trend["lr_pred"]).astype(int)
lr_misses.sum()

7

In [25]:
# Flag (1/0) each row where the Random Forest prediction differs from the 2016 result, then count the flags
rf_misses = perc_trend["2016"].ne(perc_trend["rf_pred"]).astype(int)
rf_misses.sum()

7

In [26]:
# Show the rows where the Random Forest and Logistic Regression predictions disagree
perc_trend.loc[perc_trend["rf_pred"] != perc_trend["lr_pred"]]

Unnamed: 0_level_0,economy,abortion,immigration,climate_change,health_care,votes_dem,label,rf_pred,lr_pred,2016
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alaska,22,25,15,16,22,0,0,0,1,0
Florida,20,22,22,6,30,3,0,0,1,0
Montana,20,25,11,12,32,1,0,1,0,0
Nebraska,20,24,18,9,29,0,0,1,0,0


Among the misclassifications where the Random Forest and Logistic Regression models disagreed, the Logistic Regression model predicted more swing states correctly in the context of the 2016 Presidential Election, so it will be the primary model moving forward.

In [27]:
# Remove the helper columns that were only needed for evaluating the models
testing_cols = ["votes_dem", "label", "rf_pred", "lr_pred", "2016"]
perc_trend = perc_trend.drop(columns = testing_cols)

In [28]:
# Build a DataFrame of per-class probabilities from the tuned Random Forest model, indexed like perc_trend
# NOTE(review): the "rep"/"dem" column names assume the model's class order is [0, 1] — confirm via rf_2.classes_
# NOTE(review): this uses rf_2, while the exported predictions further down come from lr_2 — confirm that is intended
proba_party = pd.DataFrame(rf_2.predict_proba(X), columns = ["rep", "dem"], index = perc_trend.index)

In [29]:
# Display the Random Forest class probabilities
proba_party

Unnamed: 0_level_0,rep,dem
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,0.950857,0.049143
Alaska,0.580306,0.419694
Arizona,0.498993,0.501007
Arkansas,0.847289,0.152711
California,0.263962,0.736038
Colorado,0.242271,0.757729
Connecticut,0.133897,0.866103
Delaware,0.574315,0.425685
District of Columbia,0.065825,0.934175
Florida,0.681857,0.318143


## Calculate Probabilities for Last Eleven Months

In [30]:
# Load the eleven monthly Google Trends files (1x2 through 11x12) and tag each
# frame with the number of its starting month
trends_dict = {}
for month in range(1, 12):
    next_month = month + 1
    month_df = pd.read_csv(f"datasets/google_trends/{month}x{next_month}_trends.csv")
    month_df["month"] = month
    trends_dict[f"{month}x{next_month}_trends"] = month_df

In [31]:
# Stack the monthly frames into one DataFrame, from 11x12 down to 1x2
month_keys = [f"{start}x{start + 1}_trends" for start in range(11, 0, -1)]
dict_list = [trends_dict[key] for key in month_keys]

past_trends = pd.concat(dict_list)

In [32]:
# Look up each row's weight from its month number.
# Higher-numbered months get larger weights; the eleven weights sum to 1.
month_weights = {
    11: .20,
    10: .15,
    9: .15,
    8: .10,
    7: .10,
    6: .05,
    5: .05,
    4: .05,
    3: .05,
    2: .05,
    1: .05,
}
past_trends["weight"] = past_trends["month"].map(month_weights)

In [33]:
# Create a weighted column for each search term (term value x that row's month weight)
weighted_names = {
    "economy": "weighted_economy",
    "abortion": "weighted_abortion",
    "immigration": "weighted_immigration",
    "climate change": "weighted_climate_change",
    "health care": "weighted_health_care",
}
for term, weighted_name in weighted_names.items():
    past_trends[weighted_name] = past_trends[term] * past_trends["weight"]

In [34]:
# Total every column per geoName, then discard the raw (non-weighted) term
# columns together with the month and weight helper columns
nw_cols = ["economy", "abortion", "immigration", "climate change", "health care", "month", "weight"]

geo_totals = past_trends.groupby(["geoName"], as_index=False).sum()
weighted_df = geo_totals.drop(columns = nw_cols)

In [35]:
# Display the weighted totals per geoName
weighted_df

Unnamed: 0,geoName,weighted_economy,weighted_abortion,weighted_immigration,weighted_climate_change,weighted_health_care
0,Alabama,23.9,27.2,13.05,6.8,29.05
1,Alaska,22.35,18.55,14.75,18.05,26.3
2,Arizona,22.15,18.05,18.6,8.25,32.95
3,Arkansas,24.0,22.8,16.1,8.45,28.65
4,California,26.05,15.3,24.65,10.7,23.3
5,Colorado,22.8,16.45,19.45,12.25,29.05
6,Connecticut,22.1,14.4,19.75,11.4,32.35
7,Delaware,23.1,19.05,16.5,9.15,32.2
8,District of Columbia,18.55,13.05,29.0,14.25,25.15
9,Florida,21.1,18.25,21.75,6.8,32.1


In [36]:
# Save the predictions for the past year
# lr_2 was fitted on columns named economy / abortion / immigration /
# climate_change / health_care. Recent scikit-learn versions reject a DataFrame
# whose column names differ from those seen during fit, so the "weighted_"
# prefix is stripped and the features are selected explicitly in the training
# order (cols) instead of relying on whatever column order weighted_df has.
weighted_features = weighted_df.set_index("geoName")
weighted_features.columns = [name.replace("weighted_", "", 1) for name in weighted_features.columns]
weighted_features = weighted_features[cols]

# NOTE(review): the "rep"/"dem" column names assume the model's class order is [0, 1] — confirm via lr_2.classes_
weighted_preds = pd.DataFrame(lr_2.predict_proba(weighted_features),
                              columns = ["rep", "dem"],
                              index = weighted_features.index)

In [37]:
# Export predictions
# The geoName index is written as the first column of the CSV.
weighted_preds.to_csv("datasets/weighted_preds.csv")