# Import packages

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV

# Get Data

In [24]:
# Load the historical voting table; the CSV's "Unnamed: 0" column becomes the index.
vote_by_year = pd.read_csv(
    "datasets/historic_voting.csv",
    index_col="Unnamed: 0",
)

In [25]:
# Load the policy search-trend table (second CSV row is the header, first
# column is the index), give the columns short names and order the rows
# by "Region".
policy_columns = ["economy", "abortion", "immigration", "Race", "health_care"]
perc_trend = pd.read_csv(
    "datasets/rep_dem_policy_dist.csv",
    header=[1],
    index_col=0,
)
perc_trend.columns = policy_columns
perc_trend = perc_trend.sort_values(by="Region")

In [26]:
# Remove the trailing % sign from every value.
# Done column-wise with the vectorised string accessor instead of
# `perc_trend[col][i] = ...`: chained assignment is unreliable (and a no-op
# under pandas copy-on-write), integer lookups on a label index are
# deprecated, and the row count no longer has to be hard-coded.
# `rstrip("%")` only removes an actual % sign, so re-running the cell does
# not eat into the digits.
for col in perc_trend.columns:
    perc_trend[col] = perc_trend[col].str.rstrip("%")

In [27]:
# Per-state total of the voting table (every column except "year"),
# stored as the number of elections voted Democrat.
# NOTE(review): values are assigned by position, so this assumes the columns
# of vote_by_year are in the same order as the rows of perc_trend — confirm.
state_totals = vote_by_year.drop(columns="year").sum()
perc_trend["votes_dem"] = state_totals.tolist()

In [28]:
# Binary target: 1 when the state voted Democrat in at least 4 of the
# counted elections, otherwise 0.
leans_dem = perc_trend["votes_dem"] >= 4
perc_trend["label"] = leans_dem.astype("int64")

In [29]:
# Cast every column of the table to int. The percentage columns are still
# strings at this point (only their last character was stripped above), so
# this is what turns them into numbers the model can use.
perc_trend = perc_trend.astype(int)

## Model Building

In [30]:
# Feature matrix (relative popularity of each policy search term) and
# target vector, followed by a reproducible train/test split.
cols = ["economy", "abortion", "immigration", "Race", "health_care"]

X = perc_trend.loc[:, cols]
y = perc_trend["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1776,
)

In [31]:
# Shallow random forest (300 trees, depth 2) fitted on the training split.
# The seed is fixed so the fitted model is reproducible.
rf_params = {
    "n_estimators": 300,
    "max_depth": 2,
    "min_samples_leaf": 1,
    "min_samples_split": 2,
    "random_state": 1776,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, n_estimators=300, random_state=1776)

In [32]:
# Predict the class label for every row of X. X is the full feature table,
# so this includes the rows the forest was fitted on as well as the held-out
# rows. The result is kept in its own variable, not added to perc_trend.
rf_pred = rf.predict(X)

### Review Model Predictions

In [33]:
# Pull the 2016 row out of the historical voting table and attach it to
# perc_trend as a reference column for judging the model.
past_vote = pd.read_csv(
    "datasets/historic_voting.csv",
    index_col="Unnamed: 0",
)
rows_2016 = past_vote.loc[past_vote["year"] == 2016]
votes_2016 = rows_2016.drop(columns="year").to_numpy()
# First (expected to be only) matching row; values are assigned by position.
perc_trend["2016"] = votes_2016[0]

In [34]:
# Drop the helper columns that were only needed for labelling and evaluation.
# Reassigning (rather than inplace=True) and ignoring already-missing columns
# keeps the cell idempotent: re-running it no longer raises a KeyError.
perc_trend = perc_trend.drop(
    columns=["votes_dem", "label", "2016"],
    errors="ignore",
)

In [35]:
# Build a table of the forest's class probabilities for every row of X,
# indexed like perc_trend. Label 0 is reported as "rep", label 1 as "dem".
class_probabilities = rf.predict_proba(X)
proba_party = pd.DataFrame(
    data=class_probabilities,
    index=perc_trend.index,
    columns=["rep", "dem"],
)

In [36]:
# Display the probability table: one row per Region, columns "rep" and "dem".
proba_party

Unnamed: 0_level_0,rep,dem
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,0.941556,0.058444
Alaska,0.595235,0.404765
Arizona,0.499286,0.500714
Arkansas,0.85317,0.14683
California,0.244105,0.755895
Colorado,0.207777,0.792223
Connecticut,0.134838,0.865162
Delaware,0.587419,0.412581
District of Columbia,0.064607,0.935393
Florida,0.666237,0.333763


## Calculate Probabilities for Current Year

In [37]:
# Read the eleven monthly Google Trends exports (files 1x2 … 11x12) and tag
# each frame with the first month number of its file name.
trends_dict = {}
month_pairs = zip(range(1, 12), range(2, 13))
for start_month, end_month in month_pairs:
    month_df = pd.read_csv(f"datasets/google_trends/{start_month}x{end_month}_trends.csv")
    month_df["month"] = start_month
    trends_dict[f"{start_month}x{end_month}_trends"] = month_df

In [38]:
# Stack the monthly frames into one table, most recent month first
# (11x12 down to 1x2).
dict_list = [
    trends_dict[f"{month}x{month + 1}_trends"]
    for month in range(11, 0, -1)
]

past_trends = pd.concat(dict_list)

In [39]:
# Attach a weight to every row based on its month. More recent months count
# more; the eleven weights add up to 1.
month_weights = {
    11: .20,
    10: .15,
    9: .15,
    8: .10,
    7: .10,
    6: .05,
    5: .05,
    4: .05,
    3: .05,
    2: .05,
    1: .05,
}
past_trends["weight"] = past_trends["month"].map(month_weights)

In [40]:
# Create a weighted column for each search term: raw value times the
# row's month weight.
weighted_sources = {
    "weighted_economy": "economy",
    "weighted_abortion": "abortion",
    "weighted_immigration": "immigration",
    "weighted_race": "Race",
    "weighted_health_care": "health care",
}
for weighted_name, term in weighted_sources.items():
    past_trends[weighted_name] = past_trends[term] * past_trends["weight"]

In [41]:
# Sum every column per geoName, then discard the raw (non-weighted) columns
# so only geoName and the weighted totals remain.
nw_cols = ["economy", "abortion", "immigration", "Race", "health care", "month", "weight"]

totals_by_state = past_trends.groupby(["geoName"], as_index=False).sum()
weighted_df = totals_by_state.drop(columns=nw_cols)

In [42]:
# Display the weighted search-term totals, one row per geoName.
weighted_df

Unnamed: 0,geoName,weighted_economy,weighted_abortion,weighted_immigration,weighted_race,weighted_health_care
0,Alabama,11.9,14.2,5.95,54.2,13.75
1,Alaska,11.9,11.8,6.4,55.0,14.9
2,Arizona,13.1,10.7,10.8,46.95,18.45
3,Arkansas,12.25,11.5,7.85,54.05,14.35
4,California,15.85,9.55,14.4,45.85,14.35
5,Colorado,12.5,9.1,10.85,50.8,16.75
6,Connecticut,12.9,8.9,11.25,48.95,18.0
7,Delaware,11.85,10.55,9.95,50.45,17.2
8,District of Columbia,15.2,10.05,22.85,32.2,19.7
9,Florida,12.3,10.5,13.1,45.6,18.5


In [43]:
# Predict rep/dem probabilities from the weighted search-term totals.
# The forest was fitted on columns named economy / abortion / immigration /
# Race / health_care, so the weighted columns are selected in that order and
# renamed to match. Recent scikit-learn versions check feature names at
# predict time and warn or raise when they differ from those seen in fit.
weighted_to_feature = {
    "weighted_economy": "economy",
    "weighted_abortion": "abortion",
    "weighted_immigration": "immigration",
    "weighted_race": "Race",
    "weighted_health_care": "health_care",
}
weighted_features = (
    weighted_df.set_index("geoName")[list(weighted_to_feature)]
    .rename(columns=weighted_to_feature)
)
weighted_preds = pd.DataFrame(rf.predict_proba(weighted_features),
                              columns=["rep", "dem"],
                              index=weighted_features.index)

In [44]:
# Export predictions: write the rep/dem probability table (indexed by
# geoName) to a CSV file under datasets/.
weighted_preds.to_csv("datasets/current_weighted_preds.csv")