# Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV

# Get Data

In [2]:
# Import voting record data
# The CSV column that pandas names "Unnamed: 0" is used as the row index. Later cells
# rely on a "year" column being present and sum each of the remaining columns.
vote_by_year = pd.read_csv("datasets/historic_voting.csv", index_col = "Unnamed: 0")

In [3]:
# Load the policy search trend data: the header is taken from the second line of the
# file and the first column becomes the index
topic_names = ["economy", "abortion", "immigration", "climate_change", "health_care"]
perc_trend = pd.read_csv(
    "datasets/rep_dem_policy_dist.csv",
    header = [1],
    index_col = 0,
)

# Swap in short snake_case topic names, then order the rows by "Region"
perc_trend.columns = topic_names
perc_trend = perc_trend.sort_values(by = "Region")

In [4]:
# Remove % sign from data
# Each column is rewritten as a whole with the vectorised string accessor instead of
# assigning element by element through chained indexing (perc_trend[col][i] = ...):
#   * chained assignment may write to a temporary copy and leave perc_trend untouched,
#   * the integer i was a positional lookup on a label-based index,
#   * the hard-coded range(0, 51) tied the cell to one exact row count.
# rstrip("%") only removes trailing percent signs, so re-running the cell is harmless
# (slicing off the last character would eat a digit on the second run).
for col in perc_trend.columns:
    perc_trend[col] = perc_trend[col].astype(str).str.rstrip("%")

In [5]:
# Add a column holding, for each voting-record column other than "year", the sum of
# its entries (the number of elections voted Democrat).
# NOTE(review): the totals are assigned by position, so this assumes the voting-record
# columns are in the same order as perc_trend's rows -- confirm.
state_votes = vote_by_year.drop(columns = "year")
perc_trend["votes_dem"] = [state_votes[state].sum() for state in state_votes.columns]

In [6]:
# Label a row 1 when its votes_dem count is at least 4, otherwise 0
votes_dem_threshold_met = perc_trend["votes_dem"] >= 4
perc_trend["label"] = votes_dem_threshold_met.astype(int)

In [7]:
# Ensure that data are integers
# Casts every column of perc_trend to int: the topic columns (stored as text after the
# "%" removal), the votes_dem totals and the 0/1 label.
perc_trend = perc_trend.astype(int)

## Model Building and Testing

In [8]:
# Build the feature matrix and target from the relative popularity data
cols = ["economy", "abortion", "immigration", "climate_change", "health_care"]

X, y = perc_trend[cols], perc_trend["label"]

# Hold out a test split; the fixed random_state keeps the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1776)

In [9]:
# Configure a basic RandomForestClassifier (300 trees, depth capped at 2) and fit it
# on the training split
rf_params = {
    "n_estimators": 300,
    "max_depth": 2,
    "min_samples_leaf": 1,
    "min_samples_split": 2,
    "random_state": 1776,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=1776,
                       verbose=0, warm_start=False)

In [10]:
# Predict a class label for every row of X (rows used for training and held-out rows
# alike). The result is kept in its own variable; no column is added to perc_trend.
rf_pred = rf.predict(X)

### Review Model Predictions

In [11]:
# Pull the 2016 election row out of the voting record to use as a metric for
# identifying the better model
past_vote = pd.read_csv("datasets/historic_voting.csv", index_col = "Unnamed: 0")
is_2016 = past_vote["year"] == 2016
votes_2016 = np.array(past_vote.loc[is_2016].drop(columns = "year"))

# The first matching row supplies one value per row of perc_trend
perc_trend["2016"] = votes_2016[0]

In [12]:
# Drop Testing columns
# The frame is rebound rather than mutated with inplace=True, and errors="ignore"
# skips columns that are already gone, so re-running this cell no longer raises a
# KeyError.
perc_trend = perc_trend.drop(columns = ["votes_dem", "label", "2016"], errors = "ignore")

In [13]:
# Collect the Random Forest's class probabilities for every row of X in a DataFrame
# that shares perc_trend's index.
# NOTE(review): the "rep"/"dem" column names assume the model orders class 0 before
# class 1 -- confirm against rf.classes_.
class_probs = rf.predict_proba(X)
proba_party = pd.DataFrame(class_probs, index = perc_trend.index, columns = ["rep", "dem"])

In [14]:
# Display the rep/dem probabilities computed for each row of the training-era data
proba_party

Unnamed: 0_level_0,rep,dem
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,0.941556,0.058444
Alaska,0.595235,0.404765
Arizona,0.499286,0.500714
Arkansas,0.85317,0.14683
California,0.244105,0.755895
Colorado,0.207777,0.792223
Connecticut,0.134838,0.865162
Delaware,0.587419,0.412581
District of Columbia,0.064607,0.935393
Florida,0.666237,0.333763


## Calculate Probabilities for Current Year

In [15]:
# Read the eleven Google Trends exports ("1x2" through "11x12") and tag every frame
# with the first number of its file name as the month
trends_dict = {}
for i in range(1, 12):
    j = i + 1
    month_df = pd.read_csv(f"datasets/google_trends/{i}x{j}_trends.csv")
    month_df["month"] = i
    trends_dict[f"{i}x{j}_trends"] = month_df

In [16]:
# Concatenate the dataframes, walking from the "11x12" frame down to the "1x2" frame
dict_list = [trends_dict[f"{start}x{start + 1}_trends"] for start in range(11, 0, -1)]

past_trends = pd.concat(dict_list)

In [17]:
# Create a weight column for each corresponding month: higher-numbered months receive
# larger weights, and the eleven weights total 1.0
month_weights = {
    11: .20,
    10: .15,
    9: .15,
    8: .10,
    7: .10,
    6: .05,
    5: .05,
    4: .05,
    3: .05,
    2: .05,
    1: .05,
}
past_trends["weight"] = past_trends["month"].map(month_weights)

In [18]:
# Create weighted columns for each search term: the raw value multiplied by the
# month's weight, stored under "weighted_<term>" with spaces turned into underscores
for term in ["economy", "abortion", "immigration", "climate change", "health care"]:
    weighted_name = "weighted_" + term.replace(" ", "_")
    past_trends[weighted_name] = past_trends[term] * past_trends["weight"]

In [19]:
# Sum every column per geoName, then drop the non-weighted columns from the result so
# that only geoName and the weighted totals are kept
nw_cols = ["economy", "abortion", "immigration", "climate change", "health care", "month", "weight"]

state_totals = past_trends.groupby(["geoName"], as_index=False).agg("sum")
weighted_df = state_totals.drop(columns = nw_cols)

In [20]:
# Display the per-geoName weighted totals for the five search terms
weighted_df

Unnamed: 0,geoName,weighted_economy,weighted_abortion,weighted_immigration,weighted_climate_change,weighted_health_care
0,Alabama,23.9,27.2,13.05,6.8,29.05
1,Alaska,22.35,18.55,14.75,18.05,26.3
2,Arizona,22.15,18.05,18.6,8.25,32.95
3,Arkansas,24.0,22.8,16.1,8.45,28.65
4,California,26.05,15.3,24.65,10.7,23.3
5,Colorado,22.8,16.45,19.45,12.25,29.05
6,Connecticut,22.1,14.4,19.75,11.4,32.35
7,Delaware,23.1,19.05,16.5,9.15,32.2
8,District of Columbia,18.55,13.05,29.0,14.25,25.15
9,Florida,21.1,18.25,21.75,6.8,32.1


In [21]:
# Save the predictions for the past year
# The forest was fitted on columns named as in `cols` (economy, abortion, ...), while
# weighted_df carries the same topics under a "weighted_" prefix. Recent scikit-learn
# releases refuse to predict on a DataFrame whose column names differ from the ones
# seen during fit, so the prefix is stripped and the columns are put in training
# order before predicting. The values themselves are unchanged.
current_features = weighted_df.drop("geoName", axis = 1)
current_features.columns = [name.replace("weighted_", "", 1) for name in current_features.columns]
current_features = current_features[cols]

weighted_preds = pd.DataFrame(rf.predict_proba(current_features),
                     columns = ["rep", "dem"],
                     index = weighted_df["geoName"])

In [22]:
# Export predictions
# Writes the rep/dem probabilities, indexed by geoName, to a CSV in the datasets folder
weighted_preds.to_csv("datasets/current_weighted_preds.csv")