# Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# Calculate Weighted Google Trends for 2016

In [2]:
# Read the eleven Google Trends exports, named "<m>x<m + 1>_trends.csv" for
# m = 1..11, and tag every row of each frame with its starting month number m
trends_dict = {}
for month in range(1, 12):
    key = f"{month}x{month + 1}_trends"
    month_df = pd.read_csv(f"datasets/google_trends_2016/{key}.csv")
    month_df["month"] = month
    trends_dict[key] = month_df

In [3]:
# Stack the monthly frames into one DataFrame, ordered from month 11 down to
# month 1 (the same order the frames are looked up by key below)
monthly_frames = [
    trends_dict[f"{month}x{month + 1}_trends"] for month in range(11, 0, -1)
]

past_trends = pd.concat(monthly_frames)

In [4]:
# Weight applied to each month number; later months weigh more and the
# eleven weights add up to 1
month_weights = {
    11: .20,
    10: .15,
    9: .15,
    8: .10,
    7: .10,
    6: .05,
    5: .05,
    4: .05,
    3: .05,
    2: .05,
    1: .05,
}

# Look up every row's weight from its month number
past_trends["weight"] = past_trends["month"].map(month_weights)

In [5]:
# Create a weighted column for each search term: the term's value multiplied
# by the row's month weight, stored as "weighted_<term>" with spaces replaced
# by underscores
search_terms = ["economy", "abortion", "immigration", "climate change", "health care"]

for term in search_terms:
    weighted_name = "weighted_" + term.replace(" ", "_")
    past_trends[weighted_name] = past_trends[term] * past_trends["weight"]

In [6]:
# Columns that are not weighted search-term values; they are removed once the
# per-state totals have been computed
nw_cols = ["economy", "abortion", "immigration", "climate change",
           "health care", "month", "weight"]

# Sum every column within each geoName group, then drop the non-weighted ones
state_totals = past_trends.groupby(["geoName"], as_index=False).agg("sum")
weighted_df = state_totals.drop(columns=nw_cols)

In [7]:
# Index the table by state; drop=False keeps the "geoName" column in place
# alongside the new index
weighted_df = weighted_df.set_index("geoName", drop=False)

In [8]:
# Load the historic voting records, using the CSV's "Unnamed: 0" column as
# the row index
vote_by_year = pd.read_csv(
    filepath_or_buffer="datasets/historic_voting.csv",
    index_col="Unnamed: 0",
)

In [9]:
# Add a column for number of elections voted Democrat in the last 9 elections.
# Every column of vote_by_year except "year" is summed to one count per column.
dem_votes = vote_by_year.drop(columns="year").sum()

if dem_votes.index.is_unique and set(dem_votes.index) == set(weighted_df.index):
    # The vote columns carry exactly the same labels as the table's index, so
    # attach the counts by label. A purely positional assignment would silently
    # give states the wrong counts if the two orderings ever differed.
    weighted_df["votes_dem"] = dem_votes.reindex(weighted_df.index)
else:
    # The labels do not match the index (e.g. a different naming scheme), so
    # keep the positional pairing, but fail loudly when the sizes cannot line
    # up instead of relying on an opaque assignment error.
    # NOTE(review): this branch assumes the vote columns are in the same order
    # as the rows of weighted_df -- confirm against the CSV.
    if len(dem_votes) != len(weighted_df):
        raise ValueError(
            f"vote_by_year has {len(dem_votes)} vote columns but weighted_df "
            f"has {len(weighted_df)} rows; cannot pair them by position"
        )
    weighted_df["votes_dem"] = dem_votes.to_numpy()

In [10]:
# Label a state 1 (likely to vote dem) when its votes_dem count is at least 4,
# and 0 otherwise
weighted_df["label"] = (weighted_df["votes_dem"] >= 4).astype("int64")

In [11]:
# Remove the "geoName" text column (the state is already the index) and cast
# every remaining column to integers
# NOTE(review): astype(int) truncates the fractional part of the weighted sums
# rather than rounding -- confirm truncation is intended
weighted_df = weighted_df.drop("geoName", axis="columns").astype(int)

In [12]:
# Preview the first rows of the prepared table (weighted terms, votes_dem, label)
weighted_df.head()

geoName,weighted_economy,weighted_abortion,weighted_immigration,weighted_climate_change,weighted_health_care,votes_dem,label
Alabama,24,24,16,4,29,0,0
Alaska,26,20,16,9,26,0,0
Arizona,20,18,22,4,34,1,0
Arkansas,25,24,18,4,27,2,0
California,24,16,28,5,24,7,1


## Model Building and Testing

In [13]:
# Feature columns: the five weighted relative-popularity search terms
cols = [
    f"weighted_{topic}"
    for topic in ("economy", "abortion", "immigration", "climate_change", "health_care")
]

# Features and target for the models
X = weighted_df.loc[:, cols]
y = weighted_df.loc[:, "label"]

# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1776)

In [14]:
# Fit a 5-fold cross-validated logistic regression without an intercept term,
# then report its score on the training and testing sets
lr = LogisticRegressionCV(cv=5, fit_intercept=False, random_state=1776)
lr.fit(X_train, y_train)

lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)

print(f"Model Training Score: {lr_train_score}")
print(f"Model Testing Score:{lr_test_score}")

Model Training Score: 0.8157894736842105
Model Testing Score:0.8461538461538461


In [15]:
# Candidate hyperparameters for the LogisticRegressionCV grid search
param_grid = dict(
    max_iter=[100, 200, 300],
    fit_intercept=[True, False],
)

# 5-fold grid search wrapped around a 5-fold LogisticRegressionCV estimator
lr_base = LogisticRegressionCV(cv=5, random_state=1776)
lr_grid = GridSearchCV(estimator=lr_base, param_grid=param_grid, cv=5)

In [16]:
# Run the logistic-regression grid search and keep the best estimator it found
lr_grid.fit(X_train, y_train)
lr_2 = lr_grid.best_estimator_

# Score the selected estimator on both splits
lr_2_train_score = lr_2.score(X_train, y_train)
lr_2_test_score = lr_2.score(X_test, y_test)

print(f"Model Training Score: {lr_2_train_score}")
print(f"Model Testing Score:{lr_2_test_score}")

Model Training Score: 0.8157894736842105
Model Testing Score:0.8461538461538461




In [18]:
# Baseline random forest with 100 trees and a fixed seed, scored on the
# training and testing sets
rf = RandomForestClassifier(n_estimators=100, random_state=1776)
rf.fit(X_train, y_train)

rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)

print(f"Model Training Score: {rf_train_score}")
print(f"Model Testing Score:{rf_test_score}")

Model Training Score: 1.0
Model Testing Score:0.9230769230769231


In [19]:
# Candidate hyperparameters for the RandomForestClassifier grid search
# (this rebinds param_grid, replacing the logistic-regression grid)
param_grid = dict(
    n_estimators=[150, 250, 300, 350],
    max_depth=[2, 3],
    min_samples_split=[2, 3],
    min_samples_leaf=[1, 2],
)

# 5-fold grid search over a seeded random forest
rf_base = RandomForestClassifier(random_state=1776)
rf_grid = GridSearchCV(estimator=rf_base, param_grid=param_grid, cv=5)

In [20]:
# Run the random-forest grid search and keep the best estimator it found
rf_grid.fit(X_train, y_train)
rf_2 = rf_grid.best_estimator_

# Score the selected estimator on both splits
rf_2_train_score = rf_2.score(X_train, y_train)
rf_2_test_score = rf_2.score(X_test, y_test)

print(f"Model Training Score: {rf_2_train_score}")
print(f"Model Testing Score:{rf_2_test_score}")

Model Training Score: 0.8947368421052632
Model Testing Score:0.9230769230769231




In [38]:
# Show the hyperparameter combination selected by the random-forest grid search
rf_grid.best_params_

{'max_depth': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

These hyperparameters will be used for the model of the 2020 election.