In [135]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

First, we are going to start working on modelling the states without the GDP index. We will try to keep the approach as modular as possible, in case we need to change the selected dataframe.

In [11]:
# Element-wise absolute value of the CSV contents, multiplied by 10.
correlation_matrix = pd.read_csv('Correlation_matrix.csv').abs().mul(10)

We always use the absolute value of the correlation between states, scaled by 10. First, let us focus on swing states. We need to select the swing states based on sources **prior to the 2020 election**. Source selected: https://fr.wikipedia.org/wiki/Swing_state#Historique (last modified: 24 October 2020).  

#### Construct the data

One big problem will be the way we construct the validation set. It needs to come from the swing state under consideration. How can we avoid creating a deterministic validation set while imposing such a constraint?

###### Getting a list of states and swing states

In [7]:
swing_states = [
    'Texas', 'Florida', 'Ohio', 'Georgia', 'North Carolina',
    'Arizona', 'Iowa', 'Pennsylvania', 'Michigan', 'Virginia',
    'Minnesota', 'Wisconsin', 'Colorado', 'Nevada', 'New Hampshire',
]
# The text before the first '.' of each file name in states/ is used as the
# state name; the order is whatever os.listdir returns.
all_states = [filename.split('.')[0] for filename in os.listdir('states/')]
# Positions of the swing states inside all_states.
index_swing_states = [
    position for position, name in enumerate(all_states) if name in swing_states
]

##### Construct the base dataframe

In [9]:
# Read every file found under states/ and stack the frames into one
# (row labels of the individual frames are kept, not renumbered).
state_frames = [pd.read_csv('states/' + filename) for filename in os.listdir('states/')]
final_df = pd.concat(state_frames)

In [28]:
# For every swing state, build a frame in which each state's rows are repeated
# as many times as its (scaled, truncated) correlation with that swing state.
# NOTE(review): this assumes the rows/columns of correlation_matrix are in the
# same order as all_states -- confirm against how the CSV was produced.
final_importance_dfs = {}
for index_state in index_swing_states:
    weighted_frames = []
    for i, state in enumerate(all_states):
        data_state = final_df[final_df['State'] == state]
        # int() truncates the scaled correlation to a whole number of copies.
        correlation = int(correlation_matrix.iloc[index_state, i])
        if correlation > 0 and not data_state.empty:
            weighted_frames.extend([data_state] * correlation)
    if weighted_frames:
        # Concatenate once instead of growing a frame inside the loop; this also
        # removes the need for a bare ``except`` to seed the accumulator.
        data_swing_state = pd.concat(weighted_frames)
    else:
        # No state selected: keep a DataFrame (empty, same columns) rather than a list.
        data_swing_state = final_df.iloc[0:0]
    final_importance_dfs[all_states[index_state]] = data_swing_state
        

In [35]:
# Inspect the weighted frame built for Texas.
final_importance_dfs['Texas']

Unnamed: 0,month_10,month_11,month_9,republican,Year,Rep_House_Prop,State,Result,rep_loyalty,popular_vote_percentage,density,RDI
0,56.380492,56.679982,53.953977,1,1988,0.285714,Alabama,1,0.8,53.370,76.757225,52087.05
1,33.295363,36.285676,31.824211,0,1988,0.714286,Alabama,0,0.8,45.650,76.757225,52087.05
2,42.974670,38.698232,43.976458,0,1992,0.714286,Alabama,0,0.9,43.010,78.959026,56035.35
3,42.175509,41.956795,44.865857,1,1992,0.285714,Alabama,1,0.9,37.450,78.959026,56035.35
4,40.765363,40.999200,40.518828,0,1996,0.571429,Alabama,0,1.0,49.230,81.841997,60091.59
...,...,...,...,...,...,...,...,...,...,...,...,...
9,58.828129,60.276110,60.148473,1,2008,1.000000,Wyoming,1,1.0,45.660,5.582234,9154.98
10,57.320477,60.283224,57.678128,1,2016,1.000000,Wyoming,1,1.0,46.090,5.985616,10109.96
11,21.309231,22.211773,20.764698,0,2016,0.000000,Wyoming,0,1.0,48.180,5.985616,10109.96
12,66.209900,62.336910,66.226278,1,2020,1.000000,Wyoming,1,1.0,46.525,5.796735,10593.19


##### Modelling part : swing state

Enter the swing state you wish to study

In [30]:
# Name of the swing state to study; must match a value of the 'State' column.
swing_state = 'Texas'

In [58]:
def split(final_importance_df, swing_state, random_state=None):
    """Split a weighted frame into training, validation and 2020 test sets.

    The test set is the first two 2020 rows of ``swing_state``. The remaining
    (non-2020) rows of ``swing_state`` are split 70/30: the 30% part becomes the
    validation set, the 70% part is added to the rows of every other state to
    form the training set, which is then shuffled.

    Parameters
    ----------
    final_importance_df : pandas.DataFrame
        Frame with at least the columns 'State', 'Year' and 'Result'.
    swing_state : str
        Value of the 'State' column identifying the state to validate/test on.
    random_state : int, numpy.random.RandomState or None, default None
        Seed forwarded to both the train/validation split and the shuffle.
        ``None`` keeps the previous, non-reproducible behaviour.

    Returns
    -------
    X, X_val, X_test, y, y_val, y_test
        Feature frames (all columns except 'Result') and 'Result' series for the
        training, validation and test sets.

    Notes
    -----
    NOTE(review): rows of the other states are not filtered by year, so their
    2020 rows are part of the training set. Also, if ``final_importance_df``
    holds repeated copies of the swing state's rows, copies of the same row can
    end up in both the training and validation sets. Confirm both are intended.
    """
    is_swing = final_importance_df['State'] == swing_state
    swing_state_data = final_importance_df[is_swing]

    # Hold out the 2020 rows of the swing state as the test set.
    data_test = swing_state_data[swing_state_data['Year'] == 2020].iloc[:2, :]
    X_test, y_test = data_test.drop('Result', axis=1), data_test['Result']

    # Random 70/30 split of the swing state's remaining rows.
    swing_history = swing_state_data[swing_state_data['Year'] != 2020]
    swing_train, swing_val = train_test_split(
        swing_history, train_size=0.7, random_state=random_state
    )
    X_val, y_val = swing_val.drop('Result', axis=1), swing_val['Result']

    # Training set: every other state plus the swing state's training part, shuffled.
    X_values = pd.concat((final_importance_df[~is_swing], swing_train))
    X_values = X_values.sample(frac=1, replace=False, random_state=random_state)
    X, y = X_values.drop('Result', axis=1), X_values['Result']
    return X, X_val, X_test, y, y_val, y_test

In [59]:
# Split the selected swing state's weighted frame into train / validation / test parts.
X_train, X_val, X_test, y_train, y_val, y_test = split(final_importance_dfs[swing_state], swing_state)

In [128]:
def _ranking_scores(model, X_val, predictions):
    """Return continuous validation scores for ROC AUC, or the hard labels as a fallback."""
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(X_val)
        if probabilities.shape[1] == 2:
            # Column 1 is the probability of model.classes_[1], the greater label.
            return probabilities[:, 1]
    if hasattr(model, 'decision_function'):
        scores = model.decision_function(X_val)
        if np.ndim(scores) == 1:
            return scores
    return predictions


def modelling(X_train, X_val, y_train, y_val, models, scaler=True, Poly=1, PCA_comp=0):
    """Preprocess the features, fit every model and score it on the validation set.

    Preprocessing steps, each fitted on the training data only and then applied
    to the validation data, in this order: drop the 'State' and 'Year' columns,
    min-max scaling, PCA truncation, polynomial feature expansion.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Training and validation features.
    y_train, y_val : array-like
        Training and validation labels (f1_score is called with its default
        settings, so presumably binary labels -- confirm).
    models : iterable of estimators
        Objects exposing ``fit`` and ``predict``; they are fitted in place.
    scaler : bool, default True
        Whether to apply ``MinMaxScaler``.
    Poly : int, default 1
        Degree of the polynomial expansion; values <= 1 disable it.
    PCA_comp : float, default 0
        Required cumulative explained-variance ratio. Leading components are
        kept up to and including the first one whose cumulative ratio exceeds
        this value (all components if none does). 0 disables PCA.

    Returns
    -------
    models, f1s, accuracies, auc, predictions
        The fitted models, one F1 / accuracy / ROC AUC value per model, and the
        validation predictions of the last model (``None`` if ``models`` is empty).

    Raises
    ------
    ValueError
        If PCA is requested without scaling.
    """
    if PCA_comp > 0 and not scaler:
        raise ValueError('We must normalize before performing PCA')

    # Identifier columns are not features; drop them from both sets when present.
    X_train = X_train.drop(columns=['State', 'Year'], errors='ignore')
    X_val = X_val.drop(columns=['State', 'Year'], errors='ignore')

    if scaler:
        fitted_scaler = MinMaxScaler().fit(X_train)
        X_train = fitted_scaler.transform(X_train)
        X_val = fitted_scaler.transform(X_val)

    if PCA_comp > 0:
        pca = PCA(n_components=X_train.shape[1])
        pca.fit(X_train)
        X_train = pca.transform(X_train)
        X_val = pca.transform(X_val)
        cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
        # Index of the first component pushing the cumulative ratio above the threshold.
        above_threshold = np.nonzero(cumulative_variance > PCA_comp)[0]
        if above_threshold.size:
            n_keep = above_threshold[0] + 1
        else:
            n_keep = cumulative_variance.size
        X_train, X_val = X_train[:, :n_keep], X_val[:, :n_keep]

    if Poly > 1:
        poly_features = PolynomialFeatures(degree=Poly, include_bias=False).fit(X_train)
        X_train = poly_features.transform(X_train)
        X_val = poly_features.transform(X_val)

    f1s = []
    accuracies = []
    auc = []
    # Defined up front so an empty ``models`` does not raise on return.
    predictions = None
    for model in models:
        model.fit(X_train, y_train)
        predictions = model.predict(X_val)
        accuracies.append(accuracy_score(y_val, predictions))
        f1s.append(f1_score(y_val, predictions))
        # ROC AUC is a ranking metric: score it on continuous outputs when the
        # model provides them, not on the thresholded predictions.
        auc.append(roc_auc_score(y_val, _ranking_scores(model, X_val, predictions)))
    return models, f1s, accuracies, auc, predictions

In [129]:
# Single run: logistic regression with scaling, PCA (0.9 variance threshold) and degree-2 features.
models, f1s, accuracies, auc, predictions = modelling(X_train, X_val, y_train, y_val, models=[LogisticRegression()], scaler=True, Poly=2, PCA_comp=0.9)

In [136]:
# Evaluate every candidate model on the validation split of each swing state.
models_considered = [LogisticRegression(), DecisionTreeClassifier()]
F1, acc, auc = [], [], []
for state in swing_states:
    X_train, X_val, X_test, y_train, y_val, y_test = split(final_importance_dfs[state], state)
    models, f1s, accuracies, aucs, predictions = modelling(
        X_train, X_val, y_train, y_val,
        models=models_considered, scaler=True, Poly=2, PCA_comp=0.9,
    )
    F1.append(f1s)
    acc.append(accuracies)
    auc.append(aucs)

# Each list holds one row per state and one column per model; averaging over
# axis 0 gives one mean per model. F1 (all states) is averaged here, not f1s,
# which only holds the scores of the last state processed.
F1_mean = np.mean(np.array(F1), axis=0)
acc_mean = np.mean(np.array(acc), axis=0)
auc_mean = np.mean(np.array(auc), axis=0)
    