In [28]:
#Basic Imports
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import  RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree

import pandas as pd
import matplotlib.pyplot as plot
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
# Read the 2012, 2016 and 2020 SETUPS Stata files, in that order, into one
# DataFrame per survey year.
df2012, df2016, df2020 = (
    pd.read_stata(dta_path)
    for dta_path in (
        'data/SETUPS2012/SETUPS2012.dta',
        'data/SETUPS2016/SETUPS2016.dta',
        'data/SETUPS2020/SETUPS2020.dta',
    )
)

In [30]:
# Display the (rows, columns) shape of each survey-year frame as loaded.
df2012.shape, df2016.shape, df2020.shape

((5914, 195), (3649, 204), (7453, 257))

In [31]:
# Remove the CASEID column and the weight column from every frame (the 2012
# file names its weight column WEIGHT_FULL, the other two use WEIGHT).
# The result of drop() is rebound instead of using inplace=True, and
# errors='ignore' makes a second execution of this cell a no-op rather than
# a KeyError on the already-removed columns.
df2012 = df2012.drop(columns=['CASEID', 'WEIGHT_FULL'], errors='ignore')
df2016 = df2016.drop(columns=['CASEID', 'WEIGHT'], errors='ignore')
df2020 = df2020.drop(columns=['CASEID', 'WEIGHT'], errors='ignore')

In [32]:
# Keep only 2020 rows whose A01 is '1. Voted' and whose A02 is one of the two
# listed candidates. The trailing .copy() makes the filtered frame an
# independent object, so later column drops on df2020 cannot trigger pandas'
# SettingWithCopyWarning.
df2020 = df2020.loc[
    (df2020['A01'] == '1. Voted')
    & df2020['A02'].isin(['1. Joe Biden', '2. Donald Trump'])
].copy()

In [33]:
# Target: the A02 column (the candidate the respondent chose).
y = df2020['A02']
# Remove the target from the feature frame. drop() is rebound rather than run
# with inplace=True so the frame produced by the row filter is not mutated in
# place. df2020 itself must lose A02 because the column dictionary built from
# df2020 later is used to index X.
df2020 = df2020.drop(columns=['A02'])
# X is the same object as df2020 (no copy is made here).
X = df2020

In [34]:
# Display the shapes again after the column drops, the 2020 row filter and the
# removal of A02 from df2020.
df2012.shape, df2016.shape, df2020.shape

((5914, 193), (3649, 202), (6075, 254))

In [35]:
def get_columns(df):
    """Group the column labels of ``df`` by their leading uppercase ASCII letter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are grouped. Only the labels are read.

    Returns
    -------
    dict[str, list]
        Maps each letter ``'A'``..``'Z'`` that starts at least one column
        label to the list of those labels, in their original column order.
        Keys appear in alphabetical order. Letters with no matching column
        are omitted. Labels that are not strings, are empty, or start with
        any other character belong to no group.
    """
    import string

    # Pre-seeding with A..Z fixes the key order and doubles as the membership
    # test for "starts with an uppercase ASCII letter".
    groups = {letter: [] for letter in string.ascii_uppercase}
    for column in df.columns:
        # Non-string labels (e.g. integers) cannot start with a letter; skip
        # them instead of failing on a missing str method.
        if isinstance(column, str) and column[:1] in groups:
            groups[column[:1]].append(column)
    return {letter: names for letter, names in groups.items() if names}

In [36]:
# Build one letter -> column-names mapping per survey year, in year order.
_2012_dictionary, _2016_dictionary, _2020_dictionary = map(
    get_columns, (df2012, df2016, df2020)
)

In [10]:
# Survey_Subset = input("What section of the survey would you like to analyze? ")
# The interactive prompt above is disabled; the section letter is fixed to 'M'.
# It is used below as a column-name prefix and as a key into _2020_dictionary.
Survey_Subset = 'M'

In [11]:
# All rows of df2020, restricted to the columns whose name starts with the
# chosen section letter.
df2020_Subset = df2020.loc[:,df2020.columns.str.startswith(Survey_Subset)]

In [12]:
# Plain assignment: both names refer to the same DataFrame object (no copy).
# The category-code conversion that would have produced a distinct frame is
# commented out in the next cell.
df2020_Subset_code = df2020_Subset

In [13]:
# df2020_Subset_code = df2020_Subset_code[['A01',
#   'A02',
#   'A03',
#   'A04',
#   'A05',
#   'A06',
#   'A07',
#   'A08',
#   'A09',
#   'A10',
#   'A11',
#   'A12',
#   'A13',
#   'A14',
#   'A15',
#   'A16',
#   'A17',
#   'A18']].apply(lambda x: x.astype('category').cat.codes)

In [14]:
# df2020_Subset["A08"].cat.codes.value_counts()

In [15]:
# df2020_Subset["A06"].cat.categories


In [16]:
# Columns of the selected survey section are all treated as categorical;
# no numerical columns are used (the list is intentionally empty).
categorical_columns = _2020_dictionary[Survey_Subset]
numerical_columns = []

In [17]:
# categorical_columns = list(df2020.columns)
# numerical_columns = []

In [18]:
# Select the section's columns from df2020 rather than from X itself. The
# cell can then be re-run, or run after changing Survey_Subset, without
# depending on how a previous execution already narrowed X. df2020 no longer
# contains the target column A02 and is row-aligned with y.
X = df2020[categorical_columns + numerical_columns]
# Hold-out split, stratified on the target and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [19]:
# ColumnTransformer, SimpleImputer, Pipeline, OneHotEncoder and
# RandomForestClassifier are imported in the notebook's top import cell.

# Categorical columns are one-hot encoded; handle_unknown='ignore' lets the
# encoder transform rows containing categories it did not see during fit.
categorical_processing = OneHotEncoder(handle_unknown='ignore')
# Numerical columns (currently an empty list) would be mean-imputed.
numerical_pipe = SimpleImputer(strategy="mean")

preprocessing = ColumnTransformer(
    [
        ("cat", categorical_processing, categorical_columns),
        ("num", numerical_pipe, numerical_columns),
    ],
    verbose_feature_names_out=False,
)

# Preprocessing and model live in one pipeline so the encoder is always fitted
# on the same rows as the classifier. The step name "classifier" is what the
# `classifier__...` keys of the parameter grid refer to.
pipe = Pipeline(
    [
        ("preprocess", preprocessing),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)
pipe.fit(X_train, y_train)

In [20]:
# param_grid = {'classifier__n_estimators':np.arange(50,200,15),
#               'classifier__max_features':np.arange(0.1, 1, 0.1),
#               'classifier__max_depth': [3, 5, 7, 9],
#               'classifier__max_samples': [0.3, 0.5, 0.8]}

In [21]:
# Reduced search grid: 3 values of n_estimators x 3 values of max_depth, with
# max_features and max_samples each fixed to a single value (9 candidates).
# Keys carry the `classifier__` prefix, matching the "classifier" step of pipe.
param_grid = {'classifier__n_estimators': [50, 100, 150],
              'classifier__max_features':[0.4],
              'classifier__max_depth': [7, 9, 11],
              'classifier__max_samples': [0.5,]}

In [22]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold CV.
# n_jobs=4 evaluates candidate/fold fits in parallel (the same worker count
# the cross_val_score calls in this notebook use). The forest is seeded, so
# the selected parameters and scores do not depend on the worker count.
gridsearch = GridSearchCV(estimator=pipe,
                          param_grid=param_grid,
                          scoring='accuracy',
                          cv=5,
                          n_jobs=4)

In [26]:
# Run the grid search on the training split only; X_test / y_test are not
# passed in here.
gridsearch.fit(X_train, y_train)



KeyboardInterrupt: 

In [None]:
# Display a tuple: the search's score on the held-out test split (the search
# was configured with scoring='accuracy') and the best parameter combination.
# Requires gridsearch.fit(...) above to have run to completion.
gridsearch.score(X_test, y_test), gridsearch.best_params_

In [None]:
# feature_names = pipe[:-1].get_feature_names_out()

# mdi_importances = pd.Series(
#     pipe[-1].feature_importances_, index=feature_names
# ).sort_values(ascending=True)

In [None]:
# mdi_importances.max()

In [None]:
# ax = mdi_importances[:10].plot.barh()
# ax.set_title("Random Forest Feature Importances (MDI)")
# ax.figure.tight_layout()

# Decision Tree

In [38]:
# Module-level encoder reused by the `OHE.fit_transform(X)` cells further
# down; analyze_each_catagory creates its own local encoder instead.
OHE = OneHotEncoder(handle_unknown='ignore')

In [39]:
# Cross-validate one decision tree per survey section (one entry per key of
# _2020_dictionary) and display the resulting scores.
# NOTE(review): analyze_each_catagory is defined in a cell that appears
# *below* this one, so a top-to-bottom run reaches this call before the
# definition. Move the definition cell above this cell.
analyze_each_catagory(X, y, _2020_dictionary)

A
B
C
D
E
F
G
H
J
K
L
M
N
P
Q
R


{'A': array([0.95967078, 0.94403292, 0.94979424, 0.9308642 , 0.9399177 ]),
 'B': array([0.59670782, 0.56460905, 0.57860082, 0.60493827, 0.59506173]),
 'C': array([0.83045267, 0.80823045, 0.81563786, 0.79259259, 0.82880658]),
 'D': array([0.98271605, 0.96213992, 0.97037037, 0.96296296, 0.97037037]),
 'E': array([0.96954733, 0.94897119, 0.95884774, 0.93744856, 0.95802469]),
 'F': array([0.92016461, 0.91687243, 0.91193416, 0.91028807, 0.91358025]),
 'G': array([0.81646091, 0.80823045, 0.82880658, 0.81646091, 0.80411523]),
 'H': array([0.94403292, 0.92510288, 0.93497942, 0.93168724, 0.94814815]),
 'J': array([0.89876543, 0.88477366, 0.89218107, 0.88065844, 0.89300412]),
 'K': array([0.93168724, 0.90864198, 0.91358025, 0.91193416, 0.91851852]),
 'L': array([0.78930041, 0.77613169, 0.75555556, 0.7654321 , 0.76872428]),
 'M': array([0.89135802, 0.87983539, 0.89135802, 0.87572016, 0.87407407]),
 'N': array([0.85596708, 0.83621399, 0.85925926, 0.84115226, 0.85596708]),
 'P': array([0.93415638, 

In [37]:
def analyze_each_catagory(data, target, dictionary, max_depth=5, cv=5, n_jobs=4, random_state=42):
    """Cross-validate a decision tree separately on each group of columns.

    For every key of ``dictionary`` the listed columns are selected from
    ``data``, one-hot encoded, and a depth-limited ``DecisionTreeClassifier``
    is evaluated on them with ``cross_val_score``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing every column named in ``dictionary``.
    target : array-like
        Class labels, row-aligned with ``data``.
    dictionary : dict
        Maps a group key to the list of column names forming that group
        (for example the output of ``get_columns``).
    max_depth : int, default 5
        Maximum depth of each tree.
    cv : int, default 5
        Number of cross-validation folds.
    n_jobs : int, default 4
        Number of parallel workers used by ``cross_val_score``.
    random_state : int or None, default 42
        Seed for the tree so that ties between equally good splits are
        broken the same way on every run.

    Returns
    -------
    dict
        Maps each key of ``dictionary`` to the array of per-fold scores
        returned by ``cross_val_score`` (the classifier's default score).

    Raises
    ------
    KeyError
        If a listed column is missing from ``data``.
    """
    scores_dict = {}
    for key, categorical_columns in dictionary.items():
        data_subset = data[list(categorical_columns)]
        # NOTE: the encoder is fitted on all rows before cross-validation, so
        # the category vocabulary of each fold's held-out rows is known in
        # advance; only the set of columns is affected, not their values.
        data_subset_encoded = OneHotEncoder(handle_unknown='ignore').fit_transform(data_subset)
        tree_clf = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
        # No separate train/test split or fit is needed here: cross_val_score
        # fits a fresh clone of the estimator on every fold.
        scores_dict[key] = cross_val_score(
            estimator=tree_clf, X=data_subset_encoded, y=target, cv=cv, n_jobs=n_jobs
        )
    return scores_dict

In [None]:
# NOTE(review): in the visible notebook `dictionary` and `key` are only bound
# inside analyze_each_catagory (as its parameter and loop variable), so this
# cell raises NameError at top level unless they are defined in a cell not
# shown. Presumably it was meant to read `_2020_dictionary[Survey_Subset]`
# like the earlier cell -- TODO confirm or delete this cell.
categorical_columns = dictionary[key]
numerical_columns = []

In [None]:
# Overwrites X with a column subset of itself, so the result depends on what
# X held before this cell ran and the cell is not safe to re-run after the
# column lists change.
X = X[categorical_columns + numerical_columns]

In [None]:
# NOTE(review): this cell looks like the body of analyze_each_catagory pasted
# at top level and only partly adapted -- TODO confirm or delete.
# Stratified, seeded hold-out split of the un-encoded X.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
# Encoder is fitted on all of X, not only on X_train.
X_encoded = OHE.fit_transform(X)
tree_clf = DecisionTreeClassifier(max_depth=5) 
# NOTE(review): data_train / target_train are first assigned at top level in
# a later cell; the split made above is named X_train / y_train and is not
# used by this fit.
tree_clf.fit(data_train, target_train)
scores = cross_val_score(estimator=tree_clf, X=X_encoded, y=y, cv=5, n_jobs=4)
# NOTE(review): scores_dict and key are not defined at top level in the
# visible notebook; they are locals of analyze_each_catagory.
scores_dict[key] = scores

In [81]:
# One-hot encode whatever X currently holds, fitting the shared OHE encoder
# on all rows (this happens before the train/test split in the next cell).
X_encoded = OHE.fit_transform(X)

In [82]:
# Seeded hold-out split of the encoded matrix.
# NOTE(review): unlike the other top-level train_test_split calls in this
# notebook, this one does not pass stratify=y -- confirm that is intended.
data_train, data_test, target_train, target_test = train_test_split(X_encoded, y, random_state=42)

In [83]:
# Depth-limited decision tree fitted on the training part of the encoded data.
# random_state=42 (the seed used for the splits and the random forest in this
# notebook) makes the fitted tree reproducible when several candidate splits
# are equally good.
tree_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
tree_clf.fit(data_train, target_train)

In [84]:
# Predict on the held-out rows and print the per-class precision / recall /
# F1 report against the true held-out labels.
pred = tree_clf.predict(data_test)
print(classification_report(target_test, pred))

                 precision    recall  f1-score   support

   1. Joe Biden       0.97      0.98      0.97       888
2. Donald Trump       0.97      0.96      0.96       631

       accuracy                           0.97      1519
      macro avg       0.97      0.97      0.97      1519
   weighted avg       0.97      0.97      0.97      1519



In [85]:
# 5-fold cross-validation of the tree over the full encoded matrix and full
# target (this includes the rows held out above as data_test), on 4 workers.
scores = cross_val_score(estimator=tree_clf, X=X_encoded, y=y, cv=5, n_jobs=4)

In [86]:
# Display the per-fold cross-validation scores.
scores

array([0.97530864, 0.96790123, 0.97201646, 0.95884774, 0.97201646])