In [8]:
# go up a level to import mc_processor and analyses
import sys
sys.path.append('..')
import mc_processor
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
import statsmodels.formula.api as smf


%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [5]:
# Load the survey results and both codebooks from the raw/ directory.

# Sub-directory of raw/ holding the results set to analyse. The alternatives
# are kept commented out so the data set can be switched by hand.
# clarifier = "field1/"
# clarifier = "field2/rvoter/"
clarifier = "stacked/"

codebook = pd.read_csv('raw/levels_codebook.csv', encoding='utf-8')
data = pd.read_csv(f'raw/{clarifier}data.csv', encoding='utf-8')

# Question codebook, indexed by qid and reduced to a plain dict of the
# 'qidFull' column ({qid: qidFull}). Because of .get(), q_codebook is None
# rather than an error if the file has no 'qidFull' column.
q_codebook = (
    pd.read_csv('raw/question_codebook.csv', encoding='utf-8', index_col='qid')
    .to_dict()
    .get('qidFull')
)

In [6]:
# Print the codebook label behind each response code, one question at a time.
# Each entry pairs a question id with the codes to look up, in display order.
# BPC20a is looked up with reversed codes (5 - i for i = 1..4), so the printed
# number i carries the label of original code 5 - i; this lines up with the
# `5 - Confidence` recoding applied in the confidence analysis.
label_lookups = (
    ('BPCxdem2', [1, 2]),
    ('BPC20a', [4, 3, 2, 1]),
)

for position, (q, codes) in enumerate(label_lookups):
    if position > 0:
        # blank-ish separator line between questions
        print(' ')
    print(q)
    for i, code in enumerate(codes, start=1):
        print(f'{i}: {mc_processor.get_name_from_codebook(codebook, q, code)}')

## Confidence analysis (using MNlogit)
#
# Reshapes the four confidence questions (BPC20-BPC23) into long format and
# fits a multinomial logit of the recoded confidence level on gender identity,
# geographical level, and their interaction.
#
# NOTE: this fit is UNWEIGHTED. sm.MNLogit has no weights parameter; the
# `weights=data_long['wts']` keyword that used to be passed here was never
# used in the likelihood, so the estimates were already unweighted. The
# keyword is dropped to stop the code implying otherwise. The `wts` column is
# still carried in data_long for analyses that do support weights.

# Question id -> human-readable label of the geographical level it asks about
confidence_keys = {'BPC20':"Your vote", 'BPC21':"Votes in your city or county", 'BPC22':"Votes in your state", 'BPC23':"Votes across the country"}


# Melting the data to get long format: one row per (respondent row, question),
# keeping the group indicator and the weight alongside each answer
data_long = pd.melt(data,
                    id_vars=['BPCxdem2', 'wts'],
                    value_vars=['BPC20', 'BPC21', 'BPC22', 'BPC23'],
                    var_name='Geographical_Level',
                    value_name='Confidence')

# Map BPCxdem2 to meaningful labels (codes other than 1/2 become NaN)
data_long['BPCxdem2'] = data_long['BPCxdem2'].map({1: 'Transgender', 2: 'Cisgender'}).astype('category')

# Map the geographical levels to their labels
data_long['Geographical_Level'] = data_long['Geographical_Level'].map(confidence_keys).astype('category')


# Exclude "Don't know" responses. The explicit .copy() makes the filtered
# frame independent, so the column assignments below do not write into a
# slice (which can trigger pandas' SettingWithCopyWarning).
data_long = data_long[data_long['Confidence'] != 5].copy()  # Assuming 5 represents "Don't know"

# Reclassify the confidence levels so that higher values indicate higher confidence
data_long['Confidence'] = 5 - data_long['Confidence']


# Treat Confidence as a categorical outcome. The ordered flag is kept for
# downstream use, but MNLogit itself models the categories as unordered.
data_long['Confidence'] = pd.Categorical(data_long['Confidence'], ordered=True)


# Create design matrices for the multinomial logit: patsy expands the
# categorical outcome into one indicator column per confidence level
y, X = dmatrices('Confidence ~ BPCxdem2 * Geographical_Level', data_long, return_type='dataframe')

# Build the (unweighted) multinomial logit model
model = sm.MNLogit(y, X)

# Fit the model
result = model.fit()

# Print the summary of the model
print(result.summary())



BPCxdem2
1: Identify Trans/Genderqueer/Non-Conforming - Yes
2: Identify Trans/Genderqueer/Non-Conforming - No
 
BPC20a
1: Not confident at all
2: Not to confident
3: Somewhat confident
4: Very confident
Optimization terminated successfully.
         Current function value: 1.157796
         Iterations 6




                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                13334
Model:                        MNLogit   Df Residuals:                    13310
Method:                           MLE   Df Model:                           21
Date:                Wed, 30 Oct 2024   Pseudo R-squ.:                0.003400
Time:                        13:03:56   Log-Likelihood:                -15438.
converged:                       True   LL-Null:                       -15491.
Covariance Type:            nonrobust   LLR p-value:                 3.249e-13
                                                           y=Confidence[2]       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------------------------------------
Intercept                                                                      0.5

In [9]:

# Linear-model version of the confidence analysis: a Gaussian GLM of numeric
# confidence on gender identity, geographical level, and their interaction,
# with the `wts` column supplied as frequency weights.
#
# NOTE(review): `wts` looks like a survey weight rather than an integer count
# (the captured summary reports a non-integer Df Residuals that exceeds the
# number of observations). freq_weights treats each weight as a number of
# replicated observations, which affects the standard errors - confirm that
# this is the intended weighting.

# The outcome must be numeric here; anything non-convertible becomes NaN
data_long['Confidence'] = pd.to_numeric(data_long['Confidence'], errors='coerce')

# Both predictors are treated as categorical factors
for factor in ('BPCxdem2', 'Geographical_Level'):
    data_long[factor] = data_long[factor].astype('category')

# Main effects plus the BPCxdem2 x Geographical_Level interaction
formula = 'Confidence ~ BPCxdem2 * Geographical_Level'

# Gaussian family, weighted by wts
model = smf.glm(
    formula=formula,
    data=data_long,
    family=sm.families.Gaussian(),
    freq_weights=data_long['wts'],
)

# Fit and report
result = model.fit()
print(result.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:             Confidence   No. Observations:                13334
Model:                            GLM   Df Residuals:                 13586.57
Model Family:                Gaussian   Df Model:                            7
Link Function:               Identity   Scale:                         0.81488
Method:                          IRLS   Log-Likelihood:                -17894.
Date:                Wed, 30 Oct 2024   Deviance:                       11071.
Time:                        13:04:46   Pearson chi2:                 1.11e+04
No. Iterations:                     3   Pseudo R-squ. (CS):            0.01097
Covariance Type:            nonrobust                                         
                                                                                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------