In [10]:
import sys
sys.path.append('..')
import mc_processor
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
import statsmodels.formula.api as smf


%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [11]:
# read results files

# clarifier = "field1/"
# clarifier = "field2/rvoter/"
clarifier = "stacked/"

codebook = pd.read_csv('raw/levels_codebook.csv', encoding='utf-8')
data = pd.read_csv(f'raw/{clarifier}data.csv', encoding='utf-8')
q_codebook = pd.read_csv('raw/question_codebook.csv', encoding='utf-8', index_col = 'qid')

# q_codebook into dict
q_codebook = q_codebook.to_dict().get('qidFull')

## BPC20-23: Confidence in vote counting

In [12]:
# MNLogit: Transgender X Geographical level (no weights)

q = 'BPCxdem2'
print(q)
for i in range(1,3):
    print(f'{i}: {mc_processor.get_name_from_codebook(codebook, q, i)}')

print(' ')

q = 'BPC20a'
print(q)
for i in range(1,5):
    print(f'{i}: {mc_processor.get_name_from_codebook(codebook, q, 5-i)}')

## Confidence analysis (using MNlogit)

import pandas as pd

confidence_keys = {'BPC20':"Your vote", 'BPC21':"Votes in your city or county", 'BPC22':"Votes in your state", 'BPC23':"Votes across the country"}


# Melting the data to get long format
data_long = pd.melt(data, 
                    id_vars=['BPCxdem2', 'wts'], 
                    value_vars=['BPC20', 'BPC21', 'BPC22', 'BPC23'], 
                    var_name='Geographical_Level', 
                    value_name='Confidence')

# Map BPCxdem2 to meaningful labels
data_long['BPCxdem2'] = data_long['BPCxdem2'].map({1: 'Transgender', 2: 'Cisgender'}).astype('category')

# Map the geographical levels to their labels
data_long['Geographical_Level'] = data_long['Geographical_Level'].map(confidence_keys).astype('category')


# Drop or reclassify "Don't know" responses (e.g., excluding them)
data_long = data_long[data_long['Confidence'] != 5]  # Assuming 5 represents "Don't know"

# Reclassify the confidence levels so that higher values indicate higher confidence
data_long['Confidence'] = 5 - data_long['Confidence']


# Ensure that the Confidence column is treated as an ordinal categorical variable
# data_long['Confidence'] = data_long['Confidence'].astype('category')
data_long['Confidence'] = pd.Categorical(data_long['Confidence'], ordered=True)


# Create design matrices for the ordinal logistic regression
y, X = dmatrices('Confidence ~ BPCxdem2 * Geographical_Level', data_long, return_type='dataframe')

# Build and fit the weighted model using ordinal logistic regression
model = sm.MNLogit(y, X, weights=data_long['wts'])

# Fit the model
result = model.fit()

# Print the summary of the model
print(result.summary())



BPCxdem2
1: Identify Trans/Genderqueer/Non-Conforming - Yes
2: Identify Trans/Genderqueer/Non-Conforming - No
 
BPC20a
1: Not confident at all
2: Not to confident
3: Somewhat confident
4: Very confident
Optimization terminated successfully.
         Current function value: 1.157796
         Iterations 6




                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                13334
Model:                        MNLogit   Df Residuals:                    13310
Method:                           MLE   Df Model:                           21
Date:                Wed, 30 Oct 2024   Pseudo R-squ.:                0.003400
Time:                        13:20:03   Log-Likelihood:                -15438.
converged:                       True   LL-Null:                       -15491.
Covariance Type:            nonrobust   LLR p-value:                 3.249e-13
                                                           y=Confidence[2]       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------------------------------------
Intercept                                                                      0.5

In [13]:
# GLM: Transgender X Geographical level (with weights)

# Ensure the Confidence variable is numeric
data_long['Confidence'] = pd.to_numeric(data_long['Confidence'], errors='coerce')

# Ensure BPCxdem2 and Geographical_Level are treated as categorical
data_long['BPCxdem2'] = data_long['BPCxdem2'].astype('category')
data_long['Geographical_Level'] = data_long['Geographical_Level'].astype('category')

# Define the formula for the interaction between BPCxdem2 and Geographical_Level
formula = 'Confidence ~ BPCxdem2 * Geographical_Level'

# Fit the GLM model using the formula, Gaussian family, and weights
model = smf.glm(formula=formula, data=data_long, family=sm.families.Gaussian(), freq_weights=data_long['wts'])

# Fit the model
result = model.fit()

# Print the model summary
print(result.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:             Confidence   No. Observations:                13334
Model:                            GLM   Df Residuals:                 13586.57
Model Family:                Gaussian   Df Model:                            7
Link Function:               Identity   Scale:                         0.81488
Method:                          IRLS   Log-Likelihood:                -17894.
Date:                Wed, 30 Oct 2024   Deviance:                       11071.
Time:                        13:20:04   Pearson chi2:                 1.11e+04
No. Iterations:                     3   Pseudo R-squ. (CS):            0.01097
Covariance Type:            nonrobust                                         
                                                                                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------

## BPC24_8: Concern about violence

In [52]:
# MNLogit: Transgender X Geographical level (no weights)

q = 'BPCxdem2'
print(q)
for i in range(1,3):
    print(f'{i}: {mc_processor.get_name_from_codebook(codebook, q, i)}')

print(' ')

q = 'BPC24_8'
print(q)
# concern_levels = []
for i in range(1,5):
    concern_level = mc_processor.get_name_from_codebook(codebook, q, 5-i)
    print(f'{i}: {concern_level}')
    # concern_levels.append(concern_level)


BPCxdem2
1: Identify Trans/Genderqueer/Non-Conforming - Yes
2: Identify Trans/Genderqueer/Non-Conforming - No
 
BPC24_8
1: Very concerned
2: Somewhat concerned
3: Not too concerned
4: Not concerned at all


In [59]:
# simple logit regression of gender on concern about violence (BPC24_8)

data_cleaned = data[['BPC24_8', 'BPCxdem2', 'wts']]
# Step 1: Drop "Don't know" responses (BPC24_8 == 5)
data_cleaned = data_cleaned[data_cleaned['BPC24_8'] != 5]

# Step 2: Reverse the polarity of BPC24_8
data_cleaned['BPC24_8_reversed'] = data_cleaned['BPC24_8'].replace({1: 4, 2: 3, 3: 2, 4: 1})

# Step 3: Combine levels
# "Very concerned" (4) and "Somewhat concerned" (3) become 1 (Concerned)
# "Not too concerned" (2) and "Not concerned at all" (1) become 0 (Not concerned)
data_cleaned['concern_level'] = data_cleaned['BPC24_8_reversed'].apply(lambda x: 1 if x >= 3 else 0)

data_cleaned[['BPC24_8', 'BPCxdem2', 'wts', 'concern_level']].head()

# Step 4: Prepare the logistic regression model
# Response variable: concern_level (0 or 1)
# Predictor: BPCxdem2 (1 = Transgender, 2 = Cisgender)

# Add a constant to the model
data_cleaned['const'] = 1

# Logistic regression model
model = sm.Logit(data_cleaned['concern_level'], data_cleaned[['const', 'BPCxdem2']])

# Fit the model
result = model.fit()

# Display the results
result.summary()


Optimization terminated successfully.
         Current function value: 0.679615
         Iterations 5


0,1,2,3
Dep. Variable:,concern_level,No. Observations:,3343.0
Model:,Logit,Df Residuals:,3341.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 30 Oct 2024",Pseudo R-squ.:,0.0008447
Time:,13:50:41,Log-Likelihood:,-2272.0
converged:,True,LL-Null:,-2273.9
Covariance Type:,nonrobust,LLR p-value:,0.05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.2161,0.467,2.602,0.009,0.300,2.132
BPCxdem2,-0.4515,0.236,-1.916,0.055,-0.913,0.010


In [60]:
# Simplified model: Logistic regression without gender variable
data_cleaned['const'] = 1
simplified_model = sm.Logit(data_cleaned['concern_level'], data_cleaned[['const']])
simplified_result = simplified_model.fit()

# Display the summary of the simplified model
simplified_result.summary()


Optimization terminated successfully.
         Current function value: 0.680189
         Iterations 4


0,1,2,3
Dep. Variable:,concern_level,No. Observations:,3343.0
Model:,Logit,Df Residuals:,3342.0
Method:,MLE,Df Model:,0.0
Date:,"Wed, 30 Oct 2024",Pseudo R-squ.:,2.875e-11
Time:,13:55:03,Log-Likelihood:,-2273.9
converged:,True,LL-Null:,-2273.9
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.3241,0.035,9.247,0.000,0.255,0.393


In [55]:
# Weighted logistic regression using gender variable
model_weighted = sm.WLS(data_cleaned['concern_level'], data_cleaned[['const', 'BPCxdem2']], weights=data_cleaned['wts'])
result_weighted = model_weighted.fit()

# Display weighted results
result_weighted.summary()


0,1,2,3
Dep. Variable:,concern_level,R-squared:,0.002
Model:,WLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,5.824
Date:,"Wed, 30 Oct 2024",Prob (F-statistic):,0.0159
Time:,13:50:12,Log-Likelihood:,-4628.2
No. Observations:,3343,AIC:,9260.0
Df Residuals:,3341,BIC:,9273.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.8544,0.123,6.953,0.000,0.613,1.095
BPCxdem2,-0.1494,0.062,-2.413,0.016,-0.271,-0.028

0,1,2,3
Omnibus:,93.348,Durbin-Watson:,1.923
Prob(Omnibus):,0.0,Jarque-Bera (JB):,97.862
Skew:,-0.404,Prob(JB):,5.62e-22
Kurtosis:,2.774,Cond. No.,35.6
