In [None]:
import sys
sys.path.append('..')
import mc_processor
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
import statsmodels.formula.api as smf

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Load data

In [21]:
# read results files

# clarifier = "field1/"
# clarifier = "field2/rvoter/"
clarifier = "stacked/"

codebook = pd.read_csv('raw/levels_codebook.csv', encoding='utf-8')
data = pd.read_csv(f'raw/{clarifier}data.csv', encoding='utf-8')
q_codebook = pd.read_csv('raw/question_codebook.csv', encoding='utf-8', index_col = 'qid')

# q_codebook into dict
q_codebook = q_codebook.to_dict().get('qidFull')

In [49]:
# for controls:
# "xdemWhite" -- binary: 1 = white
# "xeduc3" -- 1 = Educ: < College; 2 = Educ: Bachelors degree; 3 = Educ: Post-grad
# "xdemInc3" -- 1 = Income: Under 50k; 2 = Income: 50k-100k; 3 = Income: 100k+


# Melting the data to get long format with control variables included
data_long = pd.melt(data, 
                    id_vars=['BPCxdem2', 'wts', 'xdemWhite','xpid3', 'xeduc3', 'xdemInc3'], 
                    value_vars=['BPC20', 'BPC21', 'BPC22', 'BPC23'], 
                    var_name='Geographical_Level', 
                    value_name='Confidence')

# Map BPCxdem2 to meaningful labels
data_long['BPCxdem2'] = data_long['BPCxdem2'].map({1: 'Transgender', 2: 'Cisgender'}).astype('category')

# Map the geographical levels to their labels
data_long['Geographical_Level'] = data_long['Geographical_Level'].map(confidence_keys).astype('category')

# Drop or reclassify "Don't know" responses (e.g., excluding them)
data_long = data_long[data_long['Confidence'] != 5]  # Assuming 5 represents "Don't know"

# Reclassify the confidence levels so that higher values indicate higher confidence
data_long['Confidence'] = 5 - data_long['Confidence']

# Replace values in xdemWhite, xeduc3, and xdemInc3 columns with descriptive names
data_long['xdemWhite'] = data_long['xdemWhite'].map({1: 'White', 0: 'Non-White'})

data_long['xeduc3'] = data_long['xeduc3'].map({
    1: 'less_college',
    2: 'bach',
    3: 'postgrad'
})

data_long['xdemInc3'] = data_long['xdemInc3'].map({
    1: 'income_less50',
    2: 'income_50to100',
    3: 'income_over100k'
})

data_long['xpid3'] = data_long['xpid3'].map({
    1: 'dem',
    2: 'ind',
    3: 'rep'
})

# Creating dummy variables for xeduc3 and xdemInc3
data_long = pd.get_dummies(data_long, columns=['xeduc3', 'xdemInc3','xpid3'], drop_first=False)


# drop baselines 
data_long = data_long.drop(['xeduc3_less_college','xpid3_rep','xdemInc3_income_less50'],axis=1)

## BPC20-23: Confidence in vote counting

In [8]:
# MNLogit: Transgender X Geographical level (no weights)

q = 'BPCxdem2'
print(q)
for i in range(1,3):
    print(f'{i}: {mc_processor.get_name_from_codebook(codebook, q, i)}')

print(' ')

q = 'BPC20a'
print(q)
for i in range(1,5):
    print(f'{i}: {mc_processor.get_name_from_codebook(codebook, q, 5-i)}')

## Confidence analysis (using MNlogit)

import pandas as pd

confidence_keys = {'BPC20':"Your vote", 'BPC21':"Votes in your city or county", 'BPC22':"Votes in your state", 'BPC23':"Votes across the country"}


# Melting the data to get long format
data_long = pd.melt(data, 
                    id_vars=['BPCxdem2', 'wts'], 
                    value_vars=['BPC20', 'BPC21', 'BPC22', 'BPC23'], 
                    var_name='Geographical_Level', 
                    value_name='Confidence')

# Map BPCxdem2 to meaningful labels
data_long['BPCxdem2'] = data_long['BPCxdem2'].map({1: 'Transgender', 2: 'Cisgender'}).astype('category')

# Map the geographical levels to their labels
data_long['Geographical_Level'] = data_long['Geographical_Level'].map(confidence_keys).astype('category')


# Drop or reclassify "Don't know" responses (e.g., excluding them)
data_long = data_long[data_long['Confidence'] != 5]  # Assuming 5 represents "Don't know"

# Reclassify the confidence levels so that higher values indicate higher confidence
data_long['Confidence'] = 5 - data_long['Confidence']


# Ensure that the Confidence column is treated as an ordinal categorical variable
# data_long['Confidence'] = data_long['Confidence'].astype('category')
data_long['Confidence'] = pd.Categorical(data_long['Confidence'], ordered=True)


# Create design matrices for the ordinal logistic regression
y, X = dmatrices('Confidence ~ BPCxdem2 * Geographical_Level', data_long, return_type='dataframe')

# Build and fit the weighted model using ordinal logistic regression
model = sm.MNLogit(y, X, weights=data_long['wts'])

# Fit the model
result = model.fit()

# Print the summary of the model
print(result.summary())

BPCxdem2
1: Identify Trans/Genderqueer/Non-Conforming - Yes
2: Identify Trans/Genderqueer/Non-Conforming - No
 
BPC20a
1: Not confident at all
2: Not to confident
3: Somewhat confident
4: Very confident
Optimization terminated successfully.
         Current function value: 1.157796
         Iterations 6




                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                13334
Model:                        MNLogit   Df Residuals:                    13310
Method:                           MLE   Df Model:                           21
Date:                Thu, 31 Oct 2024   Pseudo R-squ.:                0.003400
Time:                        13:28:18   Log-Likelihood:                -15438.
converged:                       True   LL-Null:                       -15491.
Covariance Type:            nonrobust   LLR p-value:                 3.249e-13
                                                           y=Confidence[2]       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------------------------------------
Intercept                                                                      0.5

In [9]:
# GLM: Transgender X Geographical level (with weights)

# Ensure the Confidence variable is numeric
data_long['Confidence'] = pd.to_numeric(data_long['Confidence'], errors='coerce')

# Ensure BPCxdem2 and Geographical_Level are treated as categorical
data_long['BPCxdem2'] = data_long['BPCxdem2'].astype('category')
data_long['Geographical_Level'] = data_long['Geographical_Level'].astype('category')

# Define the formula for the interaction between BPCxdem2 and Geographical_Level
formula = 'Confidence ~ BPCxdem2 * Geographical_Level'

# Fit the GLM model using the formula, Gaussian family, and weights
model = smf.glm(formula=formula, data=data_long, family=sm.families.Gaussian(), freq_weights=data_long['wts'])

# Fit the model
result = model.fit()

# Print the model summary
print(result.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:             Confidence   No. Observations:                13334
Model:                            GLM   Df Residuals:                 13586.57
Model Family:                Gaussian   Df Model:                            7
Link Function:               Identity   Scale:                         0.81488
Method:                          IRLS   Log-Likelihood:                -17894.
Date:                Thu, 31 Oct 2024   Deviance:                       11071.
Time:                        13:28:18   Pearson chi2:                 1.11e+04
No. Iterations:                     3   Pseudo R-squ. (CS):            0.01097
Covariance Type:            nonrobust                                         
                                                                                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------

## BPC20-23 with controls

Linear regression

In [51]:
# Ensure the Confidence variable is numeric
data_long['Confidence'] = pd.to_numeric(data_long['Confidence'], errors='coerce')

# Treat 'BPCxdem2' and 'Geographical_Level' as categorical variables
data_long['BPCxdem2'] = data_long['BPCxdem2'].astype('category')
data_long['Geographical_Level'] = data_long['Geographical_Level'].astype('category')

# Update the formula to include the dummy variables for education and income levels
formula = 'Confidence ~ BPCxdem2 * Geographical_Level + xdemWhite + xpid3_dem + xpid3_ind + xeduc3_bach + xeduc3_postgrad + xdemInc3_income_50to100 + xdemInc3_income_over100k'

# Fit the GLM model using the updated formula
model = smf.glm(formula=formula, data=data_long, family=sm.families.Gaussian(), freq_weights=data_long['wts'])

# Fit and display results
result = model.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             Confidence   No. Observations:                 9563
Model:                            GLM   Df Residuals:                  8100.67
Model Family:                Gaussian   Df Model:                           13
Link Function:               Identity   Scale:                         0.77124
Method:                          IRLS   Log-Likelihood:                -10453.
Date:                Thu, 31 Oct 2024   Deviance:                       6247.6
Time:                        14:00:39   Pearson chi2:                 6.25e+03
No. Iterations:                     3   Pseudo R-squ. (CS):             0.1199
Covariance Type:            nonrobust                                         
                                                                                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------

### ChatGPT Interpretation of GLM results with controls
### Interpretation of GLM Results

**Model Fit**:
- The **Pseudo R-squared** value is 0.1199, indicating that the model explains approximately 11.99% of the variance in `Confidence`. This suggests a moderate fit, showing that the included predictors account for a meaningful portion of the variation in `Confidence`.

**Main Effects**:
1. **BPCxdem2 (T.Transgender)**:
   - Coefficient: 0.3792 with a *p-value of 0.028*, meaning it is statistically significant at the 5% level.
   - Interpretation: Being transgender is associated with a higher `Confidence` compared to the baseline (cisgender), after controlling for other factors.

2. **Geographical_Level**:
   - **Votes in your city or county**: Coefficient is 0.2990 with a *p-value of 0.000*.
   - **Votes in your state**: Coefficient is 0.2113 with a *p-value of 0.000*.
   - **Your vote**: Coefficient is 0.2241 with a *p-value of 0.000*.
   - Interpretation: Confidence increases for more localized geographical levels (e.g., city/county, state, personal vote) compared to the baseline. Each level is statistically significant, suggesting that respondents feel more confident about vote accuracy at more localized levels.

3. **Political Identity (PID)**:
   - **PID: Dem (no lean)**: Coefficient is 0.6800 with a *p-value of 0.000*.
   - **PID: Ind (no lean)**: Coefficient is 0.2879 with a *p-value of 0.000*.
   - Interpretation: Respondents identifying as Democrat or Independent (no lean) report significantly higher `Confidence` compared to the baseline (presumably Republican). Democrats, in particular, report the highest increase in `Confidence`.

4. **Education**:
   - **Bachelors Degree (xeduc3_bach)**: Coefficient is 0.1276 with a *p-value of 0.000*.
   - **Postgraduate (xeduc3_postgrad)**: Coefficient is 0.1736 with a *p-value of 0.000*.
   - Interpretation: Higher education levels (bachelor’s degree and postgraduate) are associated with higher `Confidence`. This effect is statistically significant for both categories.

5. **Income**:
   - **Income 50-100k**: Coefficient is -0.0349 with a *p-value of 0.149* (not statistically significant).
   - **Income over 100k**: Coefficient is 0.0849 with a *p-value of 0.001*, indicating statistical significance.
   - Interpretation: Respondents with an income over 100k report significantly higher `Confidence`, while the income bracket 50-100k does not show a significant effect.

**Interaction Effects**:
- **BPCxdem2[T.Transgender]:Geographical_Level**:
   - The interaction terms between `BPCxdem2[T.Transgender]` and different geographical levels are not statistically significant (p-values greater than 0.05), indicating that the impact of being transgender on `Confidence` does not vary significantly by geographical level.

MNLogit

In [55]:
import pandas as pd
from patsy import dmatrices
import statsmodels.api as sm

# Ensure that the Confidence column is treated as a categorical variable
data_long['Confidence'] = pd.Categorical(data_long['Confidence'], ordered=True)

# Create design matrices for MNLogit including control variables
formula = 'Confidence ~ BPCxdem2 * Geographical_Level +  xdemWhite + xpid3_dem + xpid3_ind + xeduc3_bach + xeduc3_postgrad + xdemInc3_income_50to100 + xdemInc3_income_over100k'
y, X = dmatrices(formula, data_long, return_type='dataframe')

# Build and fit the MNLogit model without weights
model = sm.MNLogit(y, X)

# Fit the model
result = model.fit()

# Print the summary of the model
print(result.summary())


Optimization terminated successfully.
         Current function value: 1.113492
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                 9563
Model:                        MNLogit   Df Residuals:                     9521
Method:                           MLE   Df Model:                           39
Date:                Thu, 31 Oct 2024   Pseudo R-squ.:                 0.05770
Time:                        14:05:55   Log-Likelihood:                -10648.
converged:                       True   LL-Null:                       -11300.
Covariance Type:            nonrobust   LLR p-value:                2.902e-248
                                                           y=Confidence[2]       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------------------

### Interpretation of MNLogit Results

**Model Fit**:
- The **Pseudo R-squared** value is 0.0577, meaning the model explains approximately 5.77% of the variance in `Confidence`. While this is a modest fit, it suggests that some variation in confidence is explained by the included predictors.
- The model converged successfully, indicating stable parameter estimates.

### Coefficients Interpretation for Each `Confidence` Category

This output shows the log-odds of being in each confidence category (`Confidence[2]`, `Confidence[3]`, `Confidence[4]`) relative to the baseline confidence level (e.g., `Confidence[1]`).

#### `y=Confidence[2]`
1. **Intercept**: 
   - Coefficient: 0.2573, **p-value: 0.014** (significant).
   - Interpretation: This represents the log-odds of being in `Confidence[2]` when all other predictors are at their reference levels.

2. **BPCxdem2 (T.Transgender)**:
   - Coefficient: -0.4845, **p-value: 0.555** (not significant).
   - Interpretation: No significant difference in confidence between transgender and cisgender respondents in this category.

3. **Geographical_Level**:
   - None of the levels (city/county, state, or individual vote) are statistically significant, suggesting that geographical level does not impact the likelihood of being in `Confidence[2]`.

4. **Political Identity (PID)**:
   - **PID: Dem (no lean)**: Coefficient is 0.7707, **p-value: 0.000** (significant).
   - Interpretation: Democrats have higher odds of being in `Confidence[2]` compared to Republicans.

5. **Education**:
   - **Postgraduate**: Coefficient is 0.2793, **p-value: 0.095** (marginally significant).
   - Interpretation: Respondents with postgraduate education have slightly higher odds of being in `Confidence[2]`.

#### `y=Confidence[3]`
1. **Geographical_Level**:
   - **Votes in your city or county**: Coefficient is 0.5533, **p-value: 0.000** (significant).
   - **Votes in your state**: Coefficient is 0.3957, **p-value: 0.000** (significant).
   - **Your vote**: Coefficient is 0.3711, **p-value: 0.001** (significant).
   - Interpretation: Higher confidence is associated with closer geographical levels in this category.

2. **Political Identity (PID)**:
   - **PID: Dem (no lean)**: Coefficient is 1.4922, **p-value: 0.000** (significant).
   - Interpretation: Democrats have much higher odds of being in `Confidence[3]`.

3. **Education**:
   - **Bachelors Degree**: Coefficient is 0.3334, **p-value: 0.001** (significant).
   - **Postgraduate**: Coefficient is 0.3618, **p-value: 0.016** (significant).
   - Interpretation: Higher education levels are associated with higher confidence in this category.

#### `y=Confidence[4]`
1. **Geographical_Level**:
   - **Votes in your city or county**: Coefficient is 0.7828, **p-value: 0.000** (significant).
   - **Votes in your state**: Coefficient is 0.5862, **p-value: 0.000** (significant).
   - **Your vote**: Coefficient is 0.6014, **p-value: 0.000** (significant).
   - Interpretation: Respondents have higher odds of being in `Confidence[4]` with more localized voting levels.

2. **Political Identity (PID)**:
   - **PID: Dem (no lean)**: Coefficient is 2.5807, **p-value: 0.000** (significant).
   - **PID: Ind (no lean)**: Coefficient is 0.6695, **p-value: 0.000** (significant).
   - Interpretation: Both Democrats and Independents have higher odds of being in `Confidence[4]`, with the effect being strongest for Democrats.

3. **Education**:
   - **Bachelors Degree**: Coefficient is 0.4843, **p-value: 0.000** (significant).
   - **Postgraduate**: Coefficient is 0.6482, **p-value: 0.000** (significant).
   - Interpretation: Higher education is strongly associated with higher odds of being in `Confidence[4]`.

### Interaction Terms
- None of the interaction terms between `BPCxdem2[T.Transgender]` and `Geographical_Level` are statistically significant across any confidence level. This suggests that being transgender does not significantly alter the effect of geographical level on confidence.

### Summary
- **Political Identity**: Democrats have consistently higher odds of being in higher confidence categories compared to Republicans. Independents also show increased odds in the highest confidence category.
- **Geographical Level**: Confidence tends to be higher for votes at more localized levels (e.g., city/county, state).
- **Education**: Higher education levels (bachelor’s and postgraduate) are associated with greater confidence across multiple categories.
- **Transgender Identity**: No significant effect of being transgender on confidence was found, nor did it interact significantly with geographical level.

This model suggests that political identity, geographical level, and education are important predictors of confidence levels.


## BPC24_8: Concern about violence

In [52]:
# MNLogit: Transgender X Geographical level (no weights)

q = 'BPCxdem2'
print(q)
for i in range(1,3):
    print(f'{i}: {mc_processor.get_name_from_codebook(codebook, q, i)}')

print(' ')

q = 'BPC24_8'
print(q)
# concern_levels = []
for i in range(1,5):
    concern_level = mc_processor.get_name_from_codebook(codebook, q, 5-i)
    print(f'{i}: {concern_level}')
    # concern_levels.append(concern_level)


BPCxdem2
1: Identify Trans/Genderqueer/Non-Conforming - Yes
2: Identify Trans/Genderqueer/Non-Conforming - No
 
BPC24_8
1: Very concerned
2: Somewhat concerned
3: Not too concerned
4: Not concerned at all


In [59]:
# simple logit regression of gender on concern about violence (BPC24_8)

data_cleaned = data[['BPC24_8', 'BPCxdem2', 'wts']]
# Step 1: Drop "Don't know" responses (BPC24_8 == 5)
data_cleaned = data_cleaned[data_cleaned['BPC24_8'] != 5]

# Step 2: Reverse the polarity of BPC24_8
data_cleaned['BPC24_8_reversed'] = data_cleaned['BPC24_8'].replace({1: 4, 2: 3, 3: 2, 4: 1})

# Step 3: Combine levels
# "Very concerned" (4) and "Somewhat concerned" (3) become 1 (Concerned)
# "Not too concerned" (2) and "Not concerned at all" (1) become 0 (Not concerned)
data_cleaned['concern_level'] = data_cleaned['BPC24_8_reversed'].apply(lambda x: 1 if x >= 3 else 0)

data_cleaned[['BPC24_8', 'BPCxdem2', 'wts', 'concern_level']].head()

# Step 4: Prepare the logistic regression model
# Response variable: concern_level (0 or 1)
# Predictor: BPCxdem2 (1 = Transgender, 2 = Cisgender)

# Add a constant to the model
data_cleaned['const'] = 1

# Logistic regression model
model = sm.Logit(data_cleaned['concern_level'], data_cleaned[['const', 'BPCxdem2']])

# Fit the model
result = model.fit()

# Display the results
result.summary()


Optimization terminated successfully.
         Current function value: 0.679615
         Iterations 5


0,1,2,3
Dep. Variable:,concern_level,No. Observations:,3343.0
Model:,Logit,Df Residuals:,3341.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 30 Oct 2024",Pseudo R-squ.:,0.0008447
Time:,13:50:41,Log-Likelihood:,-2272.0
converged:,True,LL-Null:,-2273.9
Covariance Type:,nonrobust,LLR p-value:,0.05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.2161,0.467,2.602,0.009,0.300,2.132
BPCxdem2,-0.4515,0.236,-1.916,0.055,-0.913,0.010


In [60]:
# Simplified model: Logistic regression without gender variable
data_cleaned['const'] = 1
simplified_model = sm.Logit(data_cleaned['concern_level'], data_cleaned[['const']])
simplified_result = simplified_model.fit()

# Display the summary of the simplified model
simplified_result.summary()


Optimization terminated successfully.
         Current function value: 0.680189
         Iterations 4


0,1,2,3
Dep. Variable:,concern_level,No. Observations:,3343.0
Model:,Logit,Df Residuals:,3342.0
Method:,MLE,Df Model:,0.0
Date:,"Wed, 30 Oct 2024",Pseudo R-squ.:,2.875e-11
Time:,13:55:03,Log-Likelihood:,-2273.9
converged:,True,LL-Null:,-2273.9
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.3241,0.035,9.247,0.000,0.255,0.393


In [55]:
# Weighted logistic regression using gender variable
model_weighted = sm.WLS(data_cleaned['concern_level'], data_cleaned[['const', 'BPCxdem2']], weights=data_cleaned['wts'])
result_weighted = model_weighted.fit()

# Display weighted results
result_weighted.summary()


0,1,2,3
Dep. Variable:,concern_level,R-squared:,0.002
Model:,WLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,5.824
Date:,"Wed, 30 Oct 2024",Prob (F-statistic):,0.0159
Time:,13:50:12,Log-Likelihood:,-4628.2
No. Observations:,3343,AIC:,9260.0
Df Residuals:,3341,BIC:,9273.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.8544,0.123,6.953,0.000,0.613,1.095
BPCxdem2,-0.1494,0.062,-2.413,0.016,-0.271,-0.028

0,1,2,3
Omnibus:,93.348,Durbin-Watson:,1.923
Prob(Omnibus):,0.0,Jarque-Bera (JB):,97.862
Skew:,-0.404,Prob(JB):,5.62e-22
Kurtosis:,2.774,Cond. No.,35.6
