<a href="https://colab.research.google.com/github/550tealeaves/DATA-70500-working-with-data/blob/main/Lab10_CategoricalDataAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# We need to install these libraries to use some of the tools below.
!pip install pyreadstat
!pip install researchpy


Collecting pyreadstat
  Downloading pyreadstat-1.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.0 kB)
Downloading pyreadstat-1.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadstat
Successfully installed pyreadstat-1.2.8
Collecting researchpy
  Downloading researchpy-0.3.6-py3-none-any.whl.metadata (1.2 kB)
Downloading researchpy-0.3.6-py3-none-any.whl (34 kB)
Installing collected packages: researchpy
Successfully installed researchpy-0.3.6


In [None]:
# Code block 1: importing libraries
import pandas as pd
import numpy as np
import researchpy as rp
from pandas.api.types import CategoricalDtype
import seaborn as sb
import matplotlib.pyplot as plt
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio

In [None]:
# Load the census extract (Stata file) and list every column with its
# non-null count and dtype.
census_df = pd.read_stata('usa_00020.dta')
# `verbose` is a boolean flag; pass it by keyword instead of relying on the
# truthiness of a positional string.
census_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10419 entries, 0 to 10418
Data columns (total 40 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   year        10419 non-null  category
 1   sample      10419 non-null  category
 2   serial      10419 non-null  int32   
 3   cbserial    10419 non-null  float64 
 4   hhwt        10419 non-null  int16   
 5   cluster     10419 non-null  float64 
 6   region      10419 non-null  category
 7   statefip    10419 non-null  category
 8   metro       10419 non-null  category
 9   strata      10419 non-null  int32   
 10  gq          10419 non-null  category
 11  ownershp    10419 non-null  category
 12  ownershpd   10419 non-null  category
 13  rooms       10419 non-null  category
 14  cinethh     10419 non-null  category
 15  cihispeed   10419 non-null  category
 16  coupletype  10419 non-null  category
 17  multgen     10419 non-null  category
 18  multgend    10419 non-null  category
 19  pern

In [None]:
# Var multgen - how many generations live in the household
# The 'n/a' category is effectively missing data and is recoded in the next cell
census_df.loc[:, 'multgen'].value_counts()

Unnamed: 0_level_0,count
multgen,Unnamed: 1_level_1
2 generations,5127
1 generation,3766
3+ generations,833
,693


In [None]:
# Recode the 'n/a' category as missing (NaN); every other label is carried
# over unchanged into the new column.
census_df['multgenr'] = (
    census_df['multgen']
    .astype(object)
    .mask(census_df['multgen'].eq('n/a'), np.nan)
)
census_df['multgenr'].value_counts()

Unnamed: 0_level_0,count
multgenr,Unnamed: 1_level_1
2 generations,5127
1 generation,3766
3+ generations,833


In [None]:
# 1 = household is outside any metropolitan area, 0 = every other category.
# NOTE(review): this same recode is repeated in a later cell, after `metro`
# has been tabulated; one of the two copies can be dropped.
census_df['nonmetro'] = census_df['metro'].eq('not in metropolitan area').astype(int)
census_df['nonmetro'].value_counts()

In [None]:
# To calculate an odds ratio the answers must be collapsed into a binary:
# muladultgen = 1 when the detailed multigenerational category describes a
# household with clearly more than one adult generation, otherwise 0.
#
# The detailed column in the data is `multgend`. `multgendr` is not created
# anywhere in this notebook, so indexing it raised a KeyError.
multi_adult_labels = [
    '2 adjacent generations, adult-adult',
    '3+ generations (census 2008 definition)',
    '3+ generations (additional ipums definition)',
]
census_df['muladultgen'] = np.where(census_df['multgend'].isin(multi_adult_labels), 1, 0)

census_df['muladultgen'].value_counts()


Unnamed: 0_level_0,count
nointernet,Unnamed: 1_level_1
0,10041
1,378


In [None]:
# Second variable of interest: where the household is located relative to a
# metropolitan area. Tabulate how many households fall in each category.
census_df.loc[:, 'metro'].value_counts()

Unnamed: 0_level_0,count
metro,Unnamed: 1_level_1
in metropolitan area: not in central/principal city,3791
in metropolitan area: in central/principal city,3698
metropolitan status indeterminable (mixed),1320
in metropolitan area: central/principal city status indeterminable (mixed),932
not in metropolitan area,678


In [None]:
# Treat "not in metropolitan area" as rural.
# 1 = rural, 0 = everything else (this includes the "indeterminable" categories).
census_df['nonmetro'] = census_df['metro'].eq('not in metropolitan area').astype(int)
census_df['nonmetro'].value_counts()

Unnamed: 0_level_0,count
nonmetro,Unnamed: 1_level_1
0,9741
1,678


In [None]:
# Association between living outside a metro area and having no home internet.
# NOTE(review): no cell visible in this notebook creates census_df['nointernet'];
# the cell that built it appears to have been overwritten. Restore it before
# running the notebook from a fresh kernel.
table0 = pd.crosstab(census_df['nointernet'], census_df['nonmetro'], dropna=False)
odds0 = odds_ratio(table0)      # odds ratio for the 2x2 table
fisher0 = fisher_exact(table0)  # significance test

# The statistic is an odds *ratio* (non-metro vs. metro), not an odds, and the
# Fisher test reports the p-value itself rather than an upper bound.
print(f'The odds ratio of having no internet access for non-metro versus metro households is {odds0.statistic:.2f}.')
print(f'The p-value of the Fisher exact test is {fisher0.pvalue:.2e}.')

# Cell proportions of the same table (each count divided by the grand total),
# reusing table0 instead of building the crosstab a second time.
table0 / table0.to_numpy().sum()

The odds that a non-metro household will not have internet access are 2.69.
The significance of the Fisher exact test is less than 1.72e-09.


nonmetro,0,1
nointernet,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.904117,0.059603
1,0.030809,0.005471


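To calculate the odds ratio from the table, divide the odds of having no internet among non-metro households by the same odds among metro households: (0.005471 / 0.059603) / (0.030809 / 0.904117) ≈ 2.69 (1st set = numerator, 2nd = denominator).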

In [None]:
# Tabulate the home-ownership categories before recoding them.
census_df.loc[:, 'ownershp'].value_counts()

Unnamed: 0_level_0,count
ownershp,Unnamed: 1_level_1
owned or being bought (loan),6823
rented,2903
,693


In [None]:
# Marking n/a as missing
# Create binary w/ 1s & 0s to use for odds and fischer test
census_df['ownershpr'] = np.where(census_df['ownershp'] == 'n/a', np.nan, census_df['ownershp'])
census_df['renter'] = np.where(census_df['ownershpr'] == 'rented', 1, 0)


In [None]:
# Predict no internet but with home ownership
table1 = pd.crosstab(census_df['nointernet'], census_df['renter'], dropna=False)
odds1 = odds_ratio(table1) # calculates odds ratio
fisher1 = fisher_exact(table1)

print(f'The odds that a renter household will not have internet access are {odds1.statistic:.2f}.')
print(f'The significance of the Fisher exact test is less than {fisher1.pvalue:.2e}.')
pd.crosstab(census_df['nointernet'], census_df['renter'], dropna=False, normalize=True)

The odds that a renter household will not have internet access are 2.39.
The significance of the Fisher exact test is less than 6.57e-16.


renter,0,1
nointernet,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.702179,0.261541
1,0.019196,0.017084


## **Being rural and a renter contributes to not having home Internet access**

In [None]:
# Includes the intercept (0) b/c it's meaningful (being a homeowner and living in a metro area)
import statsmodels.api as sm

# Outcome and predictors match the recorded summary, the section heading and
# the interpretation cells that follow: no home internet explained by living
# outside a metro area and renting. (The cell had been changed to
# renter ~ nonmetro + muladultgen, which contradicted all of them.)
Y = census_df['nointernet']
X = census_df[['nonmetro', 'renter']]
X = sm.add_constant(X)
# missing='drop' removes rows with a missing value in Y or X before fitting.
model1 = sm.Logit(Y, X, missing='drop').fit()
print(model1.summary())

# We add a constant in this logit model because with categorical data
# the intercept is often meaningful: the odds of being in Y=1
# (in this case, not having internet access at home) when the value of all Xs
# is set to zero (in this case, not being nonmetro and not being a renter).


Optimization terminated successfully.
         Current function value: 0.150429
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:             nointernet   No. Observations:                10419
Model:                          Logit   Df Residuals:                    10416
Method:                           MLE   Df Model:                            2
Date:                Wed, 27 Nov 2024   Pseudo R-squ.:                 0.03531
Time:                        00:08:42   Log-Likelihood:                -1567.3
converged:                       True   LL-Null:                       -1624.7
Covariance Type:            nonrobust   LLR p-value:                 1.215e-25
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.7620      0.079    -47.907      0.000      -3.916      -3.608
nonmetro       1.2054      0.

- Negative coefficient for the constant = households that live in a metro area and do not rent (all predictors equal to 0) are unlikely to lack internet access.



In [None]:
# Exponentiate the logit coefficients so they read as odds ratios.
print(model1.params.pipe(np.exp))

const       0.023237
nonmetro    3.338005
renter      2.654337
dtype: float64


In [None]:
import math

# exp(const) is the baseline odds of Y=1 when every predictor is 0; its
# reciprocal is the odds of the opposite outcome for that same baseline group.
# The intercept is looked up by its label: integer keys on a label-indexed
# Series are a deprecated positional fallback.
print(f'The odds of having internet at home for people who live in cities and are homeowners is {1/math.exp(model1.params["const"]):.3f}.')


The odds of having internet at home for people who live in cities and are homeowners is 43.035.


## **The odds of having internet at home for people who live in cities and are homeowners is 43.035.**

In [None]:
# Turn the yes/no race categories into 0/1 indicators (1 = 'yes', 0 = anything else).
census_df['asian'] = census_df['racasian'].eq('yes').astype(int)
census_df['black'] = census_df['racblk'].eq('yes').astype(int)



In [None]:
# Second logit model: the no-internet outcome regressed on the non-metro and
# renter indicators plus the two race indicators, with an intercept.
Y = census_df.loc[:, 'nointernet']
X = sm.add_constant(census_df.loc[:, ['nonmetro', 'renter', 'asian', 'black']])
# missing='drop' removes rows with a missing value in Y or X before fitting.
model2 = sm.Logit(endog=Y, exog=X, missing='drop').fit()
print(model2.summary())


Optimization terminated successfully.
         Current function value: 0.149815
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:             nointernet   No. Observations:                10419
Model:                          Logit   Df Residuals:                    10414
Method:                           MLE   Df Model:                            4
Date:                Wed, 27 Nov 2024   Pseudo R-squ.:                 0.03925
Time:                        00:11:20   Log-Likelihood:                -1560.9
converged:                       True   LL-Null:                       -1624.7
Covariance Type:            nonrobust   LLR p-value:                 1.311e-26
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.6721      0.081    -45.082      0.000      -3.832      -3.512
nonmetro       1.1160      0.

In [None]:
# Exponentiate the logit coefficients so they read as odds ratios.
print(model2.params.pipe(np.exp))

const       0.025423
nonmetro    3.052631
renter      2.774304
asian       0.541695
black       0.704220
dtype: float64


## Activity

1. Get a data file from US Census. Download the SPSS or Stata file.
2. Use odds to make some comparisons related to a data narrative.
3. Construct a logistic model and interpret.