In [5]:
import requests
import pandas as pd 

# Define the API URL 
url = "https://api.census.gov/data/2019/acs/acs1/pums?get=RACBLK,RACAIAN,RACWHT,MAR,SEX,ENG,HISP,SCHL,MIGSP,YOEP&AGEP=18:99&WAGP=0&WAGP=4:999999&NATIVITY=2&AGEP=18:99&WAGP=0&WAGP=4:999999"

# Make the API request with SSL verification disabled
response = requests.get(url, verify=False)

# checks if the request was successful
if response.status_code == 200:
    data = response.json()
    df = pd.DataFrame(data[1:], columns=data[0])
    print(df)



       RACBLK RACAIAN RACWHT MAR SEX ENG HISP SCHL MIGSP  YOEP AGEP    WAGP  \
0           0       0      0   5   2   0    1   19     0  1998   21    2000   
1           0       0      0   1   1   4    1   16     0  1989   71       0   
2           0       0      0   1   1   3    1    8     0  1968   83       0   
3           0       0      1   5   1   0    2   16    48  2015   18    1200   
4           0       0      1   5   1   4    2   16     0  2007   53   31200   
...       ...     ...    ...  ..  ..  ..  ...  ...   ...   ...  ...     ...   
357812      0       0      1   2   1   1    2   17     0  1962   86   50000   
357813      0       0      1   3   2   1    2   15     0  1974   62    4500   
357814      0       0      1   3   2   1    1   21     0  1998   60       0   
357815      0       0      1   1   1   1    1   22     0  1973   77  120000   
357816      0       0      1   5   2   0    3   21     0  2000   47       0   

       NATIVITY  
0             2  
1             2

In [6]:
import numpy as np 
import statsmodels.api as sm 
df = df.dropna() # drop missing values in data set
print('shape of data', df.shape) # print shape of data
df.head()  # print first five rows of data set

shape of data (357817, 13)


Unnamed: 0,RACBLK,RACAIAN,RACWHT,MAR,SEX,ENG,HISP,SCHL,MIGSP,YOEP,AGEP,WAGP,NATIVITY
0,0,0,0,5,2,0,1,19,0,1998,21,2000,2
1,0,0,0,1,1,4,1,16,0,1989,71,0,2
2,0,0,0,1,1,3,1,8,0,1968,83,0,2
3,0,0,1,5,1,0,2,16,48,2015,18,1200,2
4,0,0,1,5,1,4,2,16,0,2007,53,31200,2


In [24]:
# Transforming the data for analysis 

df.dropna(inplace=True) # drop missing values in data set

df['WAGP'] = pd.to_numeric(df['WAGP'], errors='coerce') 
    # convert wage to numeric
df['LOG_WAGE'] = np.log(df['WAGP']) # create a new variable that is the log of wage

#English Fluency Values --> zero value was eliminated to only look at individuals with a second language 
    # 1 and 2 will be if the individual has a strong English fluency and 3 and 4 being weak level 0f English fluency
df['ENG'] = pd.to_numeric(df['ENG'], errors='coerce') 
    # convert English fluency to numeric
df = df[df['ENG'] != '0'] 
    # drop missing values in data set
df['ENGLISH'] = df['ENG'].map(lambda x: 1 if x < 3 else 0)
    # changes it into a binary, 1 being strong English fluency and 0 being weak English fluency

# Education Attainment
    # 01-15 -Did not complete high school. 
    # 16-High school graduate - regular high school diploma; 
    # 17-High school graduate - GED or alternative credential
    # 18-19-Some college, no degree, 
    # 20-Associate's degree; 
    # 21-Bachelor's degree; 
    # 22-24-Post graduate degree;
df['SCHL'] = pd.to_numeric(df['SCHL'], errors='coerce')
    # convert education to numeric
df = df[df['SCHL'] != '0']
    # drop missing values in data set
df['NOHS'] = df['SCHL'].map(lambda x: 1 if x < 16 else 0) 
    # changes it into a binary, 1 being no high school and 0 being high school or higher
df['HS'] = df['SCHL'].map(lambda x: 1 if x == 16 or x == 17 else 0)
    # changes it into a binary, 1 being high school or GED and 0 being no high school or higher
df['COLL'] = df['SCHL'].map(lambda x: 1 if x == 18 or x == 19 or x == 20 else 0)
    # changes it into a binary, 1 being some college or associates and 0 being no college or higher
df['BACH'] = df['SCHL'].map(lambda x: 1 if x == 21 else 0)
    # changes it into a binary, 1 being a bachelors degree and 0 being no bachelors or higher
df['POST'] = df['SCHL'].map(lambda x: 1 if x > 21 else 0)
    # changes it into a binary, 1 being a post graduate degree and 0 being no post graduate or higher
# age 
df['AGEP']= pd.to_numeric(df['AGEP'], errors='coerce')
    # convert age to numeric
# Gender 
df['SEX'] = pd.to_numeric(df['SEX'], errors='coerce')
    # convert sex to numeric 
df['Male'] = df['SEX'].map(lambda x: 1 if x == 1 else 0)
    # changes it into a binary, 1 being male

#Martial Status
    # 1-Married
    # 2-Divorced
    # 3-Separated
    # 4-Widowed
    # 5-Never married
df['MAR'] = pd.to_numeric(df['MAR'], errors='coerce')   
    # convert marital status to numeric 
df['MARRIED'] = df['MAR'].map(lambda x: 1 if x == 1 else 0)
    # changes it into a binary, 1 being married and 0 being not married

#Continental Origin, In the hopes of making the data more manageable, I will only group people by continent of origin not country. 
df['MIGSP'] = pd.to_numeric(df['MIGSP'], errors='coerce')
    # convert continental origin to numeric 
df['EURO'] = df['MIGSP'].map(lambda x: 1 if x == 1 else 0)
    # changes it into a binary, 1 being European and 0 being non-European
df['AFRICA'] = df['MIGSP'].map(lambda x: 1 if x == 2 else 0)
    # changes it into a binary, 1 being African and 0 being non-African
df['NORTH'] = df['MIGSP'].map(lambda x: 1 if x == 3 else 0)
    # changes it into a binary, 1 being North American and 0 being non-North American
df['South'] = df['MIGSP'].map(lambda x: 1 if x == 4 else 0)
    # changes it into a binary, 1 being South/Central American and 0 being non-South American
df['ASIA'] = df['MIGSP'].map(lambda x: 1 if x == 5 else 0)
    # changes it into a binary, 1 being Asian and 0 being non-Asian
df['OCEANIA'] = df['MIGSP'].map(lambda x: 1 if x == 6 else 0)
    # changes it into a binary, 1 being Oceanian and 0 being non-Oceanian

#Hispanic Origin
df['HISP'] = pd.to_numeric(df['HISP'], errors='coerce')
df['HISPANIC'] = df['HISP'].map(lambda x: 1 if x == 1 else 0)
    # changes it into a binary, 1 being Hispanic and 0 being non-Hispanic

#Race
df['RACWHT'] = pd.to_numeric(df['RACWHT'], errors='coerce')
df['RACBLK'] = pd.to_numeric(df['RACBLK'], errors='coerce')
df['WHITE'] = df['RACWHT'].map(lambda x: 1 if x == 1 else 0)
    # changes it into a binary, 1 being White and 0 being non-White
df['BLACK'] = df['RACBLK'].map(lambda x: 1 if x == 1 else 0)
    # changes it into a binary, 1 being Black and 0 being non-Black

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [28]:
#Summary 
df.dropna(inplace=True) # drop missing values in data set
selected_columns = ['LOG_WAGE', 'ENGLISH', 'NOHS', 'HS', 'COLL', 'BACH', 'POST', 'AGEP', 'Male', 'MARRIED', 'EURO', 'AFRICA', 'NORTH', 'South', 'ASIA', 'OCEANIA', 'HISPANIC', 'WHITE', 'BLACK']
summary = df[selected_columns].describe()
print(summary)

           LOG_WAGE        ENGLISH           NOHS             HS  \
count  3.578170e+05  357817.000000  357817.000000  357817.000000   
mean           -inf       0.772414       0.233974       0.209562   
std             NaN       0.419274       0.423357       0.406997   
min            -inf       0.000000       0.000000       0.000000   
25%             NaN       1.000000       0.000000       0.000000   
50%    9.680344e+00       1.000000       0.000000       0.000000   
75%    1.075790e+01       1.000000       0.000000       0.000000   
max    1.348283e+01       1.000000       1.000000       1.000000   

                COLL           BACH           POST           AGEP  \
count  357817.000000  357817.000000  357817.000000  357817.000000   
mean        0.206368       0.194267       0.155828      49.904761   
std         0.404698       0.395636       0.362693      17.098871   
min         0.000000       0.000000       0.000000      18.000000   
25%         0.000000       0.000000       

  sqr = _ensure_numeric((avg - values) ** 2)
  diff_b_a = subtract(b, a)


In [34]:
# Check for NaNs and Infs
print(df.isnull().sum())  # Count NaNs in each column
print(np.isinf(df).sum())  # Count Infs in each column


RACBLK      0
RACAIAN     0
RACWHT      0
MAR         0
SEX         0
ENG         0
HISP        0
SCHL        0
MIGSP       0
YOEP        0
AGEP        0
WAGP        0
NATIVITY    0
LOG_WAGE    0
ENGLISH     0
NOHS        0
HS          0
COLL        0
BACH        0
POST        0
Male        0
MARRIED     0
EURO        0
AFRICA      0
NORTH       0
South       0
ASIA        0
OCEANIA     0
HISPANIC    0
WHITE       0
BLACK       0
dtype: int64


TypeError: ufunc 'isinf' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [33]:
# Drop rows with NaNs
df.dropna(inplace=True)

# Impute NaNs with a specific value
#df.fillna(value, inplace=True)

# Replace Infs with a specific value
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with Infs
df = df[~df.isin([np.inf, -np.inf]).any(axis=1)]


In [35]:
#regression model
Predictor_columns = ['LOG_WAGE', 'NOHS', 'HS', 'COLL', 'BACH', 'POST', 'AGEP', 'Male', 'MARRIED', 'EURO', 'AFRICA', 'NORTH', 'South', 'ASIA', 'OCEANIA', 'HISPANIC', 'WHITE', 'BLACK']
probit_model = sm.Probit(df['ENGLISH'], df[Predictor_columns]) 
probit_result = probit_model.fit() # fit probit model

Optimization terminated successfully.
         Current function value: 0.359015
         Iterations 7


LinAlgError: Singular matrix

In [None]:
# Education Attainment
# 01-15 -Did not complete high school. 
# 16-High school graduate - regular high school diploma; 
# 17-High school graduate - GED or alternative credential
# 18-19-Some college, no degree, 
# 20-Associate's degree; 
# 21-Bachelor's degree; 
# 22-24-Post graduate degree;
# High School & GED
Eng$HS<- ifelse(Eng$SCHL =="16"|Eng$SCHL=="17", 1,0)
# Some College & Associate 
Eng$SC<- ifelse(Eng$SCHL =="18"| Eng$SCHL == "19" | Eng$SCHL == "20", 1,0)
# Bachelor's Degree 
Eng$BACH <- ifelse(Eng$SCHL =="21",1,0)
# Post undergraduate 
Eng$GRAD <- ifelse(Eng$SCHL == "22" | Eng$SCHL == "23" | Eng$SCHL == "24", 1,0)

In [None]:
from sklearn.linear_model import Ridge

# Initialize Ridge regression model with regularization parameter alpha
ridge_model = Ridge(alpha=1.0)

# Fit Ridge regression model
ridge_model.fit(X_train, y_train)
