Research Question: How does social class or income affect access to health care?
Strategy: 
- pull data from the GSS columns relating to health care, income, and other things in that topic, and then analyze the trends we find. 

1. Change variable names to be more clear 
2. replace nans 
3. make some pretty graphs 

In [15]:
import pandas as pd
data = pd.read_csv('./Project_Data.csv')

In [3]:
data.head()

Unnamed: 0,wrkstat,income,satfin,finalter,health,helpsick,hlthinsr,doc13,doc14,health1,...,hlthcare,hlthall,hlthcovr,hlthtype,hrdshp6,hlthacc1,hlthacc4,hlthacc3,hlthacc2,prvdhlth
0,working full time,,not satisfied at all,better,good,,,,,,...,,,,,,,,,,
1,retired,,more or less satisfied,stayed same,fair,,,,,,...,,,,,,,,,,
2,working part time,,pretty well satisfied,better,excellent,,,,,,...,,,,,,,,,,
3,working full time,,not satisfied at all,stayed same,good,,,,,,...,,,,,,,,,,
4,keeping house,,pretty well satisfied,better,good,,,,,,...,,,,,,,,,,


In [4]:
# .shape prints out the number of rows and columns 
print(data.shape, '\n')

# .columns prints out the names of the columns 
# There are 25 columns as specificed in the data.shape output
print(data.columns.tolist())

(72426, 25) 

['wrkstat', 'income', 'satfin', 'finalter', 'health', 'helpsick', 'hlthinsr', 'doc13', 'doc14', 'health1', 'hlthplan', 'diffcare', 'insrlmts', 'insrchng', 'emphlth', 'hlthcare', 'hlthall', 'hlthcovr', 'hlthtype', 'hrdshp6', 'hlthacc1', 'hlthacc4', 'hlthacc3', 'hlthacc2', 'prvdhlth']


In [5]:
# Checking the unique values in finalter
# Change in Financial Situation : During the last few years, has your financial situation been getting better, worse, or has it stayed the same?
# This question was asked in all 34 years of the survey
print(data['finalter'].unique())
print(data['finalter'].isnull().sum())
data['finalter'].fillna('No Response', inplace= True)




['better' 'stayed same' 'worse' nan 'finalter']
4795


In [6]:
data.head()

Unnamed: 0,wrkstat,income,satfin,finalter,health,helpsick,hlthinsr,doc13,doc14,health1,...,hlthcare,hlthall,hlthcovr,hlthtype,hrdshp6,hlthacc1,hlthacc4,hlthacc3,hlthacc2,prvdhlth
0,working full time,,not satisfied at all,better,good,,,,,,...,,,,,,,,,,
1,retired,,more or less satisfied,stayed same,fair,,,,,,...,,,,,,,,,,
2,working part time,,pretty well satisfied,better,excellent,,,,,,...,,,,,,,,,,
3,working full time,,not satisfied at all,stayed same,good,,,,,,...,,,,,,,,,,
4,keeping house,,pretty well satisfied,better,good,,,,,,...,,,,,,,,,,


In [7]:
# Cleaning the Income Variable 
# To clean the income variable, we decided to drop the nan because for our research question, we want to do analysis on income related to 
# healthcare data points, and it was not useful to keep the nan values so we dropped them
print(data['income'].unique())
print(data['income'].isnull().sum())
data = data.dropna(subset=['income'])

print(data['income'].isnull().sum())

[nan '$10,000 to $14,999' '$7,000 to $7,999' '$4,000 to $4,999'
 '$1,000 to $2,999' '$15,000 to $19,999' '$5,000 to $5,999'
 '$20,000 to $24,999' '$3,000 to $3,999' 'under $1,000' '$8,000 to $9,999'
 '$25,000 or more' '$6,000 to $6,999' 'income']
8951
0


In [8]:
data.head()

Unnamed: 0,wrkstat,income,satfin,finalter,health,helpsick,hlthinsr,doc13,doc14,health1,...,hlthcare,hlthall,hlthcovr,hlthtype,hrdshp6,hlthacc1,hlthacc4,hlthacc3,hlthacc2,prvdhlth
1613,working full time,"$10,000 to $14,999",pretty well satisfied,stayed same,fair,,,,,,...,,,,,,,,,,
1614,keeping house,"$7,000 to $7,999",more or less satisfied,stayed same,good,,,,,,...,,,,,,,,,,
1615,working full time,"$10,000 to $14,999",more or less satisfied,stayed same,excellent,,,,,,...,,,,,,,,,,
1616,working full time,"$10,000 to $14,999",not satisfied at all,stayed same,excellent,,,,,,...,,,,,,,,,,
1617,keeping house,"$10,000 to $14,999",pretty well satisfied,better,good,,,,,,...,,,,,,,,,,


In [9]:
# Cleaning the 'health' variable 
# From the GSS codebook, the health variable stands for the "conditon of health."
# The question asked was "Would you say your own health, in general, is excellent, good, fair, or  poor?"

# Check for the Number of Missing Values
print(data['health'].unique())
missing = data['health'].isnull().sum()
print("Missing values Before Cleaning: ", missing)

# Impute missing values with 'No Response'
# Replace the NaN values in the health variable with 'No Response' since health is a categorical variable.
# 'No Response' is more helpful than NAN in relation to our research question. 
# When making our graphs and visualization, we can represent NaN with 'No Response' to keep our categorical variables consistent.
data['health'].fillna('No Response', inplace= True)

# Check for Number of Missing Values After CLeaning
unique_types_cleaned = data['health'].unique()
print("Unique 'Type' values after cleaning:", unique_types_cleaned)

#Print missing values after cleaning
missing = data['health'].isnull().sum()
print("Missing values After Cleaning: ", missing, '\n')


['fair' 'good' 'excellent' 'poor' 'health' nan]
Missing values Before Cleaning:  15409
Unique 'Type' values after cleaning: ['fair' 'good' 'excellent' 'poor' 'health' 'No Response']
Missing values After Cleaning:  0 



In [10]:
data.head()

Unnamed: 0,wrkstat,income,satfin,finalter,health,helpsick,hlthinsr,doc13,doc14,health1,...,hlthcare,hlthall,hlthcovr,hlthtype,hrdshp6,hlthacc1,hlthacc4,hlthacc3,hlthacc2,prvdhlth
1613,working full time,"$10,000 to $14,999",pretty well satisfied,stayed same,fair,,,,,,...,,,,,,,,,,
1614,keeping house,"$7,000 to $7,999",more or less satisfied,stayed same,good,,,,,,...,,,,,,,,,,
1615,working full time,"$10,000 to $14,999",more or less satisfied,stayed same,excellent,,,,,,...,,,,,,,,,,
1616,working full time,"$10,000 to $14,999",not satisfied at all,stayed same,excellent,,,,,,...,,,,,,,,,,
1617,keeping house,"$10,000 to $14,999",pretty well satisfied,better,good,,,,,,...,,,,,,,,,,


In [27]:
# Cleaning the 'wrkstat' variable 
# From the GSS codebook, the wrkstat variable stands for the "labor force status."
# The question asked was "Last week were you working full time, part time, going to school, keeping house, or what?"

# Check for the Number of Missing Values
print(data['wrkstat'].unique())
missing = data['wrkstat'].isnull().sum()
print("Missing values Before Cleaning: ", missing, '\n')

# Impute missing values with 'No Response'
# Replace the NaN values in the wrkstat variable with No Response since wrkstat is a categorical variable.
# No Response is more helpful than NaN when we make graphs and visualizations.
# It will make it easier to understand our data and draw conclusions when cross referencing this variable with orther variables.
data['wrkstat'].fillna('No Response', inplace= True)

# Count the number of instances of each unique value in the column 
value_counts = data['wrkstat'].value_counts()
print("Number of Instances before Dropping: ", value_counts, '\n')

# Since 36 values within the column 'wrkstat' are 'wrkstat', we decided to just drop these values
# Having the name of the column as a unique value won't be helpful when we create our visualizations
# Drop the instances that say 'wrkstat'
data = data[data['wrkstat'] != 'wrkstat']

# Check for Number of Missing Values After CLeaning
unique_types_cleaned = data['wrkstat'].unique()
print("Unique 'Type' values after cleaning:", unique_types_cleaned, '\n')

#Print missing values after cleaning
missing = data['wrkstat'].isnull().sum()
print("Missing values After Cleaning: ", missing, '\n')

['working full time' 'retired' 'working part time' 'keeping house'
 'in school' 'unemployed, laid off, looking for work'
 'with a job, but not at work because of temporary illness, vacation, strike'
 'other' 'No Response']
Missing values Before Cleaning:  0
Number of Instances before Dropping:  wrkstat
working full time                                                             35267
retired                                                                       10886
keeping house                                                                 10764
working part time                                                              7430
unemployed, laid off, looking for work                                         2621
in school                                                                      2187
other                                                                          1643
with a job, but not at work because of temporary illness, vacation, strike     1556
No Response             

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['wrkstat'].fillna('No Response', inplace= True)


In [17]:
data.head()

Unnamed: 0,wrkstat,income,satfin,finalter,health,helpsick,hlthinsr,doc13,doc14,health1,...,hlthcare,hlthall,hlthcovr,hlthtype,hrdshp6,hlthacc1,hlthacc4,hlthacc3,hlthacc2,prvdhlth
0,working full time,,not satisfied at all,better,good,,,,,,...,,,,,,,,,,
1,retired,,more or less satisfied,stayed same,fair,,,,,,...,,,,,,,,,,
2,working part time,,pretty well satisfied,better,excellent,,,,,,...,,,,,,,,,,
3,working full time,,not satisfied at all,stayed same,good,,,,,,...,,,,,,,,,,
4,keeping house,,pretty well satisfied,better,good,,,,,,...,,,,,,,,,,
