# Pipeline for cleaning and transformations

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
import glob


Variables:

* Income
* Age
* Sex
* Race
* Insurance
* Education

This test is run for a single state first (Alaska, AK) and will then be modularized so it can be applied to all states.

**Important note:** summary level 50 must be used so that each row carries a county FIPS code.

## Income variables

All refer to HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) for Households

* b19001_002: <10,000 
* b19001_003: 10,000 ~ 14,999
* b19001_004: 15,000 ~ 19,999
* b19001_005: 20,000 ~ 24,999
* b19001_006: 25,000 ~ 29,999
* b19001_007: 30,000 ~ 34,999
* b19001_008: 35,000 ~ 39,999
* b19001_009: 40,000 ~ 44,999
* b19001_010: 45,000 ~ 49,999
* b19001_011: 50,000 ~ 59,999
* b19001_012: 60,000 ~ 74,999
* b19001_013: 75,000 ~ 99,999
* b19001_014: 100,000 ~ 124,999
* b19001_015: 125,000 ~ 149,999
* b19001_016: 150,000 ~ 199,999
* b19001_017: 200,000+

These are the variables that we're going to use for our analysis. Other features are not going to be eliminated.

In [2]:
def raw_data_all_states(folder):
    """Load every CSV in ``folder`` and keep only the summary-level-50 rows.

    Parameters
    ----------
    folder : str
        Directory that is searched (non-recursively) for ``*.csv`` files.
        Every file must contain a ``SummaryLevel`` column.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``SummaryLevel`` equals 50 from all files, stacked in
        file-name order. Each file's original row index is preserved.
        An empty DataFrame is returned when no CSV file is found.
    """
    # Sort so the row order does not depend on the file system's listing order.
    csv_paths = sorted(glob.glob(f'{folder}/*.csv'))

    frames = []
    for path in csv_paths:
        state = pd.read_csv(path, low_memory=False)
        frames.append(state.loc[state['SummaryLevel'] == 50])

    # pd.concat rejects an empty list, so keep the original "no files" result.
    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; one concat at the end also
    # avoids re-copying the accumulated frame on every iteration.
    return pd.concat(frames)

In [3]:
# Load the summary-level-50 rows from every CSV in the 'Income' folder.
income_raw = raw_data_all_states(folder='Income')

In [6]:

def income_cleaning(state_raw):
    """Return only the identifier and household-income columns of ``state_raw``.

    The result holds ``State`` and ``AreaName`` followed by the sixteen
    income-bracket columns ``B19001_002`` ... ``B19001_017``, in that order.
    The ``KeyError`` raised by pandas propagates if any of them is missing.
    """
    # Zero-pad the bracket number to three digits: 2 -> '002', 17 -> '017'.
    bracket_cols = ['B19001_' + str(n).zfill(3) for n in range(2, 18)]
    wanted = ['State', 'AreaName'] + bracket_cols
    return state_raw.loc[:, wanted]

In [9]:
# Keep only State, AreaName and the B19001 income-bracket columns.
cleaned_data = income_cleaning(state_raw=income_raw)

In [12]:
def change_names_income(state):
    """Give the B19001 income-bracket columns human-readable labels.

    ``state`` is renamed in place and the same object is returned. Columns
    other than ``B19001_002`` ... ``B19001_017`` keep their names.
    """
    # Labels are listed in code order, starting at B19001_002.
    bracket_labels = (
        '<10k',
        '10k ~ 14,999',
        '15k ~ 19,999',
        '20k ~ 24,999',
        '25k ~ 29,999',
        '30k ~ 34,999',
        '35k ~ 39,999',
        '40k ~ 44,999',
        '45k ~ 49,999',
        '50k ~ 59,999',
        '60k ~ 74,999',
        '75k ~ 99,999',
        '100k ~ 124,999',
        '125k ~ 149,999',
        '150k ~ 199,999',
        '200k+',
    )
    code_to_label = {
        'B19001_' + str(position).zfill(3): label
        for position, label in enumerate(bracket_labels, start=2)
    }
    state.rename(columns=code_to_label, inplace=True)

    return state

In [13]:
# Rename the B19001 codes to readable bracket labels (cleaned_data itself is renamed too).
Cleaned_income = change_names_income(state=cleaned_data)

## Age & Sex variables

These features refer to SEX BY AGE for Total Population (Male)

* b01001_003: <5
* b01001_004: 5 ~ 9
* b01001_005: 10 ~ 14
* b01001_006: 15 ~ 17
* b01001_007: 18 ~ 19
* b01001_008: 20
* b01001_009: 21
* b01001_010: 22 ~ 24
* b01001_011: 25 ~ 29
* b01001_012: 30 ~ 34
* b01001_013: 35 ~ 39
* b01001_014: 40 ~ 44
* b01001_015: 45 ~ 49
* b01001_016: 50 ~ 54
* b01001_017: 55 ~ 59
* b01001_018: 60 ~ 61
* b01001_019: 62 ~ 64
* b01001_020: 65 ~ 66
* b01001_021: 67 ~ 69
* b01001_022: 70 ~ 74
* b01001_023: 75 ~ 79
* b01001_024: 80 ~ 84
* b01001_025: 85+

These features refer to SEX BY AGE for Total Population (Female)

* b01001_027: <5
* b01001_028: 5 ~ 9
* b01001_029: 10 ~ 14
* b01001_030: 15 ~ 17
* b01001_031: 18 ~ 19
* b01001_032: 20
* b01001_033: 21
* b01001_034: 22 ~ 24
* b01001_035: 25 ~ 29
* b01001_036: 30 ~ 34
* b01001_037: 35 ~ 39
* b01001_038: 40 ~ 44
* b01001_039: 45 ~ 49
* b01001_040: 50 ~ 54
* b01001_041: 55 ~ 59
* b01001_042: 60 ~ 61
* b01001_043: 62 ~ 64
* b01001_044: 65 ~ 66
* b01001_045: 67 ~ 69
* b01001_046: 70 ~ 74
* b01001_047: 75 ~ 79
* b01001_048: 80 ~ 84
* b01001_049: 85+

In [None]:
# Column list for the age/sex table: 'State' plus the male codes
# B01001_003 ... B01001_025.
age_sex_male1 = 'B01001_00'  # prefix for one-digit codes (3-9)
age_sex_male2 = 'B01001_0'   # prefix for two-digit codes (10-25)
age_sex_cols = ['State']
age_sex_cols.extend(age_sex_male1 + str(code) for code in range(3, 10))
age_sex_cols.extend(age_sex_male2 + str(code) for code in range(10, 26))



In [None]:
# Add the female codes B01001_027 ... B01001_049 to the age/sex column list.
age_sex_female = 'B01001_0'
female_cols = [age_sex_female + str(i) for i in range(27, 50)]
# Skip codes that are already listed so re-running this cell does not add
# the female columns a second time (duplicate labels would then be selected).
age_sex_cols.extend([col for col in female_cols if col not in age_sex_cols])

In [None]:
#age_sex_cols

In [None]:
# Read the age/sex CSV and preview its first row.
age_sex_path = 'AK_age_sex.csv'
age_sex_raw = pd.read_csv(age_sex_path)
age_sex_raw.head(n=1)

## Race variable

These features refer to RACE for Total Population

* b02001_002: White alone
* b02001_003: Black or African American alone
* b02001_004: American Indian and Alaska Native alone
* b02001_005: Asian alone
* b02001_006: Native Hawaiian and Other Pacific Islander alone
* b02001_007: Some other race alone
* b02001_008: Two or more races

In [None]:
# Read the race CSV and preview its first row.
race_path = 'AK_race.csv'
race_raw = pd.read_csv(race_path)
race_raw.head(n=1)

In [None]:
# Column list for the race table: 'State' plus B02001_002 ... B02001_008.
race = 'B02001_00'
race_cols = ['State'] + [race + str(code) for code in range(2, 9)]


In [None]:
#race_cols

## Health insurance variable

* b27001_001: HEALTH INSURANCE COVERAGE
* b27002_001: PRIVATE HEALTH INSURANCE
* b27003_001: PUBLIC HEALTH INSURANCE
* c27004_001: EMPLOYER-BASED HEALTH INSURANCE 
* c27005_001: DIRECT-PURCHASE HEALTH INSURANCE
* c27006_001: MEDICARE COVERAGE
* c27007_001: MEDICAID/MEANS-TESTED PUBLIC COVERAGE
* c27008_001: TRICARE/MILITARY HEALTH COVERAGE
* c27009_001: VA HEALTH CARE




In [None]:
# Read the insurance CSV and preview its first row.
insurance_path = 'AK_insurance.csv'
insurance_raw = pd.read_csv(insurance_path)
insurance_raw.head(n=1)

In [None]:
# Column list for the insurance tables: 'State', then B27001-B27003 and
# C27004-C27009, each with the '_001' suffix.
insurance_b = 'B2700'
insurance_c = 'C2700'
finish = '_001'
b_tables = [insurance_b + str(table) + finish for table in range(1, 4)]
c_tables = [insurance_c + str(table) + finish for table in range(4, 10)]
insurance_cols = ['State'] + b_tables + c_tables



In [None]:
#insurance_cols

In [None]:
# Select rows 0 and 1 of the insurance columns, then discard row 1 so that
# only row 0 remains.
alaska_insurance = insurance_raw.loc[[0, 1], insurance_cols].drop([1])
alaska_insurance.head()

In [None]:
# Split the insurance columns by their share of missing values:
# under 30 % goes to keep_cols, everything else to drop_cols.
samples = len(insurance_raw)
keep_cols, drop_cols = [], []

for col in insurance_raw.columns:
    nas = insurance_raw[col].isna().sum()
    percent = round(100*nas/samples, 2)
    # Report only the columns that actually contain missing values.
    if nas > 0:
        print('{0:<20} {1:<20} {2}%'.format(col,nas,percent))
    bucket = keep_cols if percent < 30 else drop_cols
    bucket.append(col)

In [None]:
# Show which columns were kept and which were dropped by the missing-value scan.
# The label previously read "Kepp cols"; the typo is fixed here.
print("Keep cols: ",keep_cols , "\n\n Drop cols:" , drop_cols)

In [None]:
## Education variable

