# Pipeline for cleaning and transformations

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
import glob



Variables:

* Income
* Age
* Sex
* Race
* Insurance
* Education


**Important note:**  summary level 50 should be used to have county fips

## Income variables

All refer to HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) for Households

* b19001_002: <10,000 
* b19001_003: 10,000 ~ 14,999
* b19001_004: 15,000 ~ 19,999
* b19001_005: 20,000 ~ 24,999
* b19001_006: 25,000 ~ 29,999
* b19001_007: 30,000 ~ 34,999
* b19001_008: 35,000 ~ 39,999
* b19001_009: 40,000 ~ 44,999
* b19001_010: 45,000 ~ 49,999
* b19001_011: 50,000 ~ 59,999
* b19001_012: 60,000 ~ 74,999
* b19001_013: 75,000 ~ 99,999
* b19001_014: 100,000 ~ 124,999
* b19001_015: 125,000 ~ 149,999
* b19001_016: 150,000 ~ 199,999
* b19001_017: 200,000+

This are the variables that we're going to use for our analysis. Other features are not going to be eliminated

In [2]:
def raw_data_all_states(folder):
    raw_data=glob.glob(f'{folder}/*.csv')
    feature=pd.DataFrame()
    for f in raw_data:
        file = pd.DataFrame(pd.read_csv(f,low_memory=False))
        file=file.loc[file['SummaryLevel'] == 50]
        feature=feature.append(file)
        
    return feature

In [5]:
income_raw=raw_data_all_states('Income')

In [6]:

def income_cleaning(state_raw):
    income_cols = ['State','AreaName']
    income_codes1 = 'B19001_00'
    income_codes2 = 'B19001_0'
    for i in range (2,10):
        income_cols.append(income_codes1 + str(i))

    for i in range(10,18):
        income_cols.append(income_codes2 + str(i))

    x = state_raw.loc[:,income_cols]  
    return x

In [7]:
cleaned_data = income_cleaning(income_raw)

In [8]:
def change_names_income(state):
    cols_rename = {
                'B19001_002': '<10k',
                'B19001_003': '10k ~ 14,999',
                'B19001_004': '15k ~ 19,999',
                'B19001_005': '20k ~ 24,999',
                'B19001_006': '25k ~ 29,999',
                'B19001_007': '30k ~ 34,999',
                'B19001_008': '35k ~ 39,999',
                'B19001_009': '40k ~ 44,999',
                'B19001_010': '45k ~ 49,999',
                'B19001_011': '50k ~ 59,999',
                'B19001_012': '60k ~ 74,999',
                'B19001_013': '75k ~ 99,999',
                'B19001_014': '100k ~ 124,999',
                'B19001_015': '125k ~ 149,999',
                'B19001_016': '150k ~ 199,999',
                'B19001_017': '200k+'
                }
    state.rename(columns = cols_rename, inplace = True)
    
    return state

In [9]:
Cleaned_income=change_names_income(cleaned_data)

In [10]:
Cleaned_income.head(5)

Unnamed: 0,State,AreaName,<10k,"10k ~ 14,999","15k ~ 19,999","20k ~ 24,999","25k ~ 29,999","30k ~ 34,999","35k ~ 39,999","40k ~ 44,999","45k ~ 49,999","50k ~ 59,999","60k ~ 74,999","75k ~ 99,999","100k ~ 124,999","125k ~ 149,999","150k ~ 199,999",200k+
12,AK,"Aleutians East Borough, Alaska",27,36,33,48,31,26,23,27,30,70,102,123,75,48,56,33
13,AK,"Aleutians West Census Area, Alaska",30,34,28,21,38,35,27,63,37,80,108,228,148,95,117,97
14,AK,"Anchorage Municipality, Alaska",2934,2840,2799,3408,3123,3374,3594,4529,3504,6951,11302,15268,12797,8852,10437,9257
15,AK,"Bethel Census Area, Alaska",299,260,292,267,211,219,221,191,196,373,505,580,308,221,288,124
16,AK,"Bristol Bay Borough, Alaska",13,12,11,11,13,11,4,13,13,23,35,54,48,45,36,18


Este es el resultado que buscamos replicar en todas las tablas, tener solo los códigos que elegimos y reemplazarlos por el valor.

## Age & Sex variables

This features refer to SEX BY AGE for Total Population (Male)

* b01001_003: <5
* b01001_004: 5 ~ 9
* b01001_005: 10 ~ 14
* b01001_006: 15 ~ 17
* b01001_007: 18 ~ 19
* b01001_008: 20
* b01001_009: 21
* b01001_010: 22 ~ 24
* b01001_011: 25 ~ 29
* b01001_012: 30 ~ 34
* b01001_013: 35 ~ 39
* b01001_014: 40 ~ 44
* b01001_015: 45 ~ 49
* b01001_016: 50 ~ 54
* b01001_017: 55 ~ 59
* b01001_018: 60 ~ 61
* b01001_019: 62 ~ 64
* b01001_020: 65 ~ 66
* b01001_021: 67 ~ 69
* b01001_022: 70 ~ 74
* b01001_023: 75 ~ 79
* b01001_024: 80 ~ 84
* b01001_025: 85+

This features refer to SEX BY AGE for Total Population (Female)

    * b01001_027: <5
    * b01001_028: 5 ~ 9
    * b01001_029: 10 ~ 14
    * b01001_030: 15 ~ 17
    * b01001_031: 18 ~ 19
    * b01001_032: 20
    * b01001_033: 21
    * b01001_034: 22 ~ 24
    * b01001_035: 25 ~ 29
    * b01001_036: 30 ~ 34
    * b01001_037: 35 ~ 39
    * b01001_038: 40 ~ 44
    * b01001_039: 45 ~ 49
    * b01001_040: 50 ~ 54
    * b01001_041: 55 ~ 59
    * b01001_042: 60 ~ 61
    * b01001_043: 62 ~ 64
    * b01001_044: 65 ~ 66
    * b01001_045: 67 ~ 69
    * b01001_046: 70 ~ 74
    * b01001_047: 75 ~ 79
    * b01001_048: 80 ~ 84
    * b01001_049: 85+

In [3]:
age_sex_raw = raw_data_all_states('Age_sex')

In [4]:

def age_sex_cleaning(state_raw):
    age_sex_cols =['State','AreaName']
    age_sex_male1 = 'B01001_00'
    age_sex_male2 = 'B01001_0'
    age_sex_female = 'B01001_0'
    for i in range (3,10):
        age_sex_cols.append(age_sex_male1 + str(i))
    for i in range (10,26):
        age_sex_cols.append(age_sex_male2 + str(i))
        
    
    for i in range (27,50):
        age_sex_cols.append(age_sex_female + str(i))

    x = state_raw.loc[:,age_sex_cols]  
    return x

In [5]:
cleaned_age_sex = age_sex_cleaning(age_sex_raw)

In [8]:
cleaned_age_sex.head(1)

Unnamed: 0,State,AreaName,B01001_003,B01001_004,B01001_005,B01001_006,B01001_007,B01001_008,B01001_009,B01001_010,...,B01001_040,B01001_041,B01001_042,B01001_043,B01001_044,B01001_045,B01001_046,B01001_047,B01001_048,B01001_049
12,AK,"Aleutians East Borough, Alaska",48,71,74,30,42,36,20,141,...,129,113,26,50,37,18,14,13,7,10


In [9]:
def change_names_age_sex(state):
    cols_rename = {
                'B01001_003': 'Male <5',
                'B01001_004': 'Male 5 ~ 9',
                'B01001_005': 'Male 10 ~ 14',
                'B01001_006': 'Male 15 ~ 17',
            
                'B01001_007': 'Male 18 ~ 19',
                'B01001_008': 'Male 20',
                'B01001_009': 'Male 21',
                'B01001_010': 'Male 22 ~ 24',
                'B01001_011': 'Male 25 ~ 29',
                'B01001_012': 'Male 30 ~ 34',
                'B01001_013': 'Male 35 ~ 39',
                'B01001_014': 'Male 40 ~ 44',
                'B01001_015': 'Male 45 ~ 49',
                'B01001_016': 'Male 50  ~ 54',
                'B01001_017': 'Male 55 ~ 59',
                'B01001_018': 'Male 60 ~ 61',
                'B01001_019': 'Male 62 ~ 64',
                'B01001_020': 'Male 65 ~ 66',
                'B01001_021': 'Male 67 ~ 69',
                'B01001_022': 'Male 70 ~ 74',
                'B01001_023': 'Male 75 ~ 79',
                'B01001_024': 'Male 80 ~ 54',
                'B01001_025': 'Male 85 +',
                'B01001_027': 'Female <5',
                'B01001_028': 'Female 5 ~ 9',
                'B01001_029': 'Female 10 ~ 14',
                'B01001_030': 'Female 15 ~ 17',
                'B01001_031': 'Female 18 ~ 19',
                'B01001_032': 'Female 20',
                'B01001_033': 'Female 21',
                'B01001_034': 'Female 22 ~ 24',
                'B01001_035': 'Female 25 ~ 29',
                'B01001_036': 'Female 30 ~ 34',
                'B01001_037': 'Female 35 ~ 39',
                'B01001_038': 'Female 40 ~ 44',
                'B01001_039': 'Female 45 ~ 49',
                'B01001_040': 'Female 50 ~ 54',
                'B01001_041': 'Female 55 ~ 59',
                'B01001_042': 'Female 60 ~ 61',
                'B01001_043': 'Female 62 ~ 64',
                'B01001_044': 'Female 65 ~ 66',
                'B01001_045': 'Female 67 ~ 69',
                'B01001_046': 'Female 70 ~ 74',
                'B01001_047': 'Female 75 ~ 79',
                'B01001_048': 'Female 80 ~ 84',
                'B01001_049': 'Female 85+'
                }
    state.rename(columns = cols_rename, inplace = True)
    
    return state

In [17]:
def add_ages (df):
    male = 'Male'
    female = 'Female'
    yrs_5 = df[]
#hacer uso de startswith y endswitch
https://www.w3schools.com/python/ref_string_startswith.asp
    https://www.w3schools.com/python/ref_string_startswith.asp
        https://www.w3schools.com/python/ref_string_startswith.asp

In [18]:
ags = change_names_age_sex(cleaned_age_sex)

KeyError: 0

In [38]:
test1 =ags['Male <5']+ags['Female <5']

In [41]:
test1

12       93
13      190
14    22121
15     1883
16       50
      ...  
30     3224
31     1204
32     1558
33      468
34      381
Length: 3220, dtype: int64

In [19]:
example = add_ages(ags)

KeyError: 0

## Race variable

This features refer to RACE for Total Population

* b02001_002: White alone
* b02001_003: Black or African American alone
* b02001_004: American Indian and Alaska Native alone
* b02001_005: Asian alone
* b02001_006: Native Hawaiian and Other Pacific Islander alone
* b02001_007: Some other race alone
* b02001_008: Two or more races

In [96]:
race_raw = raw_data_all_states('Race')

In [99]:
def race_cleaning(state_raw):
    race_cols =['State']
    race = 'B02001_00'
    for i in range (2,9):
        race_cols.append(race + str(i))
        
    x = state_raw.loc[:,race_cols]
    return x


In [100]:
cleaned_race = race_cleaning(race_raw)

In [102]:
def change_names_race(state):
    cols_rename = {
         'B02001_002': 'White alone',
         'B02001_003': 'Black or African American alone',
         'B02001_004': 'American Indian and Alaska Native alone',
         'B02001_005': 'Asian alone',
         'B02001_006': 'Native Hawaiian and Other Pacific Islander alone',
         'B02001_007': 'Some other race alone',
         'B02001_008': 'Two or more races'
        
    }
    
    state.rename(columns = cols_rename,inplace=True)
    
    return state

In [103]:
cleaned_race_final = change_names_race(cleaned_race)
cleaned_race_final.head(1)

Unnamed: 0,State,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races
12,AK,630,231,1119,974,36,134,175


## Health insurance variable

* b27001_001: HEALTH INSURANCE COVERAGE
* b27002_001: PRIVATE HEALTH INSURANCE
* b27003_001: PUBLIC HEALTH INSURANCE
* c27004_001: EMPLOYER-BASED HEALTH INSURANCE 
* c27005_001: DIRECT-PURCHASE HEALTH INSURANCE
* c27006_001: MEDICARE COVERAGE
* c27007_001: MEDICAID/MEANS-TESTED PUBLIC COVERAGE
* c27008_001: TRICARE/MILITARY HEALTH COVERAGE
* c27009_001: VA HEALTH CARE





In [17]:
healthcare_raw = raw_data_all_states('Health_insurance')

In [24]:
def healthcare_cleaning(state_raw):

    insurance_cols =['State']
    insurance_b = 'B2700'
    insurance_c = 'C2700'
    finish= '_001'
    for i in range (1,4):
        insurance_cols.append(insurance_b + str(i) + finish)
    for i in range (4,10):
        insurance_cols.append(insurance_c + str(i) + finish)
        
    x = state_raw.loc[:,insurance_cols]
    return x



In [25]:
cleaned_healthcare= healthcare_cleaning(healthcare_raw)

In [29]:
cleaned_healthcare.head(1)

Unnamed: 0,State,B27001_001,B27002_001,B27003_001,C27004_001,C27005_001,C27006_001,C27007_001,C27008_001,C27009_001
12,AK,3294,3294,3294,3294,3294,3294,3294,3294,3294


In [30]:
def change_names_healthcare(state):
    cols_rename = {
        'B27001_001': 'HEALTH INSURANCE COVERAGE',
        'B27002_001': 'PRIVATE HEALTH INSURANCE',
        'B27003_001': 'PUBLIC HEALTH INSURANCE',
        'C27004_001': 'EMPLOYER-BASED HEALTH INSURANCE',
        'C27005_001': 'DIRECT-PURCHASE HEALTH INSURANCE',
        'C27006_001': 'MEDICARE COVERAGE',
        'C27007_001': 'MEDICAID/MEANS-TESTED PUBLIC COVERAGE',
        'C27008_001': 'TRICARE/MILITARY HEALTH COVERAGE',
        'C27009_001': 'VA HEALTH CARE'
    }
    state.rename(columns= cols_rename, inplace=True)
    return state

In [31]:
Cleaned_healthcare_final= change_names_healthcare(cleaned_healthcare)

In [32]:
Cleaned_healthcare_final.head(1)

Unnamed: 0,State,HEALTH INSURANCE COVERAGE,PRIVATE HEALTH INSURANCE,PUBLIC HEALTH INSURANCE,EMPLOYER-BASED HEALTH INSURANCE,DIRECT-PURCHASE HEALTH INSURANCE,MEDICARE COVERAGE,MEDICAID/MEANS-TESTED PUBLIC COVERAGE,TRICARE/MILITARY HEALTH COVERAGE,VA HEALTH CARE
12,AK,3294,3294,3294,3294,3294,3294,3294,3294,3294


## Education variable
## 



* c15002a: EDUCATIONAL ATTAINMENT (WHITE ALONE)
* c15002b: EDUCATIONAL ATTAINMENT (BLACK OR AFRICAN AMERICAN ALONE)
* c15002c: EDUCATIONAL ATTAINMENT (AMERICAN INDIAN AND ALASKA NATIVE ALONE)
* c15002d: EDUCATIONAL ATTAINMENT (ASIAN ALONE)
* c15002e: EDUCATIONAL ATTAINMENT (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)
* c15002f: EDUCATIONAL ATTAINMENT (SOME OTHER RACE ALONE)
* c15002g: EDUCATIONAL ATTAINMENT (TWO OR MORE RACES)
* c15002h: EDUCATIONAL ATTAINMENT (WHITE ALONE, NOT HISPANIC OR LATINO)
* c15002i: EDUCATIONAL ATTAINMENT (HISPANIC OR LATINO)



* c15010a:	FIELD OF BACHELOR'S DEGREE  (WHITE ALONE)
* c15010b:	FIELD OF BACHELOR'S DEGREE  (BLACK OR AFRICAN AMERICAN ALONE)
* c15010c:	FIELD OF BACHELOR'S DEGREE  (AMERICAN INDIAN AND ALASKA NATIVE ALONE)
* c15010d:	FIELD OF BACHELOR'S DEGREE  (ASIAN ALONE)
* c15010e:	FIELD OF BACHELOR'S DEGREE  (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)
* c15010f:	FIELD OF BACHELOR'S DEGREE  (SOME OTHER RACE ALONE)
* c15010g:	FIELD OF BACHELOR'S DEGREE  (TWO OR MORE RACES)
* c15010h:	FIELD OF BACHELOR'S DEGREE  (WHITE ALONE, NOT HISPANIC OR LATINO)
* c15010i:	FIELD OF BACHELOR'S DEGREE  (HISPANIC OR LATINO)




In [34]:
education_raw = raw_data_all_states('Education')

In [91]:
education_raw.head(1)

Unnamed: 0,SummaryLevel,State,StateFIPS,CountyFIPS,PlaceFIPS,CBSACode,CongressDistrict,GEOID,AreaName,B15001_001,...,C15010H_003,C15010H_004,C15010H_005,C15010H_006,C15010I_001,C15010I_002,C15010I_003,C15010I_004,C15010I_005,C15010I_006
12,50,AK,2,13.0,,,,05000US02013,"Aleutians East Borough, Alaska",2892,...,12,8,20,17,8,2,0,0,0,6


In [95]:
education_raw['C15002C_003']

12      96
13      49
14    1038
15     848
16      15
      ... 
30      38
31      28
32       3
33       0
34       0
Name: C15002C_003, Length: 3220, dtype: int64

In [86]:
def education_cleaning(state_raw):
    education_cols = ['State', 'AreaName']
    education_cols1 = 'C15002'
    education_cols2 = 'C15010'
    y = ['a','b','c','d','e','f','g','h','i']
    for i in range(len(y)):
        education_cols.append(education_cols1 + y[i])
        education_cols.append(education_cols2 + y[i])
        
    x = state_raw.loc[:,education_cols]
    return x

In [None]:
education_cols = ['State', 'AreaName']
education_cols1 = 'c15002'
education_cols2 = 'c15010'
y = ['a','b','c','d','e','f','g','h','i']
for i in range(len(y)):
    education_cols.append(education_cols1 + y[i])
    education_cols.append(education_cols2 + y[i])
        
print(education_cols)


In [87]:
cleaned_education = education_cleaning(education_raw)

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['C15002a', 'C15010a', 'C15002b', 'C15010b', 'C15002c',\n       ...\n       'C15010g', 'C15002h', 'C15010h', 'C15002i', 'C15010i'],\n      dtype='object', length=18). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

### Building pipeline

In [None]:
clean_pipeline = Pipeline([
    
])