In [32]:
# Notebook ssetup
import os
import numpy as np
import pandas as pd
import re

# directories
clean = 'Clean_Data/'
raw = 'Foundation/Raw Data/'
root = '/root/'

def concatenator(base_dir, folder):
  ''' 
  Takes in file directory and specific folder name.
  Returns all files in directory as a single pd.DF   
  '''

  files = os.listdir(base_dir + folder)

  # append all files to
  dfs = []

  # loop over files in folder and append to dfs list
  for file_ in files:
    df = pd.read_csv(os.path.join(base_dir, folder, file_))
    dfs.append(df)

  return pd.concat(dfs, axis=0, ignore_index=True)


# Zip Code Ground Truth
San Diego COUNTY Zip Code list is from [sdcourt.ca.gov/sdcourt/familyandchildren2/wheretofile/zipcodedirectory](https://www.sdcourt.ca.gov/sdcourt/familyandchildren2/wheretofile/zipcodedirectory).

<br><br>
**Zip Code Ground Truth will be utilize to filter out any unnecessary Zip Codes from other Data Tables.**

PDF was processed via Google Colab due to Java dependencies of AWS Sagemaker.

In [33]:
# read in GROUND TRUTH
file_path = root + raw + 'Other/' + 'Zip_Codes_by_Court_District_GROUND_TRUTH.csv'
official_zip_code_df = pd.read_csv(file_path)
official_zip_codes = official_zip_code_df['ZIP CODE']
official_zip_codes[:3]

0    91901.0
1    91902.0
2    91903.0
Name: ZIP CODE, dtype: float64

# Census Data

### Read in Cencus Population Data

In [34]:
folder_path = root +  raw + 'Population/'
files = os.listdir(folder_path)

dfs = []

for file_ in files:
  file_path = folder_path + file_
  pop_df = pd.read_excel(file_path, sheet_name='Data')

  # remove first column of Label/Total
  pop_df = pop_df.iloc[:,1:]

  zip_data = []
  # loop through multiple of 2 columns per loop
  indices = list(range(2,222,2))
  for i in range(len(indices)):

  # @ end of for loop will throw an OUT OF RANGE error due to max_
    try:
      min_ = indices[i]
      max_ = indices[i+1]

      # list to store data
      row = []
      # get row Zip Code
      row.append(pop_df.iloc[:,min_:max_].columns[0])

      # gather zip code data
      data = list(pop_df.iloc[1,min_:max_])
      # get row pop esimate
      row.append(data[0])
      # get row pop Margine Error
      row.append(data[1])
      
      # append data to master list
      zip_data.append(row)
    except:
      pass

  df = pd.DataFrame(zip_data, columns=['Zip', 'Est', 'MOE'])

  char_num = file_path.find('ACSDT5Y')
  year_string = file_path[char_num:].split('.')[0]
  year_string = year_string.split('Y')[1]

  df['Year'] = year_string

  dfs.append(df)

pop_df = pd.concat(dfs)
pop_df = pop_df.reset_index(drop=True)
pop_df.head(3)

Unnamed: 0,Zip,Est,MOE,Year
0,ZCTA5 91905,1665,±426,2020
1,ZCTA5 91906,4327,±640,2020
2,ZCTA5 91910,74233,"±2,892",2020


In [35]:
pop_df.shape

(604, 4)

## Data Cleanup

### Convert Census columns to Int

In [36]:
pop_df['Zip'] = pop_df['Zip'].str.replace('ZCTA5', '')
pop_df.head(3)

Unnamed: 0,Zip,Est,MOE,Year
0,91905,1665,±426,2020
1,91906,4327,±640,2020
2,91910,74233,"±2,892",2020


In [37]:
# remove unwanted chars
pop_df['Est'] = pop_df['Est'].str.replace(',', '')

pop_df['MOE'] = pop_df['MOE'].str.replace('±', '')
pop_df['MOE'] = pop_df['MOE'].str.replace(',', '')

print(pop_df.dtypes, '\n')
pop_df.head(3)

Zip     object
Est     object
MOE     object
Year    object
dtype: object 



Unnamed: 0,Zip,Est,MOE,Year
0,91905,1665,426,2020
1,91906,4327,640,2020
2,91910,74233,2892,2020


In [38]:
# remove unwanted chars
pop_df['Est'] = pop_df['Est'].str.replace(',', '')

pop_df['MOE'] = pop_df['MOE'].str.replace('±', '')
pop_df['MOE'] = pop_df['MOE'].str.replace(',', '')

print(pop_df.dtypes, '\n')
pop_df.head(3)

Zip     object
Est     object
MOE     object
Year    object
dtype: object 



Unnamed: 0,Zip,Est,MOE,Year
0,91905,1665,426,2020
1,91906,4327,640,2020
2,91910,74233,2892,2020


In [39]:
pop_df_columns = pop_df.columns

for col in pop_df_columns:
  pop_df[col] = pop_df[col].astype(int)

pop_df.dtypes

Zip     int64
Est     int64
MOE     int64
Year    int64
dtype: object

### Remove Unwanted Zip Codes from Census

In [40]:
pop_df.shape

(604, 4)

In [41]:
# compare pop_df zip code to official zip codes
pop_df = pop_df[pop_df['Zip'].isin(official_zip_codes)].copy()
pop_df.shape

(604, 4)

# Merge Pop W/ Weather Station Location Name
Which will then be merged via SageMaker Data Wrangler

In [42]:
clean_data_dir = root + raw + 'Other/'
zip_code_loc_df = pd.read_csv(clean_data_dir + 'Zipcode_Per_Location.csv')
zip_code_loc_df.head()

Unnamed: 0,linbergh_field,Campo,Carlsbad,Oceanside Airport,San Diego Mont. Field,Ramona,San Diego Brown Field
0,92152,92021.0,92009.0,92069.0,92123.0,92064.0,91902.0
1,92106,92019.0,92029.0,92083.0,92124.0,92040.0,91910.0
2,92140,91935.0,92024.0,92084.0,92120.0,92065.0,91914.0
3,92107,91901.0,92007.0,92028.0,92119.0,92036.0,91978.0
4,92110,91917.0,92091.0,92054.0,91942.0,91916.0,91913.0


In [43]:
locations = []

for index, row in pop_df.iterrows():
  zip = row['Zip']

  location_boolean = zip_code_loc_df.isin([zip]).any()
  location = location_boolean[location_boolean == True].index
  
  try:
    locations.append(location[0])
  except:
    continue

pop_df['weather_station'] = locations

In [44]:
pop_df.head(3)

Unnamed: 0,Zip,Est,MOE,Year,weather_station
0,91905,1665,426,2020,Campo
1,91906,4327,640,2020,Campo
2,91910,74233,2892,2020,San Diego Brown Field


In [45]:
clean_data_dir = root + clean

pop_df.to_parquet(clean_data_dir + 'census_pop.parquet.gzip', compression='gzip', index=False)

FileNotFoundError: [Errno 2] Failed to open local file '/root/Foundation/Clean_Data/census_pop.parquet.gzip'. Detail: [errno 2] No such file or directory

In [None]:
print('Population Data is READY TO ROCK!!!!')

In [None]:
import boto3
import sagemaker

# Sagemaker session
sess = sagemaker.Session()

# bucket = "my-bucket"
bucket = sess.default_bucket()

file_name = 'census_pop.parquet.gzip'
file_path = 'Clean_Data/' + file_name

# Upload flow to S3
s3_client = boto3.client("s3")
s3_client.upload_file(clean_data_dir + file_name, bucket, file_path)