In [None]:
# Data Transformation
# Version 1
# 1/26/25
# 23:00

In [1]:
# Importing Modules

# Importing SYS
import sys

# Importing OS
import os

# Add the root directory to sys.path
sys.path.append(os.path.abspath(".."))

# Now you can use absolute imports
from d497_helpers import folder_manager as fm, archive_module, checkpoint_helper as CheckPoint, database_helper as db_tool, config

# Pandas
import pandas as pd

# Numpy
import numpy as np


In [5]:
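# Function redistributes births recorded under the unspecified county ("C999") across the other counties of the same state, year and month within one chunk of data.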

def process_chunk(chunk, rng=None):
    """Redistribute unspecified-county births across the other counties.

    Rows are grouped by ``state_fipcode``, ``year_code`` and ``month_code``.
    Inside each group, the births of rows whose ``county_fipcode`` is
    ``'C999'`` are summed and split evenly over the remaining rows; any
    remainder is handed out one birth at a time to randomly chosen rows.
    The ``'C999'`` rows themselves are removed from the result.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must contain the columns ``state_fipcode``, ``year_code``,
        ``month_code``, ``county_fipcode`` and ``births``.
    rng : numpy.random.Generator, optional
        Random source used to pick the rows that receive the remainder.
        Pass a seeded generator for reproducible output. When omitted, the
        global ``np.random`` state is used.

    Returns
    -------
    pandas.DataFrame
        The updated rows with a fresh 0..n-1 index and no ``'C999'`` rows.
        An empty frame with the same columns is returned when ``chunk``
        produces no groups (for example, when it is empty).
    """
    # Local import keeps this cell runnable without editing the import cell.
    import warnings

    # np.random and Generator both expose choice(a, size=..., replace=...).
    chooser = np.random if rng is None else rng

    # List to store updated rows
    updated_rows = []

    # Group by state, year, and month
    grouped = chunk.groupby(['state_fipcode', 'year_code', 'month_code'])
    for (state, year, month), group in grouped:
        is_unspecified = group['county_fipcode'] == 'C999'
        if not is_unspecified.any():
            # If no unspecified county, add the group as-is
            updated_rows.append(group)
            continue

        # Total births recorded against the unspecified county
        unspecified_births = int(group.loc[is_unspecified, 'births'].sum())

        # Copy so the caller's frame is never modified through a slice
        other_counties = group[~is_unspecified].copy()
        num_counties = len(other_counties)

        if num_counties == 0:
            # Nothing to redistribute to: the births are dropped together with
            # the C999 row. Say so instead of losing them silently.
            warnings.warn(
                f"Dropping {unspecified_births} births for state {state}, "
                f"year {year}, month {month}: only the unspecified county "
                f"(C999) is present.",
                stacklevel=2,
            )
            updated_rows.append(other_counties)
            continue

        # Distribute births evenly
        per_county, remainder = divmod(unspecified_births, num_counties)
        other_counties['births'] += per_county

        # Distribute remainder to random counties (each at most once)
        if remainder > 0:
            remainder_indices = chooser.choice(
                other_counties.index.to_numpy(), size=remainder, replace=False
            )
            other_counties.loc[remainder_indices, 'births'] += 1

        # Append updated rows (excluding unspecified counties)
        updated_rows.append(other_counties)

    # pd.concat raises on an empty list, so handle "no groups" explicitly.
    if not updated_rows:
        return chunk.iloc[0:0].reset_index(drop=True)

    # Combine updated rows back into a single DataFrame
    return pd.concat(updated_rows, ignore_index=True)

In [6]:
# Initializing the database helper's engine and session; later cells query through db_tool.
db_tool.initialize_engine()
db_tool.initialize_session()

In [7]:
# Process each chunk.

# Each chunk is one (year, month) slice read from the database's cdc_data table.
# For each chunk, the unspecified county's births are split and distributed
# across the rest of that state's counties (see process_chunk).
# The chunk is then uploaded back into the database into a new table called processed_cdc_data.
#
# NOTE(review): the upload appends, so running this cell twice presumably
# duplicates every row in processed_cdc_data -- confirm and clear the table
# before a re-run.


for year in range(1995, 2024):
    for month in range(1, 13):

        # year and month come from range(), so only plain integers are interpolated.
        query = f"SELECT * FROM cdc_data WHERE year_code = {year} AND month_code = {month}"

        # Read a chunk of data from the database (the helper takes a DataFrame as its second argument)
        chunk = db_tool.export_df_from_sql(query, pd.DataFrame())

        # No rows for this period: skip it rather than processing and uploading an empty frame.
        if chunk.empty:
            continue

        # Process the chunk
        updated_chunk = process_chunk(chunk)

        # Save the processed chunk back to the database
        db_tool.append_to_sql(updated_chunk, "processed_cdc_data")

        # Release the frames before the next period is loaded
        del chunk
        del updated_chunk

2025-02-01 00:02:48,823 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-02-01 00:02:48,824 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM cdc_data WHERE year_code = 1995 AND month_code = 1")
2025-02-01 00:02:48,825 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-02-01 00:02:48,827 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM cdc_data WHERE year_code = 1995 AND month_code = 1")
2025-02-01 00:02:48,828 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-02-01 00:02:48,829 INFO sqlalchemy.engine.Engine SELECT * FROM cdc_data WHERE year_code = 1995 AND month_code = 1
2025-02-01 00:02:48,829 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-02-01 00:02:48,873 INFO sqlalchemy.engine.Engine COMMIT
2025-02-01 00:02:48,950 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-02-01 00:02:48,953 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("processed_cdc_data")
2025-02-01 00:02:48,954 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-02-01 00:02:48,9

In [None]:
# Archiving the old data from the cleaned data directory to be replaced with this new data. 

In [8]:
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 


# Arguments for archive_module.create_archive. The output captured from this
# cell shows the cdc_data_main_df .csv and .pkl files being archived and then deleted.
# NOTE(review): the meaning of each argument is inferred from its variable name --
# confirm against archive_module.create_archive.
set_search_pattern = "cdc_data_main_df*"
set_search_folder = "cleaned_cdc_data"
set_save_folder = "archived_cdc_data"
set_save_file = "CDC_Cleaned_Data_Unspecified_Counties_Removed"
# No extension filtering is requested for this archive.
filter_extension = False
filter_by_file_extension = None
# delete=True: the captured output lists the matched files as "Deleted" after archiving.
archive_module.create_archive(set_search_pattern, set_search_folder, set_save_folder, set_save_file, filter_extension, filter_by_file_extension, delete=True)


#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 


Files archived successfully.
Deleted: D:\WGU\New folder\D497\WGU_D497_Project_1\data\cleaned_data\cdc_data\cdc_data_main_df.csv
Deleted: D:\WGU\New folder\D497\WGU_D497_Project_1\data\cleaned_data\cdc_data\cdc_data_main_df.pkl
WGU/New folder/D497/WGU_D497_Project_1/data/cleaned_data/cdc_data/cdc_data_main_df.csv
	Modified:	2025-01-31 23:59:20
	System:		0(0 = Windows, 3 = Unix)
	ZIP version:	20
	Compressed:	11392702 bytes
	Uncompressed:	11392702 bytes
WGU/New folder/D497/WGU_D497_Project_1/data/cleaned_data/cdc_data/cdc_data_main_df.pkl
	Modified:	2025-01-31 23:59:20
	System:		0(0 = Windows, 3 = Unix)
	ZIP version:	20
	Compressed:	15429101 bytes
	Uncompressed:	15429101 bytes


In [9]:
# Creating a new empty dataframe; it is handed to the database helper in a later cell.
cdc_data_main_df = pd.DataFrame()

In [10]:
# Query that pulls every row from the newly created processed_cdc_data table.

sql_query = """
SELECT * FROM processed_cdc_data
"""

In [11]:
# Run the query through the database helper tool and keep the returned
# dataframe as the new cdc_data_main_df.

cdc_data_main_df = db_tool.export_df_from_sql(sql_query, cdc_data_main_df)

2025-02-01 00:10:57,533 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-02-01 00:10:57,534 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("
SELECT * FROM processed_cdc_data
")
2025-02-01 00:10:57,535 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-02-01 00:10:57,537 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("
SELECT * FROM processed_cdc_data
")
2025-02-01 00:10:57,538 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-02-01 00:10:57,539 INFO sqlalchemy.engine.Engine 
SELECT * FROM processed_cdc_data

2025-02-01 00:10:57,540 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-02-01 00:10:58,338 INFO sqlalchemy.engine.Engine COMMIT


In [12]:
# Getting info: column names, non-null counts and dtypes of the reloaded data.
cdc_data_main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187620 entries, 0 to 187619
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   year_code       187620 non-null  object
 1   month_code      187620 non-null  object
 2   state           187620 non-null  object
 3   state_code      187620 non-null  object
 4   state_fipcode   187620 non-null  object
 5   county          187620 non-null  object
 6   county_fipcode  187620 non-null  object
 7   fips_five       187620 non-null  object
 8   births          187620 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 12.9+ MB


In [13]:
# Visual inspection of the first five rows.
cdc_data_main_df.head()

Unnamed: 0,year_code,month_code,state,state_code,state_fipcode,county,county_fipcode,fips_five,births
0,1995,1,Alabama,AL,1,Calhoun County,C015,1015,464
1,1995,1,Alabama,AL,1,Jefferson County,C073,1073,1118
2,1995,1,Alabama,AL,1,Madison County,C089,1089,658
3,1995,1,Alabama,AL,1,Mobile County,C097,1097,845
4,1995,1,Alabama,AL,1,Montgomery County,C101,1101,631


In [14]:
# Another visual inspection: 10 randomly sampled rows.
# No random_state is set, so the rows shown differ between runs.
cdc_data_main_df.sample(10)

Unnamed: 0,year_code,month_code,state,state_code,state_fipcode,county,county_fipcode,fips_five,births
37375,2001,10,Illinois,IL,17,Sangamon County,C167,17167,350
36792,2001,9,Alabama,AL,1,Montgomery County,C101,1101,592
35738,2001,6,Oklahoma,OK,40,Comanche County,C031,40031,684
137898,2016,11,Illinois,IL,17,LaSalle County,C099,17099,190
72826,2007,7,California,CA,6,Sacramento County,C067,6067,2018
118013,2013,12,Wisconsin,WI,55,Racine County,C101,55101,325
166489,2020,12,New York,NY,36,Bronx County,C005,36005,1372
48947,2003,10,Florida,FL,12,Bay County,C005,12005,218
92410,2010,4,Texas,TX,48,Bexar County,C029,48029,2146
78386,2008,4,Ohio,OH,39,Butler County,C017,39017,534


In [15]:
# Checking for unspecified counties.
# No row should still carry the unspecified-county code "C999" after the
# redistribution; fail here rather than checkpointing bad data.
unspecified_rows = cdc_data_main_df[cdc_data_main_df['county_fipcode'] == "C999"]
assert unspecified_rows.empty, f"{len(unspecified_rows)} unspecified-county (C999) rows remain"

# Still display the (expected to be empty) selection for visual confirmation.
unspecified_rows

Unnamed: 0,year_code,month_code,state,state_code,state_fipcode,county,county_fipcode,fips_five,births


In [16]:
# Creating a new cleaned data file for the cdc data in the data directory.
# NOTE(review): the two string arguments presumably name the target folder and
# the file name -- confirm against checkpoint_helper.create_checkpoint.
CheckPoint.create_checkpoint("cleaned_cdc_data", "cdc_data_main_df", cdc_data_main_df)

## [Next Step: Data Cleaning - UFO](data_cleaning_ufo_main.ipynb)
---
#### [Return To Landing Page](order_of_operations_landing.ipynb)
