In [None]:
# Data Transformation
# Version 1
# 1/26/25
# 23:00

In [2]:
# Importing Modules

# Importing SYS
import sys

# Importing OS
import os

# Add the root directory to sys.path
sys.path.append(os.path.abspath(".."))

# Now you can use absolute imports
from d497_helpers import folder_manager as fm, archive_module, checkpoint_helper as CheckPoint, database_helper as db_tool, config

# Pandas
import pandas as pd

# Numpy
import numpy as np


In [3]:
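# Function that processes one chunk of data: within each state/year/month group, the births recorded under the unspecified county (C999) are spread across that state's other counties.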

def process_chunk(chunk, rng=None):
    """Redistribute "unspecified county" births across each state's other counties.

    Rows are grouped by ``state_fipcode``, ``year_code`` and ``month_code``.
    Within each group, the births recorded under the placeholder county code
    ``'C999'`` are summed and split as evenly as possible among the remaining
    counties of the group; any remainder is handed out one birth at a time to
    randomly chosen counties (without replacement). The ``'C999'`` rows
    themselves are left out of the result.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must contain the columns ``state_fipcode``, ``year_code``,
        ``month_code``, ``county_fipcode`` and a numeric ``births`` column.
        Index labels are assumed to be unique within a group.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness for distributing the remainder. When omitted,
        NumPy's global random state (``np.random``) is used, so
        ``np.random.seed`` still controls the outcome.

    Returns
    -------
    pandas.DataFrame
        The rows of ``chunk`` without any ``'C999'`` rows and with ``births``
        adjusted. The input frame is not modified. An empty frame is returned
        when ``chunk`` is empty or when no rows survive.

    Notes
    -----
    A group that contains only ``'C999'`` rows has no county to receive the
    births, so those rows are dropped and their births are not carried over.
    """
    unspecified_code = 'C999'

    # Nothing to group; also avoids pd.concat([]) raising ValueError below.
    if chunk.empty:
        return chunk.copy()

    chooser = np.random if rng is None else rng

    # List to store updated rows
    updated_rows = []

    # Group by state, year, and month
    grouped = chunk.groupby(['state_fipcode', 'year_code', 'month_code'])
    for _, group in grouped:
        is_unspecified = group['county_fipcode'] == unspecified_code
        if not is_unspecified.any():
            # If no unspecified county, add the group as-is
            updated_rows.append(group)
            continue

        # Work on an explicit copy so the in-place updates below never write
        # through to (or warn about) the frame they were sliced from.
        other_counties = group[~is_unspecified].copy()
        num_counties = len(other_counties)
        if num_counties == 0:
            # Only the unspecified county is present: nothing to distribute to.
            continue

        # Total births from the unspecified county, split evenly
        unspecified_births = int(group.loc[is_unspecified, 'births'].sum())
        per_county, remainder = divmod(unspecified_births, num_counties)

        # Every remaining county receives the even share.
        other_counties['births'] += per_county

        # Distribute remainder to random counties (remainder < num_counties,
        # so sampling without replacement is always possible).
        if remainder > 0:
            remainder_indices = chooser.choice(
                other_counties.index.to_numpy(), size=remainder, replace=False
            )
            other_counties.loc[remainder_indices, 'births'] += 1

        # Append updated rows (excluding unspecified counties)
        updated_rows.append(other_counties)

    if not updated_rows:
        # Keep the input's columns/dtypes while returning zero rows.
        return chunk.iloc[0:0].copy()

    # Combine updated rows back into a single DataFrame
    return pd.concat(updated_rows)

In [4]:
# Initializing the database helper's engine and session before any query cell runs.
# NOTE(review): presumably the session is built on top of the engine, so keep this order - confirm in database_helper.
db_tool.initialize_engine()
db_tool.initialize_session()

In [None]:
# Process each chunk.

# The data is read from the database's cdc_data table one (year, month) slice at a time.
# For each chunk of data, the unspecified county's births are split and distributed among the rest of the state's counties.
# The chunk is then uploaded back into the database into a new table called processed_cdc_data.
#
# NOTE(review): every slice is appended, so re-running this cell presumably adds the same rows
# to processed_cdc_data a second time - confirm against db_tool.append_to_sql and clear the table first if so.


for year in range(1995, 2024):
    for month in range(1, 13):

        # year and month are ints produced by range(), so formatting them into the SQL text is safe here.
        query = f"SELECT * FROM cdc_data WHERE year_code = {year} AND month_code = {month}"

        # Read a chunk of data from the database (the helper takes a DataFrame as its second argument)
        chunk = db_tool.export_df_from_sql(query, pd.DataFrame())

        # A month with no rows has nothing to redistribute; skipping it keeps one empty slice
        # from aborting the run after earlier months have already been appended.
        if chunk.empty:
            continue

        # Process the chunk
        updated_chunk = process_chunk(chunk)

        # Save the processed chunk back to the database
        if not updated_chunk.empty:
            db_tool.append_to_sql(updated_chunk, "processed_cdc_data")

        del chunk
        del updated_chunk

In [None]:
# Archiving the old data from the cleaned data directory to be replaced with this new data. 

In [7]:
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 


# Arguments for archive_module.create_archive; they are passed positionally in the call below.
set_search_pattern = "cdc_data_main_df*"
set_search_folder = "cleaned_cdc_data"
set_save_folder = "archived_cdc_data"
set_save_file = "CDC_Cleaned_Data_Unspecified_Counties_Removed"
filter_extension = False
filter_by_file_extension = None
# NOTE(review): delete=True - the recorded output of this cell shows the matched .csv and .pkl files
# being deleted after archiving. This runs before the replacement checkpoint is written at the end of
# the notebook, so a failure in between leaves only the archived copy - confirm that is acceptable.
archive_module.create_archive(set_search_pattern, set_search_folder, set_save_folder, set_save_file, filter_extension, filter_by_file_extension, delete=True)


#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 


Files archived successfully.
Deleted: D:\WGU\D497\WGU_D497_Project_1\data\cleaned_data\cdc_data\cdc_data_main_df.csv
Deleted: D:\WGU\D497\WGU_D497_Project_1\data\cleaned_data\cdc_data\cdc_data_main_df.pkl
WGU/D497/WGU_D497_Project_1/data/cleaned_data/cdc_data/cdc_data_main_df.csv
	Modified:	2025-01-20 17:14:12
	System:		0(0 = Windows, 3 = Unix)
	ZIP version:	20
	Compressed:	5377894 bytes
	Uncompressed:	5377894 bytes
WGU/D497/WGU_D497_Project_1/data/cleaned_data/cdc_data/cdc_data_main_df.pkl
	Modified:	2025-01-20 17:14:12
	System:		0(0 = Windows, 3 = Unix)
	ZIP version:	20
	Compressed:	8184892 bytes
	Uncompressed:	8184892 bytes


In [8]:
# Creating new empty dataframe
# It is handed to db_tool.export_df_from_sql as the second argument further down, and the
# name is then rebound to that call's return value.
cdc_data_main_df = pd.DataFrame()

In [9]:
# Pulling data from the newly created table
# Query text only; it is executed by the export call in the next cell.

sql_query = """
SELECT * FROM processed_cdc_data
"""

In [10]:
# Creating new dataframe from the pulled data using the database helper tool.
# The helper's return value replaces the empty frame that is passed in as the second argument.

cdc_data_main_df = db_tool.export_df_from_sql(sql_query, cdc_data_main_df)

2025-01-20 17:47:43,338 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-01-20 17:47:43,339 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("
SELECT * FROM processed_cdc_data
")
2025-01-20 17:47:43,340 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-01-20 17:47:43,342 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("
SELECT * FROM processed_cdc_data
")
2025-01-20 17:47:43,343 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-01-20 17:47:43,344 INFO sqlalchemy.engine.Engine 
SELECT * FROM processed_cdc_data

2025-01-20 17:47:43,345 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-01-20 17:47:43,955 INFO sqlalchemy.engine.Engine COMMIT


In [11]:
# Getting info: row count, column dtypes and non-null counts of the loaded frame
cdc_data_main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187620 entries, 0 to 187619
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   year_code       187620 non-null  object
 1   month_code      187620 non-null  object
 2   state_fipcode   187620 non-null  object
 3   county_fipcode  187620 non-null  object
 4   fips_five       187620 non-null  object
 5   births          187620 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 8.6+ MB


In [12]:
# Visual Inspection of the first rows
cdc_data_main_df.head()

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,births
0,1995,1,1,C015,1015,464
1,1995,1,1,C073,1073,1119
2,1995,1,1,C089,1089,658
3,1995,1,1,C097,1097,845
4,1995,1,1,C101,1101,630


In [13]:
# Another Visual Inspection
# sample() is called without random_state, so the ten rows shown differ from run to run.
cdc_data_main_df.sample(10)

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,births
123198,2014,9,51,C087,51087,556
89459,2009,11,42,C049,42049,312
179149,2022,10,24,C025,24025,246
98167,2011,2,42,C081,42081,142
24947,1999,7,17,C099,17099,248
18361,1998,4,51,C700,51700,527
175438,2022,3,55,C133,55133,438
44634,2003,1,51,C041,51041,517
14417,1997,8,23,C031,23031,248
90034,2009,12,42,C027,42027,161


In [14]:
# Checking for unspecified counties
# The redistribution step is expected to have removed every 'C999' row. Fail loudly if any
# remain so that a bad table is not written out by the checkpoint cell that follows.
unspecified_rows = cdc_data_main_df[cdc_data_main_df['county_fipcode'] == "C999"]
assert unspecified_rows.empty, f"{len(unspecified_rows)} unspecified-county (C999) rows remain"
unspecified_rows

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,births


In [15]:
# Creating a new cleaned data file for the cdc data in the data directory
# NOTE(review): presumably this rewrites the cdc_data_main_df files that the archive cell deleted
# from the cleaned_cdc_data folder - confirm in checkpoint_helper.
CheckPoint.create_checkpoint("cleaned_cdc_data", "cdc_data_main_df", cdc_data_main_df)

## [Next Step: Data Cleaning - UFO](data_cleaning_ufo_main.ipynb)
---
#### [Return To Landing Page](order_of_operations_landing.ipynb)
