In [1]:
# Data Load - FIPS 
## Version 3
## 1/12/25
## 19:10

In [2]:
import sys
import os

# Add the root directory to sys.path
sys.path.append(os.path.abspath(".."))

# Now you can use absolute imports
from d497_helpers import folder_manager as fm, archive_module, checkpoint_helper as CheckPoint, database_helper as db_tool, config

# Pandas
import pandas as pd

# Numpy
import numpy as np

# RegularExpression for string matching
import re

# GLOB 
import glob 

# Pathlib 
from pathlib import Path



In [3]:
#fips_df = pd.DataFrame()
#fips_data_main_df_csv = pd.DataFrame()
#fips_data_main_df_pickle = pd.DataFrame()


In [4]:
# function loads FIPS data exported in data extraction step

def load_FIPS_data(data_folder=None):
    """Load the raw FIPS master data exported by the data-extraction step.

    Reads ``raw_fips_master_data.csv`` and ``raw_fips_master_data.pkl`` and
    publishes them as the module-level globals ``fips_data_main_df_csv`` and
    ``fips_data_main_df_pickle`` so later cells can compare the two exports.

    Parameters
    ----------
    data_folder : str or os.PathLike, optional
        Folder that holds the two raw files. When omitted, the folder is
        taken from ``config.global_raw_fips_data_folder_path``.

    Returns
    -------
    None
        The loaded frames are exposed through the two globals only.

    Raises
    ------
    FileNotFoundError
        If either raw file is missing from the folder.
    """
    global fips_data_main_df_csv
    global fips_data_main_df_pickle

    # Every column is a name or a code; reading them all as str keeps
    # leading zeros in codes such as "01" instead of parsing them as numbers.
    fips_columns = [
        'state_name',
        'county_name',
        'city_name',
        'state_code',
        'state_fipcode',
        'county_code',
        'county_fipcode',
        'city_code',
        'city_fipcode',
    ]
    csv_dtypes = dict.fromkeys(fips_columns, str)

    # Resolve the folder lazily so an explicit argument never touches config.
    if data_folder is None:
        data_folder = config.global_raw_fips_data_folder_path
    data_folder = Path(data_folder)

    # The readers already return fresh frames, so they are bound to the
    # globals directly; no intermediate copy is needed.
    fips_data_main_df_csv = pd.read_csv(
        data_folder / "raw_fips_master_data.csv", sep=",", dtype=csv_dtypes
    )
    # NOTE: unpickling can execute arbitrary code; only load pickle files
    # that were produced by this project's own extraction step.
    fips_data_main_df_pickle = pd.read_pickle(data_folder / "raw_fips_master_data.pkl")

    

In [5]:
# Initializer commands: running the loader populates the global
# fips_data_main_df_csv and fips_data_main_df_pickle frames used below.
load_FIPS_data()

In [6]:
# Reviewing the column names, non-null counts and dtypes of the FIPS data imported from the CSV file.
fips_data_main_df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39792 entries, 0 to 39791
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   state_name      39792 non-null  object
 1   county_name     39791 non-null  object
 2   city_name       39790 non-null  object
 3   state_code      39792 non-null  object
 4   state_fipcode   39792 non-null  object
 5   county_code     39791 non-null  object
 6   county_fipcode  39791 non-null  object
 7   city_code       39790 non-null  object
 8   city_fipcode    39790 non-null  object
dtypes: object(9)
memory usage: 2.7+ MB


In [7]:
# Reviewing the column names, non-null counts and dtypes of the FIPS data imported from the pickle file.
# NOTE(review): the index summary printed here differs from the CSV frame's RangeIndex;
# the pickle appears to keep the index labels it was saved with — confirm this is intended.
fips_data_main_df_pickle.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39792 entries, 0 to 1791
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   state_name      39792 non-null  object
 1   county_name     39791 non-null  object
 2   city_name       39790 non-null  object
 3   state_code      39792 non-null  object
 4   state_fipcode   39792 non-null  object
 5   county_code     39791 non-null  object
 6   county_fipcode  39791 non-null  object
 7   city_code       39790 non-null  object
 8   city_fipcode    39790 non-null  object
dtypes: object(9)
memory usage: 3.0+ MB


In [8]:
# The pickle import is kept over the CSV import. An independent deep copy of it
# becomes the working main dataframe for the rest of the notebook.
fips_data_main_df = fips_data_main_df_pickle.copy(deep=True)

In [9]:
# Clean up: drop both import dataframes now that the working copy exists.
del fips_data_main_df_csv, fips_data_main_df_pickle

In [None]:
# Obtaining the column names, non-null counts and dtypes of the main dataframe.
fips_data_main_df.info()

In [11]:
# Generating a sample of the dataframe for visual inspection.
# A fixed random_state makes the same 15 rows come back on every re-run of the notebook.
fips_data_main_df.sample(15, random_state=42)

Unnamed: 0,state_name,county_name,city_name,state_code,state_fipcode,county_code,county_fipcode,city_code,city_fipcode
1406,INDIANA,LA PORTE,UNION MILLS,IN,18,C091,18091,4934,180914934
1652,MASSACHUSETTS,WORCESTER,WEST BROOKFIELD,MA,25,C027,25027,1410,250271410
1939,MICHIGAN,DICKINSON,SAGOLA,MI,26,C043,26043,4352,260434352
1898,KENTUCKY,BUTLER,WOODBURY,KY,21,C031,21031,3660,210313660
1960,SOUTH DAKOTA,DEUEL,ASTORIA,SD,46,C039,46039,150,460390150
1807,SOUTH CAROLINA,SUMTER,SHANNONTOWN,SC,45,C085,45085,2350,450852350
744,IOWA,RINGGOLD,SHANNON CITY,IA,19,C159,19159,7700,191597700
1507,ILLINOIS,COOK,INDIAN HEAD PARK,IL,17,C031,17031,4146,170314146
495,WEST VIRGINIA,TUCKER,THOMAS,WV,54,C093,54093,2600,540932600
53,WEST VIRGINIA,MCDOWELL,PREMIER,WV,54,C047,54047,2173,540472173


In [12]:
# Counting the missing values in each column.
fips_data_main_df.isna().sum()

state_name        0
county_name       1
city_name         2
state_code        0
state_fipcode     0
county_code       1
county_fipcode    1
city_code         2
city_fipcode      2
dtype: int64

In [13]:
# Null values were found, so the rows with a missing city name are isolated with a boolean mask.
fips_data_main_df[fips_data_main_df['city_name'].isna()]

Unnamed: 0,state_name,county_name,city_name,state_code,state_fipcode,county_code,county_fipcode,city_code,city_fipcode
920,HAWAII,KALAWAO,,HI,15,C005,15005.0,,
1226,PUERTO RICO,,,PR,72,,,,


In [14]:
# Same check for the county name column: show the rows where it is missing.
fips_data_main_df[fips_data_main_df['county_name'].isna()]

Unnamed: 0,state_name,county_name,city_name,state_code,state_fipcode,county_code,county_fipcode,city_code,city_fipcode
1226,PUERTO RICO,,,PR,72,,,,


In [15]:
# Removing Puerto Rico (state FIPS code "72") from the dataset as it's not within scope of this project.
# Exact equality is used instead of a substring/regex match so only the code "72" itself is dropped,
# and .copy() makes the result an independent frame so the .loc assignments in later cells
# do not write into a slice of the previous frame.
fips_data_main_df = fips_data_main_df.loc[fips_data_main_df['state_fipcode'] != "72"].copy()

In [16]:
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st15_hi_cou2020.txt

# County FIP Code is 005

# Cities in Hawaii
# https://www2.census.gov/geo/docs/reference/codes2020/place/st15_hi_place2020.txt

# It does not appear that Kalawao has any cities
# https://www2.census.gov/geo/docs/reference/codes2020/place_by_cou/st15_hi_place_by_county2020.txt

# This location was used to house leper patients in isolation until the cure was found. 
# Now it is going to be added to the national park. 
# It currently has 8 residents, who were former patients. 
# Setting the city code to all nines, "9999", to indicate unspecified


In [17]:
# As the EDA research above shows, Hawaii has some issues with cities because most livable areas are tribes or villages.
# There was also an issue with the volcano eruption that removed a whole section, and some areas were reclassified into others.
# So for county 005 the city name and city code are set to an "unspecified" placeholder to remove the null values.
kalawao_rows = fips_data_main_df['county_fipcode'].eq("15005")
fips_data_main_df.loc[kalawao_rows, ['city_name', 'city_code']] = ['UNSPECIFIED', '9999']

In [None]:
# The nine-digit city fipcode for county 15005 is assigned by hand as well:
# the five-digit county code followed by the "9999" placeholder city code.
kalawao_rows = fips_data_main_df['county_fipcode'].eq("15005")
fips_data_main_df.loc[kalawao_rows, 'city_fipcode'] = "150059999"

In [19]:
# Pulling the Kalawao county rows again to verify that the changes were successful.
fips_data_main_df.loc[fips_data_main_df['county_fipcode'].eq("15005")]

Unnamed: 0,state_name,county_name,city_name,state_code,state_fipcode,county_code,county_fipcode,city_code,city_fipcode
920,HAWAII,KALAWAO,UNSPECIFIED,HI,15,C005,15005,9999,150059999


 EDA revealed that Dade County was renamed Miami-Dade County in 1997, which changed its county FIPS code from 12025 to 12086. All of the 95-02 data has the old FIPS code and needs to be updated.

In [20]:
# Replace the old five-digit county FIPS code 12025 with 12086.
# NOTE(review): only county_fipcode is rewritten here; county_code and city_fipcode on the
# same rows are left as they were — confirm whether they should be updated to match.
old_code_rows = fips_data_main_df['county_fipcode'].eq("12025")
fips_data_main_df.loc[old_code_rows, 'county_fipcode'] = "12086"

In [21]:
# Re-checking for null values: total count of missing cells across the whole dataframe.
fips_data_main_df.isna().sum().sum()

state_name        0
county_name       0
city_name         0
state_code        0
state_fipcode     0
county_code       0
county_fipcode    0
city_code         0
city_fipcode      0
dtype: int64

In [22]:
# Renaming columns so their names describe their contents more accurately.
# The three-character county classifier ("CXXX") will take over the county_fipcode name,
# so the existing county_fipcode becomes fips_five: the five-digit code for state and county.
# Likewise city_fipcode becomes fips_nine: the nine-digit code for state, county and city.
full_code_names = {'county_fipcode': 'fips_five', 'city_fipcode': 'fips_nine'}
fips_data_main_df = fips_data_main_df.rename(columns=full_code_names)

In [23]:
# Column rename, continued.
# city_code becomes city_fipcode: the four-digit fipcode for the city.
# county_code becomes county_fipcode: the county identifier in "CXXX" form.
short_code_names = {'city_code': 'city_fipcode', 'county_code': 'county_fipcode'}
fips_data_main_df = fips_data_main_df.rename(columns=short_code_names)

In [24]:
# Reassigning the FIPS data to itself with the columns listed in the preferred order.
# .copy() makes the reordered selection an independent frame, so the new columns added
# in later cells are not set on a selection derived from the previous frame.
fips_data_main_df = fips_data_main_df[['state_name','state_code','state_fipcode','county_name','county_fipcode','city_name','city_fipcode','fips_five','fips_nine']].copy()

In [25]:
# Displaying the renamed and reordered dataframe.
fips_data_main_df

Unnamed: 0,state_name,state_code,state_fipcode,county_name,county_fipcode,city_name,city_fipcode,fips_five,fips_nine
0,ALABAMA,AL,01,AUTAUGA,C001,AUTAUGAVILLE,0220,01001,010010220
1,ALABAMA,AL,01,AUTAUGA,C001,BILLINGSLEY,0340,01001,010010340
2,ALABAMA,AL,01,AUTAUGA,C001,BOOTH,0425,01001,010010425
3,ALABAMA,AL,01,AUTAUGA,C001,JONES,1796,01001,010011796
4,ALABAMA,AL,01,AUTAUGA,C001,MARBURY,2002,01001,010012002
...,...,...,...,...,...,...,...,...,...
1787,WYOMING,WY,56,WASHAKIE,C043,TEN SLEEP,0810,56043,560430810
1788,WYOMING,WY,56,WASHAKIE,C043,WORLAND,0890,56043,560430890
1789,WYOMING,WY,56,WESTON,C045,NEWCASTLE,0610,56045,560450610
1790,WYOMING,WY,56,WESTON,C045,OSAGE,0623,56045,560450623


In [26]:
# Creating a checkpoint of my progress by exporting a CSV and pickle file using my checkpoint tool.
CheckPoint.create_checkpoint("processed_fips_data", "master_fips_data_dfs_complete", fips_data_main_df)

In [27]:
# Creating a new column, multi_county_flag. It is needed because some city names exist in more than one county.
# Rows are grouped by state fipcode and city name, and the number of distinct county names in each
# group is broadcast back onto every row of that group with transform('nunique').
# The flag is True when that number is greater than one.
county_names_per_city = fips_data_main_df.groupby(['state_fipcode', 'city_name'])['county_name'].transform('nunique')
fips_data_main_df['multi_county_flag'] = county_names_per_city.gt(1)

In [28]:
# Checking results of the operation. 
fips_data_main_df['multi_county_flag'].value_counts()

multi_county_flag
False    37809
True      1982
Name: count, dtype: int64

In [29]:
# Creating a new column, county_count, with the same grouping as the flag above but without the
# greater-than-one comparison, so each row holds the integer number of distinct county fipcodes
# found for its state fipcode and city name.
city_groups = fips_data_main_df.groupby(['state_fipcode', 'city_name'])
fips_data_main_df['county_count'] = city_groups['county_fipcode'].transform('nunique')

In [30]:
# Creating a new column, county_rank: a pseudo-rank that numbers the counties a city name appears in.
# Rows are grouped by state fipcode and city name, then the county fipcode is ranked in ascending
# order within each group. Dense ranking gives rows with the same county fipcode the same rank
# and leaves no gaps between ranks.
# Any missing rank is set to 0 and the column is converted to int.

county_codes_by_city = fips_data_main_df.groupby(['state_fipcode', 'city_name'])['county_fipcode']
dense_county_rank = county_codes_by_city.rank(method='dense', ascending=True)
fips_data_main_df['county_rank'] = dense_county_rank.fillna(0).astype(int)

In [31]:
# Previewing results: the first 50 multi-county rows for state fipcode "01",
# sorted by city name and county rank for easier reading.
multi_county_state_01 = fips_data_main_df['multi_county_flag'] & fips_data_main_df['state_fipcode'].eq("01")
fips_data_main_df[multi_county_state_01].head(50).sort_values(by=["city_name", 'county_rank'])

Unnamed: 0,state_name,state_code,state_fipcode,county_name,county_fipcode,city_name,city_fipcode,fips_five,fips_nine,multi_county_flag,county_count,county_rank
271,ALABAMA,AL,1,ETOWAH,C055,BOAZ,400,1055,10550400,True,2,1
522,ALABAMA,AL,1,MARSHALL,C095,BOAZ,400,1095,10950400,True,2,2
365,ALABAMA,AL,1,JEFFERSON,C073,BROWNVILLE,506,1073,10730506,True,2,1
366,ALABAMA,AL,1,JEFFERSON,C073,BROWNVILLE,509,1073,10730509,True,2,1
699,ALABAMA,AL,1,TUSCALOOSA,C125,BROWNVILLE,506,1125,11250506,True,2,2
50,ALABAMA,AL,1,BLOUNT,C009,COUNTY LINE,798,1009,10090798,True,2,1
373,ALABAMA,AL,1,JEFFERSON,C073,COUNTY LINE,798,1073,10730798,True,2,2
784,ALABAMA,AL,1,DE KALB,C049,CROSSVILLE,810,1049,10490810,True,2,1
424,ALABAMA,AL,1,LAMAR,C075,CROSSVILLE,811,1075,10750811,True,2,2
379,ALABAMA,AL,1,JEFFERSON,C073,FLAT CREEK,1170,1073,10731170,True,2,1


In [32]:
# Listing any rows whose county_rank is 0; an empty result means every row received a rank.
fips_data_main_df.loc[fips_data_main_df['county_rank'].eq(0)]

Unnamed: 0,state_name,state_code,state_fipcode,county_name,county_fipcode,city_name,city_fipcode,fips_five,fips_nine,multi_county_flag,county_count,county_rank


In [33]:
# Creating another checkpoint of my progress, this time to be used by the next step.
CheckPoint.create_checkpoint("processed_fips_data", "fips_data_cleaning_complete", fips_data_main_df)


In [34]:
# Archiving all previous data utilized.
# NOTE(review): delete=True removes the matched raw files after archiving (see the "Deleted:"
# lines in the output). load_FIPS_data() reads files with these names, so a fresh re-run of
# this notebook presumably needs the raw files restored first — confirm.

set_search_pattern = "raw_fips_master_data*"  # matches both the .csv and .pkl raw exports
set_search_folder = "raw_fips_data"
set_save_folder = "archived_fips_data"
set_save_file = "FIPS_Raw_Downloads"
filter_extension = False  # no extension filtering requested
filter_by_file_extension = None
archive_module.create_archive(set_search_pattern, set_search_folder, set_save_folder, set_save_file, filter_extension, filter_by_file_extension, delete=True)

Files archived successfully.
Deleted: D:\WGU\D497\WGU_D497_Project_1\data\raw_data\fips_data\raw_fips_master_data.csv
Deleted: D:\WGU\D497\WGU_D497_Project_1\data\raw_data\fips_data\raw_fips_master_data.pkl
WGU/D497/WGU_D497_Project_1/data/raw_data/fips_data/raw_fips_master_data.csv
	Modified:	2025-01-20 16:20:46
	System:		0(0 = Windows, 3 = Unix)
	ZIP version:	20
	Compressed:	2399757 bytes
	Uncompressed:	2399757 bytes
WGU/D497/WGU_D497_Project_1/data/raw_data/fips_data/raw_fips_master_data.pkl
	Modified:	2025-01-20 16:20:46
	System:		0(0 = Windows, 3 = Unix)
	ZIP version:	20
	Compressed:	3396248 bytes
	Uncompressed:	3396248 bytes


## [Next Step: Data Load - FIPS Data](data_load_fips.ipynb)
---
#### [Return To Landing Page](order_of_operations_landing.ipynb)