# Loudoun County Growth Study

## Import Libraries

In [1]:
# Import necessary libraries
import os
import pandas as pd
import sqlite3



## Cell 2: Define Column Renaming Mapping

In [2]:
# Column Renaming Map
column_renaming_map = {
    # Demographics Data
    'geographic_area_name_(name)': 'geo_name',
    '2017_naics_code_(naics2017)': 'naics_code',
    'meaning_of_naics_code_(naics2017_label)': 'naics_desc',
    'meaning_of_legal_form_of_organization_code_(lfo_label)': 'legal_form',
    'meaning_of_employment_size_of_establishments_code_(empszes_label)': 'emp_size',
    'year_(year)': 'year',
    'number_of_establishments_(estab)': 'num_estabs',
    'annual_payroll_($1,000)_(payann)': 'ann_payroll_thousands',
    'first_quarter_payroll_($1,000)_(payqtr1)': 'q1_payroll_thousands',
    'number_of_employees_(emp)': 'num_emps',
    'county': 'county',  # Keep as is

    # Decennial Population Data
    'label_(grouping)': 'label',
    'loudoun_county,_virginia!!count': 'loudoun_va_count',
    'loudoun_county,_virginia!!percent': 'loudoun_va_pct',
    'data_type': 'data_type',

    # GDP Data
    'geofips': 'geo_fips',
    'geoname': 'geo_name',
    'linecode': 'line_code',
    'description': 'description',
    # Years can remain as is
    'county': 'county',

    # BLS Data
    'area_fips': 'area_fips',
    'own_code': 'own_code',
    'industry_code': 'ind_code',
    'agglvl_code': 'agg_level_code',
    'size_code': 'size_code',
    'year': 'year',
    'qtr': 'quarter',
    'disclosure_code': 'disc_code',
    'annual_avg_estabs': 'ann_avg_estabs',
    'annual_avg_emplvl': 'ann_avg_emp_lvl',
    # Continue for other columns as needed
    'quarter': 'quarter',
}






## Cell 3: Define simplify_column_name Function

In [3]:
def simplify_column_name(col_name):
    """
    Simplifies a column name by removing unnecessary words and abbreviating common terms,
    without using the 're' module.
    """
    # Remove content within parentheses
    while '(' in col_name and ')' in col_name:
        start = col_name.find('(')
        end = col_name.find(')', start)
        if end != -1:
            col_name = col_name[:start] + col_name[end+1:]
        else:
            break  # No closing parenthesis found

    # Replace '!!' with '_'
    col_name = col_name.replace('!!', '_')

    # Remove special characters and replace them with underscores
    special_chars = ' !@#$%^&*()+={}[]|\\:;"\'<>,.?/~`-'
    for char in special_chars:
        col_name = col_name.replace(char, '_')

    # Replace multiple underscores with a single underscore
    while '__' in col_name:
        col_name = col_name.replace('__', '_')

    # Convert to lowercase
    col_name = col_name.lower()

    # Abbreviate common terms
    abbreviations = {
        'geographic_area_name': 'geo_name',
        'number_of_establishments': 'num_estabs',
        'annual_payroll': 'ann_payroll',
        'first_quarter_payroll': 'q1_payroll',
        'meaning_of': '',
        'naics_code': 'naics_code',
        'employment_size_of_establishments_code': 'emp_size',
        'legal_form_of_organization_code': 'legal_form',
        'number_of_employees': 'num_emps',
        'label_grouping': 'label',
        'percent': 'pct',
        'estimate': 'est',
        'margin_of_error': 'moe',
        'total': '',
        'male': 'male',
        'female': 'female',
        'county': 'cty',
        'virginia': 'va',
        'california': 'ca',
        'maryland': 'md',
        'north_carolina': 'nc',
        'tennessee': 'tn',
        'texas': 'tx',
        'description': 'desc',
        'ownership_code': 'own_code',
        'industry_code': 'ind_code',
        'size_code': 'size_code',
        'quarter': 'qtr',
        'disclosure_code': 'disc_code',
        'annual_avg_estabs': 'ann_avg_estabs',
        'annual_avg_emplvl': 'ann_avg_emp_lvl',
        # Add other abbreviations as needed
    }

    for long, short in abbreviations.items():
        col_name = col_name.replace(long, short)

    # Remove leading and trailing underscores
    col_name = col_name.strip('_')

    return col_name


## Cell 4: Update clean_dataframe Function


In [4]:
def clean_dataframe(df):
    """
    Cleans a DataFrame:
    - Removes columns and rows with only NaNs or nulls
    - Handles NaNs and null values
    - Standardizes and simplifies column names
    - Removes duplicate rows and columns
    """
    df = df.copy()
    
    # Remove columns with all NaNs or nulls
    df.dropna(axis=1, how='all', inplace=True)
    
    # Remove rows with all NaNs or nulls
    df.dropna(axis=0, how='all', inplace=True)
    
    # Reset index to ensure it's unique
    df.reset_index(drop=True, inplace=True)
    
    # Fill missing data forward and backward
    df.fillna(method="ffill", inplace=True)
    df.fillna(method="bfill", inplace=True)
    
    # Standardize column names
    df.columns = df.columns.str.strip().str.lower()
    
    # Simplify column names (assuming simplify_column_name is defined)
    df.columns = [simplify_column_name(col) for col in df.columns]
    
    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]
    
    # Remove duplicate rows
    df.drop_duplicates(inplace=True)
    
    return df



## Cell 5: Define merge_files Function

In [5]:
def merge_files(file_dict, output_file, add_columns=None):
    """
    Merges multiple CSV files into a single DataFrame and saves it.
    - file_dict: Dictionary with keys as identifiers and values as file paths.
    - output_file: Path to save the merged CSV.
    - add_columns: Dictionary with column names as keys and values or functions as values.
    """
    merged_data = pd.DataFrame()
    for key, file_path in file_dict.items():
        try:
            df = pd.read_csv(file_path)
            df = clean_dataframe(df)
            if add_columns:
                for col_name, value in add_columns.items():
                    # Check if the column already exists
                    if col_name in df.columns:
                        # Optionally rename the existing column to prevent overwriting
                        df.rename(columns={col_name: f"{col_name}_original"}, inplace=True)
                    # Assign the new value
                    df[col_name] = value(key)
            merged_data = pd.concat([merged_data, df], ignore_index=True)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
    # Save the merged data
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    merged_data.to_csv(output_file, index=False)
    print(f"Merged data saved to {output_file}")
    return merged_data



## Define File Paths

In [6]:
# Define File Paths for Demographic Data
demographic_files = {
    "Dallas TX": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Demographics\Dallas_County_Texas_CBP.csv",
    "Davidson TN": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Demographics\Davidson_County_Tennessee_CBP.csv",
    "Fairfax VA": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Demographics\Fairfax_County_Virginia_CBP.csv",
    "Loudoun VA": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Demographics\Loudoun_County_Virginia_CBP.csv",
    "Montgomery MD": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Demographics\Montgomery_County_Maryland_CBP.csv",
    "Santa Clara CA": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Demographics\Santa_Clara_County_California_CBP.csv",
    "Wake NC": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Demographics\Wake_County_North_Carolina_CBP.csv",
}



In [7]:
# Merge Demographics Data
demographics_output_file = r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Merged Files\merged_demographics_data.csv"

# Use a lambda function to assign the 'county' column
merged_demographics_data = merge_files(
    demographic_files,
    demographics_output_file,
    add_columns={"county": lambda key: key}
)


Merged data saved to C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Merged Files\merged_demographics_data.csv


In [8]:
def process_decennial_file(file_path, output_file):
    """
    Processes and saves the Decennial Population data.
    - file_path: Path to the Decennial Population CSV file.
    - output_file: Output file path where the processed data will be saved.
    """
    try:
        # Load and clean the Decennial Population data
        df = pd.read_csv(file_path)
        df = clean_dataframe(df)
        df["data_type"] = "Decennial Population"

        # Save the cleaned data
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        df.to_csv(output_file, index=False)
        print(f"Decennial Population data saved to {output_file}")
        return df
    except Exception as e:
        print(f"Error processing Decennial Population file: {e}")
        return None


In [9]:
# File path and output location for Decennial Population
decennial_file_path = r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Demographics\2020_decennial_population_housing.csv"
decennial_output_file = r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Merged Files\merged_decennial_population_data.csv"

# Process and merge the Decennial Population data
merged_decennial_data = process_decennial_file(decennial_file_path, decennial_output_file)


Decennial Population data saved to C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Merged Files\merged_decennial_population_data.csv


In [10]:
# Define File Paths for GDP Data
gdp_files = {
    "Loudoun County": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\GDP\Loudoun_GDP_summary.csv",
    "Fairfax County": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\GDP\Fairfax_GDP_summary.csv",
    "Santa Clara County": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\GDP\Santa_Clara_CA_GDP_summary.csv",
    "Montgomery County": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\GDP\Mountgomery_MD_GDP_summary.csv",
    "Wake County": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\GDP\Wake_NC_GDP_summary.csv",
    "Davidson County": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\GDP\Davidson_TN_GDP_summary.csv",
    "Dallas County": r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\GDP\Dallas_TX_GDP_summary.csv",
}








In [11]:
# Merge GDP Data
gdp_output_file = r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Merged Files\merged_gdp_data.csv"
merged_gdp_data = merge_files(gdp_files, gdp_output_file)


Merged data saved to C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Merged Files\merged_gdp_data.csv


In [12]:
# Define File Paths for BLS Data
bls_files = {
    "Dallas": [
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Dallas\BLS_Dallas_2020.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Dallas\BLS_Dallas_2021.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Dallas\BLS_Dallas_2022.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Dallas\BLS_Dallas_2023.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Dallas\BLS_Dallas_2024_q1.csv",
    ],
    "Davidson": [
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Davidson\BLS_Davidson_2020.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Davidson\BLS_Davidson_2021.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Davidson\BLS_Davidson_2022.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Davidson\BLS_Davidson_2023.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Davidson\BLS_Davidson_2024_q1.csv",
    ],
    "Fairfax": [
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Fairfax\BLS_Fairfax_2020.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Fairfax\BLS_Fairfax_2021.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Fairfax\BLS_Fairfax_2022.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Fairfax\BLS_Fairfax_2023.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Fairfax\BLS_Fairfax_2024_q1.csv",
    ],
    "Loudoun": [ 
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Loudoun\BLS_Loudoun_2020.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Loudoun\BLS_Loudoun_2021.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Loudoun\BLS_Loudoun_2022.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Loudoun\BLS_Loudoun_2023.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Loudoun\BLS_Loudoun_2024_q1.csv",
    ],
    "Montgomery": [   
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Montgomery\BLS_Montgomery_2020.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Montgomery\BLS_Montgomery_2021.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Montgomery\BLS_Montgomery_2022.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Montgomery\BLS_Montgomery_2023.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Montgomery\BLS_Montgomery_2024_q1.csv",
    ],
    "Santa Clara": [
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Santa Clara\BLS_Santa_Clara_2020.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Santa Clara\BLS_Santa_Clara_2021.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Santa Clara\BLS_Santa_Clara_2022.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Santa Clara\BLS_Santa_Clara_2023.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Santa Clara\BLS_Santa_Clara_2024_q1.csv",
    ],
    "Wake": [  
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Wake\BLS_Wake_2020.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Wake\BLS_Wake_2021.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Wake\BLS_Wake_2022.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Wake\BLS_Wake_2023.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Bureau of Labor Statistics\Wake\BLS_Wake_2024_q1.csv",
    ],
}


In [13]:
# Create a list to hold tuples of (file_path, county, year, quarter)
bls_file_info = []

for county, file_list in bls_files.items():
    for file_path in file_list:
        # Extract the file name from the file path
        file_name = os.path.basename(file_path)
        # Remove the extension
        base_name = os.path.splitext(file_name)[0]
        # Split by underscores
        parts = base_name.split('_')
        # The last part might be '2020', '2024', or '2024_q1'
        year_part = parts[-1]

        # Check if there's a quarter indicated
        if 'q' in year_part.lower():
            # Split the year and quarter
            year = year_part[:4]
            quarter_part = year_part.lower().split('q')[1]
            quarter = 'Q' + quarter_part  # Format quarter as 'Q1', 'Q2', etc.
        else:
            year = year_part
            quarter = None  # No quarter information

        # Append the information to the list
        bls_file_info.append((file_path, county, year, quarter))



In [14]:
def merge_bls_files(file_info_list, output_file):
    """
    Merges multiple BLS CSV files into a single CSV file.
    - file_info_list: List of tuples (file_path, county, year, quarter).
    - output_file: Output file path where the merged data will be saved.
    """
    merged_data = pd.DataFrame()
    for file_path, county, year, quarter in file_info_list:
        try:
            df = pd.read_csv(file_path)
            df = clean_dataframe(df)
            df["county"] = county
            df["year"] = year
            if quarter:
                df["quarter"] = quarter
            merged_data = pd.concat([merged_data, df], ignore_index=True)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
    # Save the merged data
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    merged_data.to_csv(output_file, index=False)
    print(f"Merged data saved to {output_file}")
    return merged_data



In [15]:
# Merge BLS Data
bls_output_file = r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Merged Files\merged_bls_data.csv"
merged_bls_data = merge_bls_files(bls_file_info, bls_output_file)


Merged data saved to C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Merged Files\merged_bls_data.csv


In [16]:
# Define File Paths for Census Data
census_files = {
    "2010": [
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2010\2010_population_age_sex.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2010\2010_household_income.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2010\2010_housing_characteristics.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2010\2010_housing_occupancy.csv",
    ],
    "2015": [
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2015\2015_household_income.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2015\2015_housing_characteristics.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2015\2015_housing_occupancy.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2015\2015_population_age_sex.csv",
    ],
    "2020": [
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2020\2020_household_income.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2020\2020_housing_characteristics.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2020\2020_housing_occupancy.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2020\2020_population_age_sex.csv",
    ],
    "2023": [
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2023\2023_population_age_sex.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2023\2023_household_income.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2023\2023_housing_characteristics.csv",
        r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\US Census\2023\2023_housing_occupancy.csv",
    ],
}


In [17]:
# Create a list to hold tuples of (file_path, year, data_type)
census_file_info = []

for year, file_list in census_files.items():
    for file_path in file_list:
        # Extract the file name from the file path
        file_name = os.path.basename(file_path)
        # Remove the extension
        base_name = os.path.splitext(file_name)[0]
        # Extract data type from the file name
        # Assuming the file names are like '2010_population_age_sex.csv'
        data_type = '_'.join(base_name.split('_')[1:])

        census_file_info.append((file_path, year, data_type))


In [18]:
def merge_census_files(file_info_list, output_file):
    """
    Merges multiple Census CSV files into a single CSV file.
    - file_info_list: List of tuples (file_path, year, data_type).
    - output_file: Output file path where the merged data will be saved.
    """
    merged_data = pd.DataFrame()
    for file_path, year, data_type in file_info_list:
        try:
            df = pd.read_csv(file_path)
            df = clean_dataframe(df)
            df["year"] = year
            df["data_type"] = data_type
            merged_data = pd.concat([merged_data, df], ignore_index=True)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
    # Save the merged data
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    merged_data.to_csv(output_file, index=False)
    print(f"Merged data saved to {output_file}")
    return merged_data


In [19]:
# Merge Census Data
census_output_file = r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Merged Files\merged_census_data.csv"
merged_census_data = merge_census_files(census_file_info, census_output_file)


Merged data saved to C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\Merged Files\merged_census_data.csv


In [20]:
# Display the first few rows of merged dataframes
print("Demographics Data:")
display(merged_demographics_data.head())

print("Decennial Population Data:")
display(merged_decennial_data.head())

print("GDP Data:")
display(merged_gdp_data.head())

print("BLS Data:")
display(merged_bls_data.head())

print("Census Data:")
display(merged_census_data.head())


Demographics Data:


Unnamed: 0,geo_name,2017_naics_code,naics_code,legal_form,emp_size,year,num_estabs,ann_payroll,q1_payroll,num_emps,county
0,"Dallas County, Texas",0,Total for all sectors,All establishments,All establishments,2019,67311,99304828,26327259,1534430,Dallas TX
1,"Dallas County, Texas",0,Total for all sectors,All establishments,Establishments with less than 5 employees,2019,34339,N,N,N,Dallas TX
2,"Dallas County, Texas",0,Total for all sectors,All establishments,Establishments with 5 to 9 employees,2019,11381,N,N,N,Dallas TX
3,"Dallas County, Texas",0,Total for all sectors,All establishments,Establishments with 10 to 19 employees,2019,8771,N,N,N,Dallas TX
4,"Dallas County, Texas",0,Total for all sectors,All establishments,Establishments with 20 to 49 employees,2019,7321,N,N,N,Dallas TX


Decennial Population Data:


Unnamed: 0,label,loudoun_cty_va_count,loudoun_cty_va_pct,data_type
0,SEX AND AGE,420959,100.0%,Decennial Population
1,Total population,420959,100.0%,Decennial Population
2,Under 5 years,27065,6.4%,Decennial Population
3,5 to 9 years,33089,7.9%,Decennial Population
4,10 to 14 years,36363,8.6%,Decennial Population


GDP Data:


Unnamed: 0,geofips,geoname,linecode,desc,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,51107,"Loudoun, VA",1.0,Real GDP (thousands of chained 2017 dollars),11781640.0,12331910.0,12617950.0,14372870.0,14874580.0,16851590.0,...,22197550.0,22845660.0,23989400.0,25206690.0,26813789.0,28280360.0,29573440.0,28724760.0,31763950.0,33771830.0
1,51107,"Loudoun, VA",2.0,Chain-type quantity indexes for real GDP,43.939,45.991,47.058,53.603,55.474,62.847,...,82.784,85.201,89.467,94.006,100.0,105.469,110.292,107.127,118.461,125.95
2,51107,"Loudoun, VA",3.0,Current-dollar GDP (thousands of current dollars),9719971.0,10343500.0,10669940.0,12292150.0,12948120.0,15011560.0,...,21242850.0,22159070.0,23665390.0,25135580.0,26813789.0,28526450.0,30410100.0,29892430.0,33614260.0,37558710.0
3,51919,"Fairfax, Fairfax City + Falls Church, VA*",1.0,Real GDP (thousands of chained 2017 dollars),74858290.0,77407630.0,81803300.0,87352500.0,93302310.0,95036240.0,...,108458200.0,108533100.0,109923800.0,111444500.0,114188029.0,119273700.0,125609600.0,125345400.0,133352800.0,137925800.0
4,51919,"Fairfax, Fairfax City + Falls Church, VA*",2.0,Chain-type quantity indexes for real GDP,65.557,67.79,71.639,76.499,81.709,83.228,...,94.982,95.048,96.266,97.597,100.0,104.454,110.002,109.771,116.784,120.788


BLS Data:


Unnamed: 0,area_fips,own_code,ind_code,agglvl_code,size_code,year,qtr,disc_code,ann_avg_estabs,ann_avg_emp_lvl,...,oty_month3_emplvl_pct_chg,oty__qtrly_wages_chg,oty__qtrly_wages_pct_chg,oty_taxable_qtrly_wages_chg,oty_taxable_qtrly_wages_pct_chg,oty_qtrly_contributions_chg,oty_qtrly_contributions_pct_chg,oty_avg_wkly_wage_chg,oty_avg_wkly_wage_pct_chg,quarter
0,48113,0,10,70,0,2020,A,N,81171.0,1667356.0,...,,,,,,,,,,
1,48113,1,10,71,0,2020,A,N,167.0,26129.0,...,,,,,,,,,,
2,48113,1,101,72,0,2020,A,N,1.0,6.0,...,,,,,,,,,,
3,48113,1,1013,73,0,2020,A,N,1.0,6.0,...,,,,,,,,,,
4,48113,1,102,72,0,2020,A,N,166.0,26123.0,...,,,,,,,,,,


Census Data:


Unnamed: 0,label,santa_clara_cty_ca__est,santa_clara_cty_ca__moe,santa_clara_cty_ca_male_est,santa_clara_cty_ca_male_moe,santa_clara_cty_ca_female_est,santa_clara_cty_ca_female_moe,montgomery_cty_md__est,montgomery_cty_md__moe,montgomery_cty_md_male_est,...,fairfax_cty_va_pct_est,fairfax_cty_va_pct_male_est,fairfax_cty_va_pct_male_moe,fairfax_cty_va_pct_female_est,fairfax_cty_va_pct_female_moe,loudoun_cty_va_pct_est,loudoun_cty_va_pct_male_est,loudoun_cty_va_pct_male_moe,loudoun_cty_va_pct_female_est,loudoun_cty_va_pct_female_moe
0,Total population,1787694,*****,896647,±671,891047,±671,976203,*****,468914,...,,,,,,,,,,
1,AGE,1787694,*****,896647,±671,891047,±671,976203,*****,468914,...,,,,,,,,,,
2,Under 5 years,7.0%,±0.1,7.1%,±0.1,6.8%,±0.1,6.5%,±0.1,7.0%,...,,,,,,,,,,
3,5 to 9 years,6.8%,±0.2,6.7%,±0.3,6.8%,±0.3,6.5%,±0.3,6.6%,...,,,,,,,,,,
4,10 to 14 years,6.5%,±0.2,6.8%,±0.4,6.2%,±0.3,6.8%,±0.3,7.4%,...,,,,,,,,,,
