Loudoun County Growth Study
----------------------------
This project analyzes key economic, demographic, and housing characteristics in Loudoun County, VA.
The workflow includes:
  1. Uploading raw datasets to AWS S3.
  2. Loading and cleaning datasets.
  3. Saving and uploading cleaned data to S3.
  4. (Optional) Loading raw and cleaned data into PostgreSQL.
  5. Running analyses and generating visualizations.
  6. Forecasting future GDP and household income trends.
  
Before running, ensure you have a .env file with your AWS and PostgreSQL credentials.
"""

### Import Libraries



In [1]:
# ===================== Imports and Configuration =====================
import os
import warnings
import pandas as pd
import numpy as np
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv

# Set up visualization style and suppress warnings
# Apply seaborn's "whitegrid" style to the figures drawn by the analysis cells.
sns.set(style="whitegrid")
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation and chained-assignment warnings raised by the
# cleaning and analysis code below — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Load variables from the .env file into the process environment so the
# os.getenv(...) lookups in the configuration cell can see them.
load_dotenv()

True

In [2]:
# AWS S3 Configuration
# Credentials are read from the environment; os.getenv returns None when a
# variable is not set.
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
# Bucket names — presumably one for the raw uploads and one for the cleaned
# outputs; no cell in this notebook passes them to a function, so confirm usage.
s3_initial_bucket = "initial-datasets"
s3_cleaned_bucket = "cleaned-datasets"
# Region handed to boto3 by initialize_s3_client.
s3_region = "us-east-1"

# PostgreSQL Configuration (ensure credentials are securely stored in .env)
# These five values are interpolated into the connection URL built by
# initialize_postgres_engine; any that is unset is None here.
db_user = os.getenv("POSTGRES_USER")
db_password = os.getenv("POSTGRES_PASSWORD")
db_host = os.getenv("POSTGRES_HOST")
db_port = os.getenv("POSTGRES_PORT")
db_name = os.getenv("POSTGRES_DB")

### Define File Paths

In [3]:
# File paths for raw datasets
# Directory holding the raw CSV extracts. Set LOUDOUN_DATA_DIR (for example in
# .env) to point the notebook at another folder instead of editing this cell.
# The default is the original author's local folder, so existing setups keep
# producing exactly the same paths.
DATA_DIR = os.getenv(
    "LOUDOUN_DATA_DIR",
    r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data",
)

# Dataset name -> raw CSV file name inside DATA_DIR.
RAW_FILE_NAMES = {
    "Income": "Combined_B19013.csv",
    "Housing_Costs": "Combined_B25077.csv",
    "Labor_Stats": "Combined_BLS_Data.csv",
    "Population_Characteristics": "Combined_DP05.csv",
    "GDP": "Combined_GDP_Data.csv",
    "Economic_Data": "EC1700BASIC.csv",
    "Occupation_Data": "Combined_S2403.csv",
    "Decennial_Population_Housing": "2020_decennial_population_housing.csv",
    "County_Business_Patterns": "CB2200CBP.csv",
    "Census_Data": "Combined_Census_Data.csv",
    "Demographics_Data": "Combined_Demographics_Data.csv",
    "Housing_Characteristics": "Combined_DP04.csv",
    "Labor_Force": "Combined_S2401.csv",
}

# Dataset name -> full path of the raw CSV; this is the mapping the upload,
# load and clean helpers iterate over.
file_paths = {
    dataset_name: os.path.join(DATA_DIR, file_name)
    for dataset_name, file_name in RAW_FILE_NAMES.items()
}


In [4]:
# ===================== AWS S3 Integration =====================
def initialize_s3_client():
    """Build a boto3 S3 client from the notebook's AWS settings.

    Returns:
        The boto3 client, or None when boto3 reports missing or partial
        credentials (the error is printed, not raised).
    """
    # Module-level settings populated by the configuration cell.
    client_options = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
        "region_name": s3_region,
    }
    try:
        return boto3.client("s3", **client_options)
    except (PartialCredentialsError, NoCredentialsError) as e:
        print(f"Error initializing S3 client: {e}")
    return None

def upload_file_to_s3(s3_client, file_path, bucket_name, folder):
    """Upload a single file to the specified S3 bucket and folder.

    Args:
        s3_client: Object exposing ``upload_file(path, bucket, key)``, such as
            the client returned by ``initialize_s3_client``. May be None, in
            which case the upload is skipped with a message.
        file_path: Local path of the file to upload.
        bucket_name: Name of the destination bucket.
        folder: Key prefix; the object key is ``"<folder>/<file name>"``.

    Returns:
        None. The outcome is reported with ``print``; failures are never
        propagated to the caller.
    """
    if s3_client is None:
        print(f"S3 client is not available; skipped upload of {file_path}.")
        return
    # Label used in the generic error message. It starts as the raw path so the
    # handler below never refers to a name that was not bound yet (previously a
    # failure before file_name was assigned raised UnboundLocalError).
    file_label = file_path
    try:
        file_name = os.path.basename(file_path)
        file_label = file_name
        s3_key = f"{folder}/{file_name}"
        s3_client.upload_file(file_path, bucket_name, s3_key)
        print(f"Uploaded {file_name} to S3 bucket {bucket_name}/{s3_key}.")
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        # Best-effort upload: report and continue so one bad file does not
        # stop a batch.
        print(f"Error uploading {file_label}: {e}")

def upload_files_to_s3(s3_client, file_paths, bucket_name, folder):
    """Upload every file in a name -> path mapping to one S3 bucket/folder.

    Only the mapping's values (the local paths) are used; each is handed to
    ``upload_file_to_s3``, which reports its own success or failure.
    """
    for local_path in file_paths.values():
        upload_file_to_s3(s3_client, local_path, bucket_name, folder)

In [5]:
# ===================== PostgreSQL Integration =====================
def initialize_postgres_engine():
    """Initialize and return a PostgreSQL engine.

    The connection URL is built from the module-level ``db_user``,
    ``db_password``, ``db_host``, ``db_port`` and ``db_name`` settings.

    Returns:
        The SQLAlchemy engine, or None when a required setting is missing or
        the engine cannot be created (the reason is printed, not raised).
    """
    # Function-scope import: urllib.parse is not part of the notebook's
    # import cell.
    from urllib.parse import quote

    required = {
        "POSTGRES_USER": db_user,
        "POSTGRES_HOST": db_host,
        "POSTGRES_PORT": db_port,
        "POSTGRES_DB": db_name,
    }
    missing = [name for name, value in required.items() if not value]
    if missing:
        # Without this check an unset variable is interpolated as the literal
        # text "None" and only fails later with an unrelated-looking error.
        print(f"Error initializing PostgreSQL engine: missing settings {', '.join(missing)}")
        return None
    try:
        # Percent-encode the credentials so characters such as "@", "/" or ":"
        # in a password cannot be mistaken for URL delimiters.
        credentials = quote(str(db_user), safe="")
        if db_password:
            credentials += ":" + quote(str(db_password), safe="")
        connection_string = f"postgresql://{credentials}@{db_host}:{db_port}/{db_name}"
        return create_engine(connection_string)
    except Exception as e:
        print(f"Error initializing PostgreSQL engine: {e}")
        return None

def load_data_to_postgres(engine, file_paths, table_prefix):
    """Load CSV data into PostgreSQL tables with a table name prefix.

    Args:
        engine: SQLAlchemy engine (or connection) passed to
            ``DataFrame.to_sql``. May be None, in which case nothing is loaded.
        file_paths: Mapping of dataset name -> CSV path.
        table_prefix: Prefix for table names; each table is called
            ``"<table_prefix>_<dataset name lower-cased>"`` and replaces any
            existing table of that name.

    Returns:
        None. Progress and failures are printed; one failing dataset does not
        stop the remaining ones.
    """
    if engine is None:
        # initialize_postgres_engine returns None on failure; bail out once
        # instead of failing separately for every dataset.
        print("PostgreSQL engine is not available; skipped loading data.")
        return
    for dataset_name, file_path in file_paths.items():
        try:
            df = pd.read_csv(file_path)
            table_name = f"{table_prefix}_{dataset_name.lower()}"
            df.to_sql(table_name, engine, if_exists="replace", index=False)
            print(f"Uploaded {dataset_name} to PostgreSQL table {table_name}.")
        except SQLAlchemyError as e:
            print(f"Error uploading {dataset_name} to PostgreSQL: {e}")
        except FileNotFoundError:
            print(f"File not found: {file_path}")
        except Exception as e:
            # e.g. a CSV that cannot be parsed; mirrors load_dataframes so the
            # loop carries on with the next dataset.
            print(f"Error loading {dataset_name}: {e}")

In [6]:
# ===================== Data Loading and Cleaning =====================
def load_dataframes(paths):
    """Read each CSV in a name -> path mapping into a DataFrame.

    Datasets that cannot be read are reported with ``print`` and left out of
    the result, so the returned dictionary may have fewer keys than ``paths``.
    """
    loaded = {}
    for dataset, csv_path in paths.items():
        try:
            frame = pd.read_csv(csv_path)
        except FileNotFoundError:
            print(f"File not found: {csv_path}")
        except Exception as e:
            print(f"Error loading {dataset}: {e}")
        else:
            loaded[dataset] = frame
    return loaded

def clean_dataset(df):
    """
    Clean a DataFrame and return the cleaned copy (the input is not modified).

    Steps:
      - Standardize column names: trim, drop the stray "Ã" character, replace
        runs of non-word characters with "_", lower-case.
      - Trim whitespace from text values.
      - Drop duplicate rows.
      - Replace missing values with the text "NA".
      - Remove thousands separators from number-like text and convert a
        column to numeric when every value in it parses.
      - Coerce a "year" column to numeric (unparseable values become NaN).

    Non-text cells inside object columns are left untouched: the string
    operations are applied per value, because the ``.str`` accessor turns
    every non-string cell into NaN and previously wiped the numbers out of
    any numeric column that contained a missing value.
    """
    import re  # function-scope: `re` is not in the notebook's import cell

    # Text that is a number once its commas are removed, e.g. "1,234" or
    # "-12,000.5". Labels such as "Loudoun County, Virginia" do not match and
    # therefore keep their comma, which later cells filter on.
    numeric_text = re.compile(r"[+-]?\d[\d,]*(\.\d+)?")

    def _strip_text(value):
        return value.strip() if isinstance(value, str) else value

    def _drop_thousands_separators(value):
        if isinstance(value, str) and "," in value and numeric_text.fullmatch(value):
            return value.replace(",", "")
        return value

    cleaned = df.copy()
    cleaned.columns = (
        cleaned.columns.str.strip()
        .str.replace("Ã", "", regex=False)
        .str.replace(r"[\W]+", "_", regex=True)
        .str.lower()
    )
    for col in cleaned.select_dtypes(include=["object"]).columns:
        cleaned[col] = cleaned[col].map(_strip_text)
    cleaned = cleaned.drop_duplicates().fillna("NA")
    for col in cleaned.select_dtypes(include=["object"]).columns:
        without_separators = cleaned[col].map(_drop_thousands_separators)
        try:
            cleaned[col] = pd.to_numeric(without_separators)
        except (ValueError, TypeError):
            # Not a purely numeric column (labels, or numbers mixed with "NA"):
            # keep it as is. Replaces the deprecated errors="ignore".
            cleaned[col] = without_separators
    if "year" in cleaned.columns:
        cleaned["year"] = pd.to_numeric(cleaned["year"], errors="coerce")
    return cleaned

def save_and_upload_cleaned_data(paths, output_dir, s3_client, bucket_name):
    """
    Clean every dataset, write it to output_dir and push the file to S3.

    Each cleaned CSV keeps the base name of its raw file and is uploaded under
    the key "cleaned/<base name>". A dataset is recorded in the result as soon
    as its local file is written, so it is still listed when only the upload
    fails. Errors are printed per dataset and do not stop the loop.

    Returns a dictionary mapping dataset names to cleaned file paths.
    """
    saved = {}
    os.makedirs(output_dir, exist_ok=True)
    for dataset_name, raw_path in paths.items():
        try:
            tidy = clean_dataset(pd.read_csv(raw_path))
            file_name = os.path.basename(raw_path)
            local_copy = os.path.join(output_dir, file_name)
            tidy.to_csv(local_copy, index=False)
            saved[dataset_name] = local_copy
            # Upload cleaned file to S3
            s3_key = f"cleaned/{file_name}"
            s3_client.upload_file(local_copy, bucket_name, s3_key)
            print(f"Uploaded cleaned {dataset_name} to S3 bucket {bucket_name}/{s3_key}.")
        except FileNotFoundError:
            print(f"File not found: {raw_path}")
        except Exception as e:
            print(f"Error processing {raw_path}: {e}")
    return saved

## Cell 1: GDP Analysis

In [None]:
def gdp_analysis(cleaned_dfs):
    """Plot GDP growth in Loudoun County.

    Args:
        cleaned_dfs: Mapping of dataset name -> cleaned DataFrame. The "GDP"
            frame needs the columns ``county_name``, ``year`` and either
            ``real_gdp`` or ``real_gdp_thousands_of_chained_2017_dollars``.

    Returns:
        None. Shows a line chart, or prints a message and returns when the
        data is missing or has no usable Loudoun rows. The input frame is not
        modified.
    """
    gdp_df = cleaned_dfs.get("GDP")
    if gdp_df is None:
        print("GDP data not available.")
        return
    # Normalize names on a copy: assigning to gdp_df.columns would rename the
    # columns of the frame shared through cleaned_dfs.
    gdp_df = gdp_df.copy()
    gdp_df.columns = gdp_df.columns.str.strip()
    long_name = "real_gdp_thousands_of_chained_2017_dollars"
    if long_name in gdp_df.columns and "real_gdp" not in gdp_df.columns:
        gdp_df = gdp_df.rename(columns={long_name: "real_gdp"})
    missing = [col for col in ("county_name", "year", "real_gdp") if col not in gdp_df.columns]
    if missing:
        print(f"GDP data is missing required columns: {', '.join(missing)}")
        return
    # Filter for Loudoun County
    loudoun_gdp = gdp_df[gdp_df["county_name"].str.contains("Loudoun", case=False, na=False)].copy()
    loudoun_gdp["real_gdp"] = pd.to_numeric(loudoun_gdp["real_gdp"], errors="coerce")
    gdp_growth = loudoun_gdp[["year", "real_gdp"]].dropna(subset=["real_gdp"]).sort_values("year")
    if gdp_growth.empty:
        print("No GDP data for Loudoun County available.")
        return
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(data=gdp_growth, x="year", y="real_gdp", marker='o', color='blue', ax=ax)
    ax.set_title("GDP Growth in Loudoun County")
    ax.set_xlabel("Year")
    ax.set_ylabel("Real GDP (Thousands of Chained 2017 Dollars)")
    ax.grid(True)
    fig.tight_layout()
    plt.show()


## Cell 2: Employment by Industry Analysis

In [None]:
def employment_by_industry_analysis(cleaned_dfs):
    """Plot employment by industry in Loudoun County.

    Args:
        cleaned_dfs: Mapping of dataset name -> cleaned DataFrame. The
            "Labor_Stats" frame needs ``area_fips``, ``year``,
            ``industry_code`` and ``annual_avg_emplvl``.

    Returns:
        None. Shows one stacked bar chart (years on the x-axis, one segment
        per mapped industry), or prints a message and returns when there is
        nothing to plot.
    """
    labor_df = cleaned_dfs.get("Labor_Stats")
    if labor_df is None:
        print("Labor Stats data not available.")
        return
    # NOTE(review): codes 10, 101 and 102 are labelled as aggregates of the
    # other entries; stacking them with the individual industries presumably
    # double-counts employment — confirm whether they should be excluded.
    industry_mapping = {
        "10": "All Industries",
        "101": "Goods-Producing",
        "102": "Service-Providing",
        "1011": "Natural Resources and Mining",
        "1012": "Construction",
        "1013": "Manufacturing",
        "1021": "Trade, Transportation, and Utilities",
        "1022": "Information",
        "1023": "Financial Activities",
        "1024": "Professional and Business Services",
        "1025": "Education and Health Services",
        "1026": "Leisure and Hospitality",
        "1027": "Other Services",
        "1028": "Public Administration",
    }
    # Compare numerically so the county is found whether area_fips was parsed
    # as integers, floats or digit strings.
    is_loudoun = pd.to_numeric(labor_df["area_fips"], errors="coerce") == 51107
    loudoun_bls = labor_df.loc[is_loudoun, ["year", "industry_code", "annual_avg_emplvl"]].copy()
    loudoun_bls["industry_name"] = loudoun_bls["industry_code"].astype(str).map(industry_mapping)
    loudoun_bls["annual_avg_emplvl"] = pd.to_numeric(loudoun_bls["annual_avg_emplvl"], errors="coerce")
    loudoun_bls = loudoun_bls.dropna(subset=["industry_name", "annual_avg_emplvl"])
    if loudoun_bls.empty:
        print("No Loudoun County employment data to plot.")
        return
    employment = (
        loudoun_bls.groupby(["year", "industry_name"])["annual_avg_emplvl"]
        .sum()
        .reset_index()
        .rename(columns={"annual_avg_emplvl": "employment_count"})
    )
    employment_pivot = employment.pivot(index="year", columns="industry_name", values="employment_count").fillna(0)
    colors = plt.colormaps["tab20"](np.linspace(0, 1, len(employment_pivot.columns)))
    # Draw on an explicit Axes: calling plt.figure() and then DataFrame.plot()
    # without ax= opened a second figure and left the first one blank.
    fig, ax = plt.subplots(figsize=(14, 8))
    employment_pivot.plot(kind="bar", stacked=True, ax=ax, color=colors)
    ax.set_title("Employment by Industry in Loudoun County (2020-2023)", fontsize=16)
    ax.set_xlabel("Year", fontsize=14)
    ax.set_ylabel("Number of Employees", fontsize=14)
    ax.legend(title="Industry", bbox_to_anchor=(1.05, 1), loc="upper left")
    fig.tight_layout()
    plt.show()


## Cell 3: Housing Costs Analysis

In [None]:
def housing_costs_analysis(cleaned_dfs):
    """Plot median housing costs in Loudoun County.

    Args:
        cleaned_dfs: Mapping of dataset name -> cleaned DataFrame. The
            "Housing_Costs" frame needs ``county``, ``year`` and
            ``median_value_dollars``.

    Returns:
        None. Shows a line chart, or prints a message and returns when there
        is nothing to plot. The input frame is not modified.
    """
    housing_df = cleaned_dfs.get("Housing_Costs")
    if housing_df is None:
        print("Housing Costs data not available.")
        return
    # Filter for Loudoun County by matching county name
    loudoun_housing = housing_df[housing_df["county"].str.contains("Loudoun", case=False, na=False)].copy()
    # The value column can arrive as text (the heatmap cell strips commas from
    # it). Plotting text gives a categorical axis in row order, so convert it
    # to numbers; with a numeric axis the former invert_yaxis() call is no
    # longer needed and would draw rising costs as a falling line.
    loudoun_housing["median_value_dollars"] = pd.to_numeric(
        loudoun_housing["median_value_dollars"].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )
    loudoun_housing = loudoun_housing.dropna(subset=["median_value_dollars"])
    if loudoun_housing.empty:
        print("No housing cost data for Loudoun County available.")
        return
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(data=loudoun_housing, x="year", y="median_value_dollars", marker='o', color='purple', ax=ax)
    ax.set_title("Median Housing Costs in Loudoun County")
    ax.set_xlabel("Year")
    ax.set_ylabel("Median Housing Cost (USD)")
    ax.grid(True)
    fig.tight_layout()
    plt.show()


## Cell 4: Population Growth Analysis

In [None]:
def population_growth_analysis(cleaned_dfs):
    """Plot population growth in Loudoun County with a trend line.

    Args:
        cleaned_dfs: Mapping of dataset name -> cleaned DataFrame. The
            "Population_Characteristics" frame needs ``label``, ``year`` and
            ``loudoun_county_virginia``; rows whose label is
            "total population" (case-insensitive) are plotted.

    Returns:
        None. Shows a scatter plot with a regression line, or prints a
        message and returns when there is nothing to plot.
    """
    pop_df = cleaned_dfs.get("Population_Characteristics")
    if pop_df is None:
        print("Population Characteristics data not available.")
        return
    is_total = pop_df["label"].str.lower() == "total population"
    pop_growth = pop_df.loc[is_total, ["year", "loudoun_county_virginia"]].rename(
        columns={"loudoun_county_virginia": "population"}
    )
    # Accept both text ("420,959") and already-numeric values; the previous
    # .str.replace(...).astype(float) raised on numeric columns and on
    # placeholders such as "NA".
    pop_growth["population"] = pd.to_numeric(
        pop_growth["population"].astype(str).str.replace(",", "", regex=False), errors="coerce"
    )
    pop_growth["year"] = pd.to_numeric(pop_growth["year"], errors="coerce")
    pop_growth = pop_growth.dropna(subset=["year", "population"])
    if pop_growth.empty:
        print("No total population data for Loudoun County available.")
        return
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.regplot(data=pop_growth, x="year", y="population",
                scatter_kws={"color": "blue"}, line_kws={"color": "red"}, ax=ax)
    ax.set_title("Population Growth in Loudoun County (with Trend Line)")
    ax.set_xlabel("Year")
    ax.set_ylabel("Population")
    ax.grid(True)
    fig.tight_layout()
    plt.show()


## Cell 5: Business Trends Analysis

In [None]:
def business_trends_analysis(cleaned_dfs):
    """Chart how many business establishments Loudoun County has per year.

    Takes the "Labor_Stats" frame from cleaned_dfs, keeps the rows whose
    area_fips equals 51107 and sums annual_avg_estabs within each year.
    Prints a message and returns when the frame is absent.
    """
    labor_df = cleaned_dfs.get("Labor_Stats")
    if labor_df is None:
        print("Labor Stats data not available.")
        return
    county_rows = labor_df[labor_df["area_fips"] == 51107]
    yearly_totals = county_rows.groupby("year", as_index=False)["annual_avg_estabs"].sum()
    business_trend = yearly_totals.rename(columns={"annual_avg_estabs": "number_of_businesses"})
    years = business_trend["year"]
    plt.figure(figsize=(10, 6))
    plt.plot(years, business_trend["number_of_businesses"], marker="o", linestyle="-", linewidth=2)
    plt.title("Number of Businesses in Loudoun County Over Time", fontsize=16)
    plt.xlabel("Year", fontsize=14)
    plt.ylabel("Number of Businesses", fontsize=14)
    plt.grid(True, linestyle="--", alpha=0.7)
    # One tick per year present in the data.
    plt.xticks(ticks=years, rotation=45)
    plt.tight_layout()
    plt.show()


## Cell 6: Labor Cost Analysis

In [None]:
def labor_cost_analysis(cleaned_dfs):
    """Chart Loudoun County wages and average pay per year on twin y-axes.

    Takes the "Labor_Stats" frame from cleaned_dfs, keeps the rows whose
    area_fips equals 51107 and whose total_annual_wages and avg_annual_pay are
    both positive, then averages the two columns within each year. Total wages
    are drawn on the left axis in billions, average pay on the right axis.
    Prints a message and returns when the frame is absent.
    """
    labor_df = cleaned_dfs.get("Labor_Stats")
    if labor_df is None:
        print("Labor Stats data not available.")
        return
    wage_columns = ["total_annual_wages", "avg_annual_pay"]
    county_rows = labor_df[labor_df["area_fips"] == 51107].copy()
    for column in wage_columns:
        county_rows[column] = pd.to_numeric(county_rows[column], errors="coerce")
    has_wages = county_rows["total_annual_wages"] > 0
    has_pay = county_rows["avg_annual_pay"] > 0
    county_rows = county_rows[has_wages & has_pay]
    yearly_means = county_rows.groupby("year", as_index=False)[wage_columns].mean(numeric_only=True)
    labor_cost_trend = yearly_means.rename(
        columns={"total_annual_wages": "Total Annual Wages", "avg_annual_pay": "Average Annual Pay"}
    )
    years = labor_cost_trend["year"]

    fig, wages_ax = plt.subplots(figsize=(12, 6))
    wages_ax.plot(years, labor_cost_trend["Total Annual Wages"] / 1e9, marker="o",
                  linestyle="-", linewidth=2, color="blue", label="Total Annual Wages (Billions)")
    wages_ax.set_xlabel("Year", fontsize=14)
    wages_ax.set_ylabel("Total Annual Wages (Billions USD)", fontsize=14, color="blue")
    wages_ax.tick_params(axis="y", labelcolor="blue")
    wages_ax.grid(True, linestyle="--", alpha=0.7)

    pay_ax = wages_ax.twinx()
    pay_ax.plot(years, labor_cost_trend["Average Annual Pay"], marker="o",
                linestyle="--", color="green", linewidth=2, label="Average Annual Pay")
    pay_ax.set_ylabel("Average Annual Pay (USD)", fontsize=14, color="green")
    pay_ax.tick_params(axis="y", labelcolor="green")

    # Merge the handles of both axes into a single legend.
    wage_handles, wage_labels = wages_ax.get_legend_handles_labels()
    pay_handles, pay_labels = pay_ax.get_legend_handles_labels()
    pay_ax.legend(wage_handles + pay_handles, wage_labels + pay_labels, loc="upper left")
    fig.suptitle("Cost of Labor in Loudoun County Over Time", fontsize=16)
    fig.tight_layout()
    plt.xticks(ticks=years, rotation=45)
    plt.show()


## Cell 7: Income Trends Analysis

In [None]:
def income_trends_analysis(cleaned_dfs):
    """Plot median household income trends in Loudoun County.

    Args:
        cleaned_dfs: Mapping of dataset name -> cleaned DataFrame. The
            "Income" frame needs ``year`` and ``loudoun_county_virginia``.

    Returns:
        None. Shows a line chart, or prints a message and returns when the
        data or required columns are missing. The input frame is not modified.
    """
    income_df = cleaned_dfs.get("Income")
    if income_df is None:
        print("Income data not available.")
        return
    if not {"year", "loudoun_county_virginia"}.issubset(income_df.columns):
        print("Required columns for income analysis are missing.")
        return
    loudoun_income = income_df[["year", "loudoun_county_virginia"]].copy()
    # Other cells strip commas from this column, i.e. it can arrive as text.
    # Convert it here too, otherwise the y-axis is categorical, not a scale.
    loudoun_income["loudoun_county_virginia"] = pd.to_numeric(
        loudoun_income["loudoun_county_virginia"].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )
    loudoun_income["year"] = pd.to_numeric(loudoun_income["year"], errors="coerce")
    loudoun_income = loudoun_income.dropna().sort_values("year")
    if loudoun_income.empty:
        print("No valid income data available to plot.")
        return
    plt.figure(figsize=(12, 6))
    plt.plot(loudoun_income["year"], loudoun_income["loudoun_county_virginia"], marker="o",
             linestyle="-", color="blue", label="Median Household Income")
    plt.title("Median Household Income in Loudoun County Over Time", fontsize=16)
    plt.xlabel("Year", fontsize=14)
    plt.ylabel("Median Household Income (USD)", fontsize=14)
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.xticks(loudoun_income["year"], rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()


## Cell 8: Peer County Income Heatmap

In [None]:
def peer_county_income_heatmap(income_df):
    """Generate a heatmap of median household income by county for selected peer counties.

    Args:
        income_df: Cleaned income DataFrame with a ``year`` column and one
            column per county listed below. May be None.

    Returns:
        None. Shows a heatmap with counties on the y-axis and years on the
        x-axis, or prints a message and returns when the data or required
        columns are missing.
    """
    if income_df is None:
        print("Income data not available.")
        return
    # Column name -> label shown on the chart. An explicit mapping replaces
    # the chained text replacements, which produced lower-case labels with a
    # double space such as "loudoun  VA".
    county_labels = {
        "loudoun_county_virginia": "Loudoun County, VA",
        "fairfax_county_virginia": "Fairfax County, VA",
        "montgomery_county_maryland": "Montgomery County, MD",
        "santa_clara_county_california": "Santa Clara County, CA",
        "dallas_county_texas": "Dallas County, TX",
        "davidson_county_tennessee": "Davidson County, TN",
        "wake_county_north_carolina": "Wake County, NC",
    }
    counties = list(county_labels)
    missing = [col for col in ["year"] + counties if col not in income_df.columns]
    if missing:
        print(f"Required columns for income heatmap are missing: {', '.join(missing)}")
        return
    selected = income_df[["year"] + counties]
    income_long = selected.melt(id_vars="year", var_name="county", value_name="median_household_income")
    income_long["county"] = income_long["county"].map(county_labels)
    # astype(str) first so the comma stripping also works when the cleaning
    # step already produced numeric columns.
    income_long["median_household_income"] = pd.to_numeric(
        income_long["median_household_income"].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )
    income_pivot = income_long.pivot(index="county", columns="year", values="median_household_income").iloc[::-1]
    fig, ax = plt.subplots(figsize=(14, 8))
    sns.heatmap(income_pivot, annot=True, fmt=",.0f", cmap="YlGnBu", linewidths=0.5, ax=ax)
    # Re-render each annotation with a dollar sign.
    for text in ax.texts:
        value = text.get_text().replace(",", "")
        text.set_text(f'${float(value):,.0f}')
    ax.set_title("Median Household Income Heatmap (2010-2023)", fontsize=16)
    # The pivot puts years in the columns (x) and counties in the rows (y);
    # the two axis labels were previously swapped.
    ax.set_xlabel("Year", fontsize=14)
    ax.set_ylabel("County", fontsize=14)
    fig.tight_layout()
    plt.show()


## Cell 9: Employment Trends by County

In [None]:
def employment_trends_by_county(cleaned_dfs):
    """Plot employment trends by county for selected FIPS codes.

    Args:
        cleaned_dfs: Mapping of dataset name -> cleaned DataFrame. The
            "Labor_Stats" frame needs ``area_fips``, ``year`` and
            ``annual_avg_emplvl``.

    Returns:
        None. Shows one line per selected county (employment summed per year),
        or prints a message and returns when there is nothing to plot. The
        input frame is not modified.
    """
    labor_df = cleaned_dfs.get("Labor_Stats")
    if labor_df is None:
        print("Labor Stats data not available.")
        return
    # Keys are written as area_fips renders after astype(str), hence "6085"
    # without a leading zero.
    # NOTE(review): 51600 is the FIPS code of Fairfax city and 24033 that of
    # Prince George's County; Fairfax County is 51059 and Montgomery County,
    # MD is 24031 — confirm which areas the dataset actually contains.
    selected_fips = {
        "51107": "Loudoun County, VA",
        "51600": "Fairfax County, VA",
        "24033": "Montgomery County, MD",
        "6085": "Santa Clara County, CA",
        "48113": "Dallas County, TX",
        "47037": "Davidson County, TN",
        "37183": "Wake County, NC"
    }
    # .copy() so the columns added below go onto an independent frame rather
    # than a slice of labor_df (matches establishments_trends_analysis).
    filtered = labor_df[labor_df["area_fips"].astype(str).isin(selected_fips.keys())].copy()
    filtered["county"] = filtered["area_fips"].astype(str).map(selected_fips)
    # Summing a text column would concatenate strings instead of adding.
    filtered["annual_avg_emplvl"] = pd.to_numeric(filtered["annual_avg_emplvl"], errors="coerce")
    trends = filtered.groupby(["year", "county"], as_index=False)["annual_avg_emplvl"].sum()
    if not trends.empty:
        plt.figure(figsize=(14, 8))
        sns.lineplot(data=trends, x="year", y="annual_avg_emplvl", hue="county", marker="o")
        plt.title("Employment Trends by County (2020)", fontsize=16)
        plt.xlabel("Year", fontsize=14)
        plt.ylabel("Annual Average Employment Level", fontsize=14)
        plt.grid(True, linestyle="--", alpha=0.7)
        plt.xticks(ticks=trends["year"].unique(), rotation=45)
        plt.legend(title="County", bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.tight_layout()
        plt.show()
    else:
        print("No data to plot for employment trends.")


## Cell 10: GDP Trends by County

In [None]:
def gdp_trends_by_county(gdp_df):
    """Plot GDP trends by county for selected counties.

    Args:
        gdp_df: Cleaned GDP DataFrame with ``county_name``, ``year`` and either
            ``real_gdp`` or ``real_gdp_thousands_of_chained_2017_dollars``.
            May be None.

    Returns:
        None. Shows one line per selected county, or prints a message and
        returns when the data is missing or none of the counties is present.
    """
    if gdp_df is None:
        print("GDP data not available.")
        return
    # gdp_analysis renames the long column only on its own local frame, so the
    # frame passed here may still carry the long name; accept both.
    long_name = "real_gdp_thousands_of_chained_2017_dollars"
    if long_name in gdp_df.columns and "real_gdp" not in gdp_df.columns:
        gdp_df = gdp_df.rename(columns={long_name: "real_gdp"})
    missing = [col for col in ("county_name", "year", "real_gdp") if col not in gdp_df.columns]
    if missing:
        print(f"GDP data is missing required columns: {', '.join(missing)}")
        return
    counties = ["Loudoun, VA", "Fairfax, VA", "Montgomery, MD", "Santa Clara, CA", "Dallas, TX", "Davidson, TN", "Wake, NC"]
    filtered = gdp_df.loc[gdp_df["county_name"].isin(counties), ["county_name", "year", "real_gdp"]].copy()
    filtered["real_gdp"] = pd.to_numeric(filtered["real_gdp"], errors="coerce")
    filtered = filtered.dropna(subset=["real_gdp"]).sort_values(by=["county_name", "year"])
    if filtered.empty:
        print("No GDP data to plot for the selected counties.")
        return
    plt.figure(figsize=(14, 8))
    sns.lineplot(data=filtered, x="year", y="real_gdp", hue="county_name", marker="o")
    plt.title("GDP Trends by County (2001-2023)", fontsize=16)
    plt.xlabel("Year", fontsize=14)
    plt.ylabel("Real GDP (Thousands of Chained 2017 Dollars)", fontsize=14)
    plt.grid(True, linestyle="--", alpha=0.7)
    # loc="upper left" pins the legend's corner to the anchor so it sits
    # outside the plot, as in the other county charts.
    plt.legend(title="County", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.tight_layout()
    plt.show()


## Cell 11: Housing Cost Heatmap

In [None]:
def housing_cost_heatmap(cleaned_dfs):
    """Generate a heatmap of median housing costs by county.

    Args:
        cleaned_dfs: Mapping of dataset name -> cleaned DataFrame. The
            "Housing_Costs" frame needs ``county``, ``year`` and
            ``median_value_dollars`` (column names are matched
            case-insensitively).

    Returns:
        None. Shows a heatmap with counties on the y-axis and years on the
        x-axis, or prints a message and returns when there is nothing to plot.
        The input frame is not modified, so the cell can be re-run.
    """
    housing_df = cleaned_dfs.get("Housing_Costs")
    if housing_df is None:
        print("Housing Costs data not available.")
        return
    # Work on a copy: converting the column on the shared frame made a second
    # call fail, because .str no longer applied to the now-numeric values.
    housing = housing_df.copy()
    housing.columns = housing.columns.str.lower()
    # astype(str) first so both text and numeric input are accepted;
    # unparseable entries such as "NA" become NaN instead of raising.
    housing["median_value_dollars"] = pd.to_numeric(
        housing["median_value_dollars"].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )
    counties = [
        "Santa Clara County, California",
        "Montgomery County, Maryland",
        "Wake County, North Carolina",
        "Davidson County, Tennessee",
        "Dallas County, Texas",
        "Fairfax County, Virginia",
        "Loudoun County, Virginia"
    ]
    filtered = housing[housing["county"].isin(counties)]
    if filtered.empty:
        print("No housing cost data to plot for the selected counties.")
        return
    pivot_df = filtered.pivot(index="county", columns="year", values="median_value_dollars")
    fig, ax = plt.subplots(figsize=(14, 8))
    sns.heatmap(pivot_df, cmap="coolwarm", annot=True, fmt=".0f", linewidths=0.5,
                cbar_kws={"label": "Median Housing Cost (USD)"}, ax=ax)
    # Re-render each annotation as a dollar amount with thousands separators.
    for text in ax.texts:
        value = text.get_text().replace(",", "")
        text.set_text('${:,.0f}'.format(float(value)))
    ax.set_title("Median Housing Costs by County Over Time", fontsize=16)
    ax.set_xlabel("Year", fontsize=14)
    ax.set_ylabel("County", fontsize=14)
    fig.tight_layout()
    plt.show()


## Cell 12: Establishments Trends Analysis

In [None]:
def establishments_trends_analysis(cleaned_dfs):
    """Draw a stacked bar chart of establishment counts per year and county.

    Takes the "Labor_Stats" frame from cleaned_dfs, keeps the rows whose
    area_fips (as text) is one of the selected codes, sums annual_avg_estabs
    per year and county, and stacks the counties within each year's bar.
    Prints a message and returns when the frame is absent.
    """
    labor_df = cleaned_dfs.get("Labor_Stats")
    if labor_df is None:
        print("Labor Stats data not available.")
        return
    # NOTE(review): 51600 is the FIPS code of Fairfax city and 24033 that of
    # Prince George's County; Fairfax County is 51059 and Montgomery County,
    # MD is 24031 — confirm which areas the dataset actually contains.
    selected_fips = {
        "48113": "Dallas County, TX",
        "47037": "Davidson County, TN",
        "51600": "Fairfax County, VA",
        "51107": "Loudoun County, VA",
        "24033": "Montgomery County, MD",
        "6085": "Santa Clara County, CA",
        "37183": "Wake County, NC",
    }
    in_selection = labor_df["area_fips"].astype(str).isin(selected_fips.keys())
    peer_rows = labor_df[in_selection].copy()
    peer_rows["county"] = peer_rows["area_fips"].astype(str).map(selected_fips)
    yearly_by_county = peer_rows.groupby(["year", "county"], as_index=False)["annual_avg_estabs"].sum()
    wide = yearly_by_county.pivot(index="year", columns="county", values="annual_avg_estabs")
    wide = wide.fillna(0)
    # DataFrame.plot opens its own figure, which then becomes the target of
    # the pyplot calls below.
    wide.plot(kind="bar", stacked=True, figsize=(14, 8), colormap="GnBu", edgecolor="black")
    plt.title("Average Number of Establishments by County (2010-2023)", fontsize=16)
    plt.xlabel("Year", fontsize=14)
    plt.ylabel("Average Number of Establishments", fontsize=14)
    plt.legend(title="County", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()


## Cell 13: GDP Predictions

In [None]:
def gdp_predictions(gdp_df):
    """Predict future GDP for Loudoun County using linear regression.

    Args:
        gdp_df: Cleaned GDP DataFrame with ``county_name``, ``year`` and either
            ``real_gdp`` or ``real_gdp_thousands_of_chained_2017_dollars``.
            May be None.

    Returns:
        None. Fits a straight line to the county's yearly values, plots the
        history plus a ten-year extrapolation, or prints a message and returns
        when the data cannot be used.
    """
    if gdp_df is None:
        print("GDP data not available.")
        return
    # gdp_analysis renames the long column only on its own local frame, so the
    # frame passed here may still carry the long name; accept both.
    long_name = "real_gdp_thousands_of_chained_2017_dollars"
    if long_name in gdp_df.columns and "real_gdp" not in gdp_df.columns:
        gdp_df = gdp_df.rename(columns={long_name: "real_gdp"})
    loudoun_gdp = gdp_df[gdp_df['county_name'].str.contains("Loudoun", case=False, na=False)]
    if loudoun_gdp.empty:
        print("No GDP data for Loudoun County available.")
        return
    if 'real_gdp' not in loudoun_gdp.columns:
        print("Column 'real_gdp' is missing in GDP data.")
        return
    # The regression rejects NaN, and the forecast must start after the latest
    # year, so coerce, drop unusable rows and order by year first.
    history = loudoun_gdp[['year', 'real_gdp']].apply(pd.to_numeric, errors="coerce").dropna().sort_values('year')
    if len(history) < 2:
        print("Not enough valid GDP observations for Loudoun County to fit a trend.")
        return
    years = history['year'].values.reshape(-1, 1)
    real_gdp = history['real_gdp'].values
    model = LinearRegression()
    model.fit(years, real_gdp)
    # Start from the maximum year rather than whichever row happened to be
    # last in the input.
    last_year = int(history['year'].max())
    future_years = np.arange(last_year + 1, last_year + 11).reshape(-1, 1)
    future_gdp = model.predict(future_years)
    plt.figure(figsize=(10, 6))
    plt.plot(history['year'], real_gdp, label='Historical GDP', marker='o')
    plt.plot(future_years.ravel(), future_gdp, label='Predicted GDP', linestyle='--', color='red')
    plt.xlabel('Year')
    plt.ylabel('Real GDP (Thousands of 2017 Dollars)')
    plt.title('Loudoun County Real GDP Trends and Predictions')
    plt.legend()
    plt.grid()
    plt.tight_layout()
    plt.show()


## Cell 14: Income Predictions

In [None]:
def income_predictions(income_df):
    """Forecast future median household income for Loudoun County using ARIMA.

    Args:
        income_df: Cleaned income DataFrame with ``year`` and
            ``loudoun_county_virginia``. May be None.

    Returns:
        None. Fits an ARIMA(1, 1, 1) model to the yearly values, plots the
        history plus a ten-step forecast, or prints a message and returns when
        the data cannot be used. The input frame is not modified, so the cell
        can be re-run.
    """
    if income_df is None:
        print("Income data not available.")
        return
    if not {'year', 'loudoun_county_virginia'}.issubset(income_df.columns):
        print("Required columns for income forecasting are missing.")
        return
    # Convert on a copy: overwriting the caller's column made a second run
    # fail, because .str no longer applied to the now-numeric values.
    income_clean = income_df[['year', 'loudoun_county_virginia']].copy()
    income_clean['loudoun_county_virginia'] = pd.to_numeric(
        income_clean['loudoun_county_virginia'].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )
    income_clean['year'] = pd.to_numeric(income_clean['year'], errors="coerce")
    # The model treats row order as time order, so sort by year explicitly.
    income_clean = income_clean.dropna().sort_values('year').reset_index(drop=True)
    if income_clean.empty:
        print("No valid income data available for forecasting.")
        return
    arima_model = ARIMA(income_clean['loudoun_county_virginia'], order=(1, 1, 1))
    arima_fit = arima_model.fit()
    forecast_steps = 10
    forecast = arima_fit.forecast(steps=forecast_steps)
    last_year = int(income_clean['year'].max())
    future_years = np.arange(last_year + 1, last_year + forecast_steps + 1)
    plt.figure(figsize=(10, 6))
    plt.plot(income_clean['year'], income_clean['loudoun_county_virginia'], label='Historical Income', marker='o')
    plt.plot(future_years, forecast, label='Predicted Income', linestyle='--', color='red')
    plt.xlabel('Year')
    plt.ylabel('Median Household Income (USD)')
    plt.title('Loudoun County Household Income Trends and Predictions (ARIMA)')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# Main Workflow Cell
# Runs every analysis and forecast against the cleaned_dataframes dictionary
# (dataset name -> cleaned DataFrame).
# No earlier cell creates cleaned_dataframes, so on a fresh kernel this cell
# used to stop with a NameError. Build it from the raw files when it does not
# exist yet; a dictionary prepared by another cell is used unchanged.
if "cleaned_dataframes" not in globals():
    cleaned_dataframes = {
        dataset_name: clean_dataset(raw_df)
        for dataset_name, raw_df in load_dataframes(file_paths).items()
    }
gdp_analysis(cleaned_dataframes)
employment_by_industry_analysis(cleaned_dataframes)
housing_costs_analysis(cleaned_dataframes)
population_growth_analysis(cleaned_dataframes)
business_trends_analysis(cleaned_dataframes)
labor_cost_analysis(cleaned_dataframes)
income_trends_analysis(cleaned_dataframes)
peer_county_income_heatmap(cleaned_dataframes.get("Income"))
employment_trends_by_county(cleaned_dataframes)
gdp_trends_by_county(cleaned_dataframes.get("GDP"))
housing_cost_heatmap(cleaned_dataframes)
establishments_trends_analysis(cleaned_dataframes)
gdp_predictions(cleaned_dataframes.get("GDP"))
income_predictions(cleaned_dataframes.get("Income"))
