In [3]:
import pandas as pd
import os

def clean_datasets(schema_definitions, dataset_paths, output_folder):
    """Align each CSV file to its table's schema and save the result.

    For every table in ``dataset_paths`` each listed CSV is loaded, columns
    named in the schema but absent from the file are added as empty columns,
    columns not named in the schema are dropped, and the remaining columns
    are arranged in schema order. The result is written to ``output_folder``
    under the input file's base name.

    Args:
        schema_definitions: Mapping of table name -> sequence of expected
            column names.
        dataset_paths: Mapping of table name -> iterable of CSV file paths.
        output_folder: Directory that receives the cleaned files. It is
            created if it does not exist yet.

    Returns:
        None. Progress and per-file errors are reported with ``print``.

    Notes:
        * Tables with no schema entry (or an empty one) are skipped with a
          message; selecting zero columns would otherwise write a file with
          all data removed.
        * Output files are named by base name only, so two inputs sharing a
          file name overwrite each other in ``output_folder``.
        * Failures are caught per file and printed, so one unreadable file
          does not stop the rest of the batch.
    """
    # An empty output_folder means "current directory" for os.path.join below,
    # and os.makedirs("") would raise, so only create non-empty paths.
    if output_folder:
        try:
            os.makedirs(output_folder, exist_ok=True)
        except OSError as e:
            print(f"Error creating output folder {output_folder}: {e}")
            return

    for table_name, file_paths in dataset_paths.items():
        expected_columns = schema_definitions.get(table_name)
        if expected_columns is None or len(expected_columns) == 0:
            print(f"Skipping table '{table_name}': no schema columns defined")
            continue
        expected_columns = list(expected_columns)
        expected_lookup = set(expected_columns)

        for file_path in file_paths:
            try:
                df = pd.read_csv(file_path)

                # Record how the file deviates from the schema so it can be reported.
                missing_columns = [col for col in expected_columns if col not in df.columns]
                extra_columns = [col for col in df.columns if col not in expected_lookup]

                # reindex adds missing columns (empty), drops extras, and
                # orders the columns to match the schema in one step.
                cleaned = df.reindex(columns=expected_columns)

                output_path = os.path.join(output_folder, os.path.basename(file_path))
                cleaned.to_csv(output_path, index=False)
                print(f"Cleaned file saved: {output_path}")
                if missing_columns or extra_columns:
                    print(
                        f"  added missing columns: {missing_columns}; "
                        f"dropped extra columns: {extra_columns}"
                    )

            except Exception as e:
                # Deliberate best-effort: report and continue with the next file.
                print(f"Error cleaning file {file_path}: {e}")

# Run the cleaner over the notebook's datasets and write the results to fixed_files.
# NOTE(review): `schema_definitions` and `cleaned_files` are not defined in this
# cell; presumably earlier cells create them -- confirm on a fresh kernel run.
output_folder = r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\cleaned_files\fixed_files"
os.makedirs(output_folder, exist_ok=True)  # no error if the folder already exists

clean_datasets(
    schema_definitions=schema_definitions,
    dataset_paths=cleaned_files,
    output_folder=output_folder,
)

Index(['geofips', 'geoname', 'linecode', 'description', '2001', '2002', '2003',
       '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021',
       '2022', 'data_type'],
      dtype='object')


In [5]:
import pandas as pd

# Load cleaned_2024_q1_retail_trade.csv from the cleaned_files folder into `df`.
# NOTE(review): the path is absolute and specific to one Windows user profile, and
# `df` is rebound by each of the column-inspection cells in this notebook.
df = pd.read_csv(r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\cleaned_files\cleaned_2024_q1_retail_trade.csv")

# Print the column Index so the file's header can be checked by eye.
print(df.columns)


Index(['area_fips', 'own_code', 'industry_code', 'agglvl_code', 'size_code',
       'year', 'qtr', 'qtrly_estabs', 'month1_emplvl', 'month2_emplvl',
       'month3_emplvl', 'total_qtrly_wages', 'taxable_qtrly_wages',
       'qtrly_contributions', 'avg_wkly_wage', 'lq_qtrly_estabs',
       'lq_month1_emplvl', 'lq_month2_emplvl', 'lq_month3_emplvl',
       'lq_total_qtrly_wages', 'lq_taxable_qtrly_wages',
       'lq_qtrly_contributions', 'lq_avg_wkly_wage', 'oty_qtrly_estabs_chg',
       'oty_qtrly_estabs_pct_chg', 'oty_month1_emplvl_chg',
       'oty_month1_emplvl_pct_chg', 'oty_month2_emplvl_chg',
       'oty_month2_emplvl_pct_chg', 'oty_month3_emplvl_chg',
       'oty_month3_emplvl_pct_chg', 'oty_total_qtrly_wages_chg',
       'oty_total_qtrly_wages_pct_chg', 'oty_taxable_qtrly_wages_chg',
       'oty_taxable_qtrly_wages_pct_chg', 'oty_qtrly_contributions_chg',
       'oty_qtrly_contributions_pct_chg', 'oty_avg_wkly_wage_chg',
       'oty_avg_wkly_wage_pct_chg', 'data_type'],
      dt

In [7]:
import pandas as pd

# Load cleaned_2020_housing_occupancy.csv from the cleaned_files folder into `df`.
# NOTE(review): the path is absolute and specific to one Windows user profile, and
# `df` is rebound by each of the column-inspection cells in this notebook.
df = pd.read_csv(r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\cleaned_files\cleaned_2020_housing_occupancy.csv")

# Print the column Index so the file's header can be checked by eye.
print(df.columns)


Index(['label_grouping', 'santa_clara_county_californiaestimate',
       'santa_clara_county_californiamargin_of_error',
       'montgomery_county_marylandestimate',
       'montgomery_county_marylandmargin_of_error',
       'wake_county_north_carolinaestimate',
       'wake_county_north_carolinamargin_of_error',
       'davidson_county_tennesseeestimate',
       'davidson_county_tennesseemargin_of_error',
       'dallas_county_texasestimate', 'dallas_county_texasmargin_of_error',
       'fairfax_county_virginiaestimate',
       'fairfax_county_virginiamargin_of_error',
       'loudoun_county_virginiaestimate',
       'loudoun_county_virginiamargin_of_error', 'data_type'],
      dtype='object')


In [8]:
import pandas as pd

# Load cleaned_2020_household_income.csv from the cleaned_files folder into `df`.
# NOTE(review): the path is absolute and specific to one Windows user profile, and
# `df` is rebound by each of the column-inspection cells in this notebook.
df = pd.read_csv(r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\cleaned_files\cleaned_2020_household_income.csv")
# Print the column Index so the file's header can be checked by eye.
print(df.columns)

Index(['label_grouping', 'santa_clara_county_californiahouseholdsestimate',
       'santa_clara_county_californiahouseholdsmargin_of_error',
       'santa_clara_county_californiafamiliesestimate',
       'santa_clara_county_californiafamiliesmargin_of_error',
       'santa_clara_county_californiamarriedcouple_familiesestimate',
       'santa_clara_county_californiamarriedcouple_familiesmargin_of_error',
       'santa_clara_county_californianonfamily_householdsestimate',
       'santa_clara_county_californianonfamily_householdsmargin_of_error',
       'montgomery_county_marylandhouseholdsestimate',
       'montgomery_county_marylandhouseholdsmargin_of_error',
       'montgomery_county_marylandfamiliesestimate',
       'montgomery_county_marylandfamiliesmargin_of_error',
       'montgomery_county_marylandmarriedcouple_familiesestimate',
       'montgomery_county_marylandmarriedcouple_familiesmargin_of_error',
       'montgomery_county_marylandnonfamily_householdsestimate',
       'montgo

In [9]:
import pandas as pd

# Load cleaned_2020_population_age_sex.csv from the cleaned_files folder into `df`.
# NOTE(review): the path is absolute and specific to one Windows user profile, and
# `df` is rebound by each of the column-inspection cells in this notebook.
df = pd.read_csv(r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\cleaned_files\cleaned_2020_population_age_sex.csv")
# Print the column Index so the file's header can be checked by eye.
print(df.columns)

Index(['label_grouping', 'santa_clara_county_californiatotalestimate',
       'santa_clara_county_californiatotalmargin_of_error',
       'santa_clara_county_californiapercentestimate',
       'santa_clara_county_californiapercentmargin_of_error',
       'santa_clara_county_californiamaleestimate',
       'santa_clara_county_californiamalemargin_of_error',
       'santa_clara_county_californiapercent_maleestimate',
       'santa_clara_county_californiapercent_malemargin_of_error',
       'santa_clara_county_californiafemaleestimate',
       'santa_clara_county_californiafemalemargin_of_error',
       'santa_clara_county_californiapercent_femaleestimate',
       'santa_clara_county_californiapercent_femalemargin_of_error',
       'montgomery_county_marylandtotalestimate',
       'montgomery_county_marylandtotalmargin_of_error',
       'montgomery_county_marylandpercentestimate',
       'montgomery_county_marylandpercentmargin_of_error',
       'montgomery_county_marylandmaleestimate',
 

In [10]:
import pandas as pd

# Load cleaned_2020_CBP.csv from the cleaned_files folder into `df`.
# NOTE(review): the path is absolute and specific to one Windows user profile, and
# `df` is rebound by each of the column-inspection cells in this notebook.
df = pd.read_csv(r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\cleaned_files\cleaned_2020_CBP.csv")
# Print the column Index so the file's header can be checked by eye.
print(df.columns)

Index(['geographic_area_name_name', '2017_naics_code_naics2017',
       'meaning_of_naics_code_naics2017_label',
       'meaning_of_legal_form_of_organization_code_lfo_label',
       'meaning_of_employment_size_of_establishments_code_empszes_label',
       'year_year', 'number_of_establishments_estab',
       'annual_payroll_1000_payann', 'firstquarter_payroll_1000_payqtr1',
       'number_of_employees_emp', 'data_type'],
      dtype='object')


In [11]:
import pandas as pd

# Load cleaned_2020_decennial_population_housing.csv from the cleaned_files folder into `df`.
# NOTE(review): the path is absolute and specific to one Windows user profile, and
# `df` is rebound by each of the column-inspection cells in this notebook.
df = pd.read_csv(r"C:\Users\AKKem\OneDrive\Desktop\Data Analysis Modules\Projects\Loudoun_Growth_Study\data\cleaned_files\cleaned_2020_decennial_population_housing.csv")
# Print the column Index so the file's header can be checked by eye.
print(df.columns)

Index(['label_grouping', 'loudoun_county_virginiacount',
       'loudoun_county_virginiapercent', 'data_type'],
      dtype='object')
