# Data Cleaning

In [2]:
import os
import pandas as pd

In [3]:
# Directory layout: each dataset has a raw input folder and a matching
# clean output folder, both given relative to this notebook.
corn_raw_path, corn_clean_path = (
    r'../data/raw_data/corn_data/',
    r'../data/clean_data/corn_data/',
)
soybean_raw_path, soybean_clean_path = (
    r'../data/raw_data/soybean_data/',
    r'../data/clean_data/soybean_data/',
)
economic_raw_path, economic_clean_path = (
    r'../data/raw_data/economic_data/',
    r'../data/clean_data/economic_data/',
)

In [4]:
# Define the function to standardize state names
def standardize_state_names(df):
    """
    Return `df` with the text values of its "State" column stripped and title-cased.

    The input frame is never modified: when a "State" column exists, a new
    DataFrame carrying the cleaned column is returned; otherwise `df` itself
    is returned unchanged.

    Parameters:
        df: pandas DataFrame that may contain a "State" column.

    Returns:
        A DataFrame with the same columns, in the same order, as `df`.

    Note:
        Title-casing capitalizes every word, so "district of columbia"
        becomes "District Of Columbia".
    """
    if "State" not in df.columns:
        return df

    def _clean(value):
        # Only real strings are normalized; NaN and other non-text values pass
        # through untouched instead of raising or being replaced by NaN.
        return value.strip().title() if isinstance(value, str) else value

    # assign() builds a new frame, so the caller's DataFrame keeps its raw values.
    return df.assign(State=df["State"].map(_clean))

In [5]:
# Function to standardize economic data
def standardize_economic_data(file_path, output_path, is_poverty_file=False):
    """
    Clean one economic CSV file, write it to `output_path` and return the cleaned frame.

    - Regular files: the 'GeoName' column is renamed to 'State' (skipped when a
      'State' column already exists, so no duplicate column is created).
    - Poverty file: the long table is pivoted to one row per 'State' and one
      column per 'Year', holding the 'Number in poverty' values. All other
      columns are left out of the result.

    In both cases the text values of 'State' are stripped and title-cased before
    the file is written, the same normalization standardize_state_names applies
    to the crop data.

    Parameters:
        file_path: path of the raw CSV file to read.
        output_path: path of the cleaned CSV file to write (no index column).
        is_poverty_file: True to apply the poverty-file pivot.

    Returns:
        The cleaned pandas DataFrame that was written to `output_path`.
    """
    df = pd.read_csv(file_path)

    if not is_poverty_file and "GeoName" in df.columns and "State" not in df.columns:
        df = df.rename(columns={"GeoName": "State"})

    if "State" in df.columns:
        # Normalize before pivoting so spelling variants of one state end up in
        # the same row; non-text values (e.g. NaN) are passed through untouched.
        df["State"] = df["State"].map(
            lambda name: name.strip().title() if isinstance(name, str) else name
        )

    if is_poverty_file:
        # pivot() only reads the three named columns, so no explicit drop of
        # the remaining columns is needed beforehand.
        df = df.pivot(index="State", columns="Year", values="Number in poverty").reset_index()

    df.to_csv(output_path, index=False)
    return df

In [6]:
# Function to process and save standardized files
def process_and_standardize_files(data_path, output_path, is_economic_data=False):
    """
    Clean every file found directly in `data_path` and write it, under the same
    file name, to `output_path`.

    Hidden entries (names starting with '.') and sub-directories are skipped.
    `output_path` is created when it does not exist yet.

    Parameters:
        data_path: directory holding the raw CSV files.
        output_path: directory that receives the cleaned CSV files.
        is_economic_data: when True each file is handled by
            standardize_economic_data ("poverty_levels.csv" is treated as the
            poverty file); otherwise the file is loaded, passed through
            standardize_state_names and saved without the index column.

    Returns:
        None. The cleaned files on disk are the result.
    """
    os.makedirs(output_path, exist_ok=True)

    # os.listdir also returns folders; only regular, non-hidden files are CSV
    # candidates. Sorting keeps the processing order reproducible.
    file_names = sorted(
        name
        for name in os.listdir(data_path)
        if not name.startswith('.') and os.path.isfile(os.path.join(data_path, name))
    )

    for file_name in file_names:
        source_file = os.path.join(data_path, file_name)
        target_file = os.path.join(output_path, file_name)

        if is_economic_data:
            # The economic helper reads and writes the file itself, so loading
            # it here as well would only be wasted work.
            standardize_economic_data(
                source_file,
                output_path=target_file,
                is_poverty_file=(file_name == "poverty_levels.csv"),
            )
            continue

        cleaned = standardize_state_names(pd.read_csv(source_file))
        cleaned.to_csv(target_file, index=False)

In [7]:
# Create the clean-data folders up front (a no-op for folders that already exist)
for clean_dir in (corn_clean_path, soybean_clean_path, economic_clean_path):
    os.makedirs(clean_dir, exist_ok=True)

In [8]:
# Run the cleaning step for each dataset: (raw folder, clean folder, is economic data?)
cleaning_jobs = (
    (corn_raw_path, corn_clean_path, False),
    (soybean_raw_path, soybean_clean_path, False),
    (economic_raw_path, economic_clean_path, True),
)
for raw_dir, cleaned_dir, economic_flag in cleaning_jobs:
    process_and_standardize_files(raw_dir, cleaned_dir, is_economic_data=economic_flag)

print("Data processing and standardization complete!")

Data processing and standardization complete!
