In [None]:
import pandas as pd
from pathlib import Path
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define the input and output paths.
# NOTE(review): input_file is an absolute, machine-specific path; adjust it
# before running this cell on another machine.
input_file = Path("/Users/magic-rabbit/Documents/MA-data/NREL_Residential_typology.tsv")
output_dir = Path("county_data")
output_dir.mkdir(exist_ok=True)

# Read the TSV in fixed-size chunks so the whole file never has to be held in
# memory at once.
chunk_size = 100000
# Counties whose parquet file has already been (re)created during THIS run.
processed_counties = set()

logger.info("Starting to process the TSV file...")

for chunk in pd.read_csv(input_file, sep="\t", chunksize=chunk_size):
    # Convert county to string for consistent naming
    chunk['in.county'] = chunk['in.county'].astype(str)
    # Log only the chunk size; logging the DataFrame itself dumps its full
    # repr into the notebook output for every chunk.
    logger.info(f"Read chunk with {len(chunk)} rows")

    # Process each county in the current chunk
    for county_id, group in chunk.groupby('in.county'):
        output_file = output_dir / f"county_{county_id}.parquet"
        if county_id in processed_counties:
            # The county already appeared in an earlier chunk. Its rows can be
            # split across chunks, so merge the new rows into the file written
            # earlier instead of dropping them.
            previous_rows = pd.read_parquet(output_file)
            group = pd.concat([previous_rows, group], ignore_index=True)
        else:
            # First sighting in this run: the write below replaces any stale
            # file left over from a previous run.
            processed_counties.add(county_id)
        group.to_parquet(output_file, index=False)
        logger.info(f"Saved data for county {county_id} ({len(group)} rows total)")

    # Optional: Add a progress indicator
    logger.info(f"Processed {len(processed_counties)} unique counties so far")

logger.info("Finished processing all counties")

In [None]:


# Load a single county's data
import pandas as pd

# Load one county file (using G2002070 as an example)
# NOTE(review): absolute, machine-specific path; presumably this is the
# "county_data" directory written by the splitting cell -- confirm and adjust
# when running on another machine.
county_data = pd.read_parquet("/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/notebooks/county_data/county_G2002070.parquet")

# Display basic information about the data
print("Data shape:", county_data.shape)
print("\nColumns:", county_data.columns.tolist())
print("\nFirst few rows:")
display(county_data.head())


# Subset of columns to inspect side by side
columns_to_show = [
    'in.county',
    'in.state',
    'in.puma',
    'in.ahs_region',
    'in.american_housing_survey_region',
    'in.resstock_county_id',
    'in.resstock_puma_id',
    'in.nhgis_county_gisjoin',
    'in.nhgis_puma_gisjoin',
    'in.state_name'
]

# Display up to 3 random rows of the selected columns. The sample size is
# clamped to the number of available rows because DataFrame.sample raises a
# ValueError when n exceeds the row count (without replacement).
sample_size = min(3, len(county_data))
display(county_data[columns_to_show].sample(n=sample_size, random_state=1))

# Show some basic statistics
print("\nBasic statistics:")
display(county_data.describe())

In [None]:
import pandas as pd
from pathlib import Path
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define the input and output paths.
# NOTE(review): input_file is an absolute, machine-specific path; adjust it
# before running this cell on another machine.
input_file = Path("/Users/magic-rabbit/Library/Mobile Documents/com~apple~CloudDocs/01_University/06_MIT_Stay/Master_Thesis/Data/NREL_Residential_typology.tsv")
output_dir = Path("county_data")
output_dir.mkdir(exist_ok=True)

# Read the TSV in fixed-size chunks so the whole file never has to be held in
# memory at once.
chunk_size = 100000
# Counties whose parquet file has already been (re)created during THIS run.
processed_counties = set()

logger.info("Starting to process the TSV file...")

for chunk in pd.read_csv(input_file, sep="\t", chunksize=chunk_size):
    # Convert county to string for consistent naming
    chunk['in.county'] = chunk['in.county'].astype(str)

    # Process each county in the current chunk
    for county_id, group in chunk.groupby('in.county'):
        output_file = output_dir / f"county_{county_id}.parquet"
        if county_id in processed_counties:
            # The county already appeared in an earlier chunk. Its rows can be
            # split across chunks, so merge the new rows into the file written
            # earlier instead of dropping them.
            previous_rows = pd.read_parquet(output_file)
            group = pd.concat([previous_rows, group], ignore_index=True)
        else:
            # First sighting in this run: the write below replaces any stale
            # file left over from a previous run.
            processed_counties.add(county_id)
        group.to_parquet(output_file, index=False)
        logger.info(f"Saved data for county {county_id} ({len(group)} rows total)")

    # Optional: Add a progress indicator
    logger.info(f"Processed {len(processed_counties)} unique counties so far")

logger.info("Finished processing all counties")