In [2]:
import pandas as pd
from pathlib import Path
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define the input and output paths
input_file = Path("/Users/magic-rabbit/Library/Mobile Documents/com~apple~CloudDocs/01_University/06_MIT_Stay/Master_Thesis/Data/NREL_Residential_typology.tsv")
output_dir = Path("county_data")
output_dir.mkdir(exist_ok=True)

# Process the file in chunks
chunk_size = 100000
processed_counties = set()

logger.info("Starting to process the TSV file...")

for chunk in pd.read_csv(input_file, sep="\t", chunksize=chunk_size):
    # Convert county to string for consistent naming
    chunk['in.county'] = chunk['in.county'].astype(str)
    
    # Process each county in the current chunk
    for county_id, group in chunk.groupby('in.county'):
        if county_id not in processed_counties:
            output_file = output_dir / f"county_{county_id}.parquet"
            group.to_parquet(output_file, index=False)
            processed_counties.add(county_id)
            logger.info(f"Saved data for county {county_id}")
    
    # Optional: Add a progress indicator
    logger.info(f"Processed {len(processed_counties)} unique counties so far")

logger.info("Finished processing all counties")

INFO:__main__:Starting to process the TSV file...
INFO:__main__:Saved data for county G0100010
INFO:__main__:Saved data for county G0100030
INFO:__main__:Saved data for county G0100050
INFO:__main__:Saved data for county G0100070
INFO:__main__:Saved data for county G0100090
INFO:__main__:Saved data for county G0100110
INFO:__main__:Saved data for county G0100130
INFO:__main__:Saved data for county G0100150
INFO:__main__:Saved data for county G0100170
INFO:__main__:Saved data for county G0100190
INFO:__main__:Saved data for county G0100210
INFO:__main__:Saved data for county G0100230
INFO:__main__:Saved data for county G0100250
INFO:__main__:Saved data for county G0100270
INFO:__main__:Saved data for county G0100290
INFO:__main__:Saved data for county G0100310
INFO:__main__:Saved data for county G0100330
INFO:__main__:Saved data for county G0100350
INFO:__main__:Saved data for county G0100370
INFO:__main__:Saved data for county G0100390
INFO:__main__:Saved data for county G0100410
INFO:

In [3]:


# Load a single county's data
import pandas as pd

# Load one county file (using G0100370 as an example)
county_data = pd.read_parquet("/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/data_exploration/county_data/county_G0100130.parquet")

# Display basic information about the data
print("Data shape:", county_data.shape)
print("\nColumns:", county_data.columns.tolist())
print("\nFirst few rows:")
display(county_data.head())

# Show some basic statistics
print("\nBasic statistics:")
display(county_data.describe())

Data shape: (9, 275)

Columns: ['Unnamed: 0', 'bldg_id', 'in.county', 'in.puma', 'in.ashrae_iecc_climate_zone_2004', 'in.building_america_climate_zone', 'in.iso_rto_region', 'applicability', 'weight', 'in.sqft', 'in.ahs_region', 'in.applicable', 'in.bathroom_spot_vent_hour', 'in.bedrooms', 'in.cec_climate_zone', 'in.ceiling_fan', 'in.census_division', 'in.census_division_recs', 'in.census_region', 'in.clothes_dryer', 'in.clothes_washer', 'in.clothes_washer_presence', 'in.cooking_range', 'in.cooling_setpoint', 'in.cooling_setpoint_has_offset', 'in.cooling_setpoint_offset_magnitude', 'in.cooling_setpoint_offset_period', 'in.corridor', 'in.dehumidifier', 'in.dishwasher', 'in.door_area', 'in.doors', 'in.ducts', 'in.eaves', 'in.electric_vehicle', 'in.geometry_attic_type', 'in.geometry_building_horizontal_location_mf', 'in.geometry_building_horizontal_location_sfa', 'in.geometry_building_level_mf', 'in.geometry_building_number_units_mf', 'in.geometry_building_number_units_sfa', 'in.geometry_

Unnamed: 0.1,Unnamed: 0,bldg_id,in.county,in.puma,in.ashrae_iecc_climate_zone_2004,in.building_america_climate_zone,in.iso_rto_region,applicability,weight,in.sqft,...,in.door_area_ft_2,in.duct_unconditioned_surface_area_ft_2,in.floor_area_attic_ft_2,in.floor_area_conditioned_ft_2,in.floor_area_lighting_ft_2,in.roof_area_ft_2,in.wall_area_above_grade_conditioned_ft_2,in.wall_area_above_grade_exterior_ft_2,in.wall_area_below_grade_ft_2,in.window_area_ft_2
0,26960,26970,G0100130,G01002300,3A,Hot-Humid,,True,242.131013,1220.0,...,20.0,0.0,1220.0,1220.0,1220.0,1364.0,1166.33,1335.78,583.17,69.98
1,46296,46311,G0100130,G01002300,3A,Hot-Humid,,True,242.131013,1220.0,...,20.0,390.4,1220.0,1220.0,1220.0,1364.0,1166.33,1335.78,1166.33,209.94
2,59005,59022,G0100130,G01002300,3A,Hot-Humid,,True,242.131013,2663.0,...,20.0,738.98,1619.5,2663.0,3239.0,1810.66,2628.93,3077.86,465.23,421.37
3,68441,68462,G0100130,G01002300,3A,Hot-Humid,,True,242.131013,1690.0,...,20.0,540.8,1690.0,1690.0,2266.0,2533.47,1485.1,2023.82,0.0,359.13
4,72310,72331,G0100130,G01002300,3A,Hot-Humid,,True,242.131013,1690.0,...,20.0,540.8,1690.0,1690.0,1690.0,1889.48,1372.73,1607.45,0.0,205.91



Basic statistics:


Unnamed: 0.1,Unnamed: 0,bldg_id,weight,in.sqft,in.bedrooms,in.cec_climate_zone,in.dehumidifier,in.electric_vehicle,in.geometry_building_number_units_mf,in.geometry_building_number_units_sfa,...,in.door_area_ft_2,in.duct_unconditioned_surface_area_ft_2,in.floor_area_attic_ft_2,in.floor_area_conditioned_ft_2,in.floor_area_lighting_ft_2,in.roof_area_ft_2,in.wall_area_above_grade_conditioned_ft_2,in.wall_area_above_grade_exterior_ft_2,in.wall_area_below_grade_ft_2,in.window_area_ft_2
count,9.0,9.0,9.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,69876.888889,69898.555556,242.131013,1606.0,3.222222,,,,,,...,20.0,414.588889,1490.055556,1606.0,1798.0,1809.042222,1446.875556,1746.273333,246.081111,216.217778
std,22454.990316,22461.601992,0.0,551.151749,0.833333,,,,,,...,0.0,272.489858,386.019951,551.151749,750.408056,582.139426,482.146128,598.096886,414.102743,109.569138
min,26960.0,26970.0,242.131013,885.0,2.0,,,,,,...,20.0,0.0,885.0,885.0,885.0,989.46,993.38,1116.29,0.0,69.98
25%,59005.0,59022.0,242.131013,1220.0,3.0,,,,,,...,20.0,283.2,1220.0,1220.0,1220.0,1364.0,1166.33,1335.78,0.0,149.01
50%,72310.0,72331.0,242.131013,1690.0,3.0,,,,,,...,20.0,540.8,1619.5,1690.0,1690.0,1810.66,1372.73,1607.45,0.0,205.91
75%,88957.0,88984.0,242.131013,1690.0,3.0,,,,,,...,20.0,540.8,1690.0,1690.0,2266.0,2432.84,1485.1,2023.82,465.23,215.48
max,94018.0,94048.0,242.131013,2663.0,5.0,,,,,,...,20.0,738.98,2176.0,2663.0,3239.0,2533.47,2628.93,3077.86,1166.33,421.37
