# Windspeed Dataset Combiner

Python Version: 3.13.5

In [1]:
import csv

def read_csv_rows(absFilePath: str) -> list[list[str]]:
    """Read every record of a CSV file into memory.

    Args:
        absFilePath: Path of the CSV file to read.

    Returns:
        One list of string fields per CSV record, in file order. Blank
        lines in the file come back as empty lists.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; otherwise "\r\n" inside quoted fields is
    # rewritten to "\n" before the reader sees it.
    with open(absFilePath, mode="r", newline="") as f:
        # The with-statement closes the file; no explicit close() is needed.
        return list(csv.reader(f))

In [2]:
import os

# Build a map from location code to the path of the raw CSV file for that
# location, scanning the "rawData" folder under the current working directory.
rawDataDirectory = os.path.join(os.getcwd(), "rawData")

locationCodeTofilePathMap: dict[str, str] = {}

# os.listdir returns entries in arbitrary order; sorting makes the duplicate
# handling below (first file wins) and the resulting map order reproducible.
for fileName in sorted(os.listdir(rawDataDirectory)):
    filePath = os.path.join(rawDataDirectory, fileName)
    if not os.path.isfile(filePath):
        print(f"Warning: non-file found in rawData path ({filePath}) ")
        continue

    # File names are assumed to look like daily_***_WSPD_ALL.csv, where *** is
    # the location code, i.e. the text between the first two underscores.
    nameParts = fileName.split("_", 2)
    if len(nameParts) < 3:
        # Fewer than two underscores: the name does not follow the pattern.
        print(f"Warning: file name not as expected ({fileName}), skipping file")
        continue

    locationCode = nameParts[1]
    if locationCode in locationCodeTofilePathMap:
        print(f"Warning: duplicate location code found ({locationCode}), skipping file")
        continue
    locationCodeTofilePathMap[locationCode] = filePath

In [3]:
# import json
# print(json.dumps(locationCodeTofilePathMap, indent = 2))

In [4]:
from typing import Literal, Union

from pydantic import BaseModel

class WindSpeedDatapoint(BaseModel):
    """One wind speed record for a given date and location."""

    # Date of the record, split into its three components.
    year: int
    month: int
    day: int

    # Identity of the location the record belongs to.
    locationName: str
    locationCode: str

    # Either a number or one of the placeholder strings "***" / "".
    # NOTE(review): the placeholders presumably mark missing readings - confirm.
    value: float | Literal["***", ""]

    # One of the flags "C", "#" or "".
    # NOTE(review): the meaning of each flag is not defined in this file - confirm.
    completeness: Literal["C", "#", ""]

In [5]:
from typing import Literal, Union

from pydantic import BaseModel

class PartialWindSpeedDatapoint(BaseModel):
    """The fields of a wind speed record that come from a single CSV row.

    It has no location fields, unlike WindSpeedDatapoint.
    """

    # Date of the record, split into its three components.
    year: int
    month: int
    day: int

    # Either a number or one of the placeholder strings "***" / "".
    # NOTE(review): the placeholders presumably mark missing readings - confirm.
    value: float | Literal["***", ""]

    # One of the flags "C", "#" or "".
    # NOTE(review): the meaning of each flag is not defined in this file - confirm.
    completeness: Literal["C", "#", ""]

def load_partial_datapoint_from_row(row: list[str]) -> PartialWindSpeedDatapoint:
    """Build a PartialWindSpeedDatapoint from the first five cells of a CSV row.

    The cells are read by position as year, month, day, value and
    completeness, and handed to the model unchanged (as strings).

    Raises:
        IndexError: If *row* has fewer than five cells.
    """
    field_values = {
        "year": row[0],
        "month": row[1],
        "day": row[2],
        "value": row[3],
        "completeness": row[4],
    }
    return PartialWindSpeedDatapoint(**field_values)

def extractLocationNameFromLocationRow(row: list[str]) -> str:
    """Return the location name held in the first cell of *row*.

    The first 25 characters of that cell are discarded and the remainder is
    stripped of leading and trailing whitespace.

    Raises:
        IndexError: If *row* is empty.
    """
    # NOTE(review): 25 is presumably the width of a fixed label that precedes
    # the name in the source files - confirm against the raw data.
    name_start = 25
    first_cell = row[0]
    name_with_padding = first_cell[name_start:]
    return name_with_padding.strip()

def load_datapoint_list_from_rows(rows: list[list[str]], locationCode: str)->list[WindSpeedDatapoint]:
    """Convert the rows of one location's CSV file into WindSpeedDatapoints.

    The location name is taken from the second row (index 1) and data rows
    start at the fourth row (index 3). Data rows with fewer than five cells
    are skipped.

    Args:
        rows: All rows of the CSV file, in file order.
        locationCode: Location code to attach to every datapoint.

    Returns:
        One WindSpeedDatapoint per usable data row, in file order. The list
        is empty when the file has no data rows.
    """
    data_rows = rows[3:]
    if not data_rows:
        # No data rows means no datapoints, so the location row is not
        # needed. Returning here keeps an empty or truncated file from
        # raising IndexError on rows[1].
        return []

    locationName = extractLocationNameFromLocationRow(rows[1])

    windspeed_datapoint_list: list[WindSpeedDatapoint] = []
    for row in data_rows:
        if len(row) < 5:
            # Too short to hold year, month, day, value and completeness.
            continue
        partial_datapoint = load_partial_datapoint_from_row(row)
        windspeed_datapoint_list.append(
            WindSpeedDatapoint(
                year=partial_datapoint.year,
                month=partial_datapoint.month,
                day=partial_datapoint.day,
                locationName=locationName,
                locationCode=locationCode,
                value=partial_datapoint.value,
                completeness=partial_datapoint.completeness,
            )
        )

    return windspeed_datapoint_list

In [6]:
# Load each location's CSV file and gather all of its datapoints into a
# single list, following the iteration order of the location map.
combined_datapoint_list: list[WindSpeedDatapoint] = []

for code, csvPath in locationCodeTofilePathMap.items():
    combined_datapoint_list.extend(
        load_datapoint_list_from_rows(read_csv_rows(csvPath), code)
    )

In [7]:
def save_datapoint_list(datapoint_list: list[BaseModel], absFilePath: str):
    """Write datapoints to a file as JSON Lines, one JSON object per line.

    An existing file at *absFilePath* is overwritten. The parent directory
    is created if it does not exist yet.

    Args:
        datapoint_list: Models to serialise, written in list order.
        absFilePath: Path of the output file.
    """
    # Imported locally so the function does not depend on which other
    # notebook cells have already been run.
    import os

    parent_dir = os.path.dirname(absFilePath)
    if parent_dir:  # empty when absFilePath is a bare file name
        os.makedirs(parent_dir, exist_ok=True)

    # JSON is written as UTF-8 explicitly so the output does not depend on
    # the platform's locale encoding.
    with open(absFilePath, mode="w", encoding="utf-8") as f:
        # The with-statement closes the file; no explicit close() is needed.
        f.writelines(
            f"{datapoint.model_dump_json()}\n" for datapoint in datapoint_list
        )

In [8]:
import os

# Write the combined datapoints to cleaned/combinedWindSpeed.jsonl under the
# current working directory.
outputDirectory = os.path.join(os.getcwd(), "cleaned")
saveFilePath = os.path.join(outputDirectory, "combinedWindSpeed.jsonl")
save_datapoint_list(combined_datapoint_list, absFilePath=saveFilePath)