In [None]:
import pandas as pd

# Read the raw timesheet export (the 'Date' column is parsed as datetimes),
# then show the frame's structure and its first rows.
raw = pd.read_csv(
    "../data/timesheet.csv",
    parse_dates=["Date"],
)
raw.info()
raw.head()

In [None]:
# Defining a function to parse the hours by splitting up the string object
def parse_hours(hstr):
    """Convert a duration string such as '8 h 44 m' into decimal hours.

    The string is split on whitespace; the first token is read as whole
    hours and the third token as whole minutes, so '8 h 44 m' becomes
    8 + 44/60 = 8.7333... and '11 h 0 m' becomes 11.0. Leading and
    trailing whitespace is ignored.

    Parameters
    ----------
    hstr : str or missing value
        Duration text in the '<hours> h <minutes> m' layout, or a value
        for which ``pd.isna`` is true (None, NaN, pd.NA, ...).

    Returns
    -------
    float
        Hours as a decimal number. Missing input yields a float NaN
        rather than ``pd.NA`` so that a column produced by applying this
        function keeps a numeric float dtype instead of falling back to
        object dtype.

    Raises
    ------
    ValueError
        If the string has fewer than three whitespace-separated tokens,
        or if the hour/minute tokens are not integers.
    """
    if pd.isna(hstr):
        # A float NaN is still detected by isna()/dropna() and keeps the
        # resulting column numeric.
        return float("nan")
    # split() with no arguments already discards surrounding whitespace.
    parts = hstr.split()
    if len(parts) < 3:
        # Report the offending value instead of a bare IndexError.
        raise ValueError(
            f"cannot parse hours from {hstr!r}; expected '<H> h <M> m'"
        )
    hours = int(parts[0])
    minutes = int(parts[2])
    return hours + minutes / 60

# Add a 'hours_worked' column by running parse_hours on each 'Total hrs' value.
raw["hours_worked"] = raw["Total hrs"].map(parse_hours)

In [None]:
# Rename & filter
# Rename 'Name' to 'employee_name', keep only rows whose hours_worked is
# greater than zero, then drop any rows where hours_worked is missing.
df = (
    raw.rename(columns={"Name": "employee_name"})
    .pipe(lambda frame: frame[frame["hours_worked"] > 0])  # drop <= 0
    .dropna(subset=["hours_worked"])  # drop NA hours
)

In [None]:
# Checking my work
# Summary statistics for the parsed hours column.
print(df.loc[:, "hours_worked"].describe())
# Missing-value count per column, largest first, limited to ten columns.
null_counts = df.isnull().sum()
print(null_counts.sort_values(ascending=False).iloc[:10])

In [None]:
# Save to Parquet
# Write the cleaned frame next to the source CSV; the index is not written.
df.to_parquet(
    "../data/timesheet_clean.parquet",
    index=False,
)