-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_to_parquet.py
34 lines (27 loc) · 925 Bytes
/
1_to_parquet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#%%
import polars as pl
import polars.selectors as cs
from pathlib import Path
# Convert each pipe-delimited .txt file under ../_data into a Parquet file
# under ./out. Files whose name contains "Alias" are skipped; the file whose
# name contains "CaseData" additionally has the defendant name replaced by a
# hashed identifier and the date of birth coarsened to the month.
out_dir = Path("out")
out_dir.mkdir(exist_ok=True)

# sorted() gives a reproducible processing/log order; glob() alone yields
# entries in arbitrary filesystem order.
data_paths = sorted(Path("../_data").glob("*.txt"))

for path in data_paths:
    print(path)
    if "Alias" in path.name:
        continue

    # note that I deleted an unescaped | in a text field
    # on line 1965440 of CPCMS_SentenceData
    #
    # infer_schema_length=None asks polars to scan the full file when
    # inferring column dtypes (slow, but avoids guessing from a prefix).
    df = pl.read_csv(path, separator="|", infer_schema_length=None)

    if "CaseData" in path.name:
        # Both expressions below are evaluated against the columns as read
        # from the file, so DefendantID is built from the original string
        # DefendantDOB, not from the truncated date.
        #
        # NOTE(review): polars documents Expr.hash() as stable only within a
        # single polars version -- regenerate all outputs together if
        # DefendantID has to match across files or runs.
        # NOTE(review): if DefendantName or DefendantDOB is null the
        # concatenation is null, so such rows presumably all end up with the
        # same DefendantID -- confirm whether that matters downstream.
        res = (
            df
            .with_columns(
                # hash defendant name and date of birth
                DefendantID=(
                    pl.col("DefendantName") + "---" + pl.col("DefendantDOB")
                ).hash(),
                # parse the DOB string and round it down to the first day of
                # its month
                DefendantDOB=pl.col("DefendantDOB").str.to_date().dt.truncate("1mo"),
            )
            # drop the raw name so it is not written to the output
            .select(cs.all().exclude("DefendantName"))
        )
    else:
        res = df

    # <input stem>.parquet inside the output directory
    res.write_parquet(out_dir / path.with_suffix(".parquet").name)
# %%