docs: add basic tutorial

Aarhus-Psychiatry-Research · Dec 2, 2022 · 8136a1b · 8136a1b
1 parent c569b74
commit 8136a1b
Show file tree

Hide file tree

Showing 10 changed files with 231,311 additions and 40,023 deletions.
diff --git a/src/timeseriesflattener/testing/load_synth_data.py b/src/timeseriesflattener/testing/load_synth_data.py
@@ -41,6 +41,21 @@ def synth_predictor_float(
     return load_raw_test_csv("synth_raw_float_1.csv", n_rows=n_rows)
 
 
+@data_loaders.register("synth_sex")
+def load_synth_sex(
+    n_rows: Optional[int] = None,
+) -> pd.DataFrame:
+    """Load synth sex data.
+
+    Args:
+        n_rows: Number of rows to return. Defaults to None, which returns all rows of the synth sex data.
+
+    Returns:
+        pd.DataFrame
+    """
+    return load_raw_test_csv("synth_sex.csv", n_rows=n_rows)
+
+
 @data_loaders.register("synth_predictor_binary")
 def synth_predictor_binary(
     n_rows: Optional[int] = None,

diff --git a/tests/test_data/raw/create_synth_raw_binary.py b/tests/test_data/raw/create_synth_raw_binary.py
@@ -10,19 +10,23 @@
     # Get project root directory
     project_root = Path(__file__).resolve().parents[3]
 
-    column_specs = {
-        "dw_ek_borger": {
-            "column_type": "uniform_int",
-            "min": 0,
-            "max": 10_000,
+    column_specs = [
+        {
+            "dw_ek_borger": {
+                "column_type": "uniform_int",
+                "min": 0,
+                "max": 10_000,
+            }
         },
-        "timestamp": {
-            "column_type": "datetime_uniform",
-            "min": -5 * 365,
-            "max": 0 * 365,
+        {
+            "timestamp": {
+                "column_type": "datetime_uniform",
+                "min": -5 * 365,
+                "max": 0 * 365,
+            }
         },
-        "value": {"column_type": "uniform_int", "min": 0, "max": 1},
-    }
+        {"value": {"column_type": "uniform_int", "min": 0, "max": 2}},
+    ]
 
     for i in (1, 2):
         df = generate_data_columns(

diff --git a/tests/test_data/raw/create_synth_raw_float.py b/tests/test_data/raw/create_synth_raw_float.py
@@ -9,24 +9,28 @@
 if __name__ == "__main__":
     # Get project root directory
 
-    column_specs = {
-        "dw_ek_borger": {
-            "column_type": "uniform_int",
-            "min": 0,
-            "max": 10_000,
+    column_specs = [
+        {
+            "dw_ek_borger": {
+                "column_type": "uniform_int",
+                "min": 0,
+                "max": 10_000,
+            }
         },
-        "timestamp": {
-            "column_type": "datetime_uniform",
-            "min": -5 * 365,
-            "max": 0 * 365,
+        {
+            "timestamp": {
+                "column_type": "datetime_uniform",
+                "min": -5 * 365,
+                "max": 0 * 365,
+            }
         },
-        "value": {"column_type": "uniform_float", "min": 0, "max": 10},
-    }
+        {"value": {"column_type": "uniform_float", "min": 0, "max": 10}},
+    ]
 
     for i in (1, 2):
         df = generate_data_columns(
             predictors=column_specs,
-            n_samples=10_000,
+            n_samples=100_000,
         )
 
         df.to_csv(

diff --git a/tests/test_data/raw/create_synth_sex.py b/tests/test_data/raw/create_synth_sex.py
@@ -0,0 +1,34 @@
+"""Generate raw synthetic sex dataframe."""
+
+from pathlib import Path
+
+from psycopmlutils.synth_data_generator.synth_col_generators import (
+    generate_data_columns,
+)
+
+if __name__ == "__main__":
+    # Get project root directory
+    project_root = Path(__file__).resolve().parents[3]
+
+    column_specs = [
+        {
+            "dw_ek_borger": {
+                "column_type": "uniform_int",
+                "min": 0,
+                "max": 10_000,
+            }
+        },
+        {"female": {"column_type": "uniform_int", "min": 0, "max": 2}},
+    ]
+
+    df = generate_data_columns(
+        predictors=column_specs,
+        n_samples=100_000,
+    )
+
+    df = df.groupby("dw_ek_borger").last().reset_index()
+
+    df.to_csv(
+        project_root / "tests" / "test_data" / "raw" / "synth_sex.csv",
+        index=False,
+    )