# 0 Data Cleaning

In [1]:
from paths import PM
from ml_tools.utilities import load_dataframe, save_dataframe
import polars as pl

Load dataframe

In [2]:
# Declare the type up front so editors/type checkers treat `df` as a Polars DataFrame.
df: pl.DataFrame 
# load_dataframe returns a 2-tuple; only the first element (the frame) is used in this notebook.
# NOTE(review): all_strings=True presumably loads every column as a string — confirm in ml_tools.
df, _ = load_dataframe(df_path=PM["raw data"], kind="polars", all_strings=True) # type: ignore


🐉2025-09-25 14:51 - ml_tools - INFO - 💾 Loaded POLARS dataset: 'epoxy_raw_data' with shape: (3397, 18)


Fill in missing molecular weight values

In [None]:
def clean_molecular_weight(df: pl.DataFrame) -> tuple[pl.DataFrame, pl.DataFrame]:
    """
    Clean the '分子量' (molecular weight) column and drop the '环氧' (epoxy) column.

    The logic is as follows:
    1.  If a '分子量' value contains a number, the integer part of the first
        number found in the text is kept (e.g. "392.16" -> "392"). The pattern
        is not anchored, so the number does not have to be at the very start.
    2.  For all other rows (null, or no digits at all) the value is filled with
        the most common (mode) integer part observed for that row's '环氧'
        type among the parsable rows. When several values tie for the mode,
        the lexicographically smallest string is chosen so that repeated runs
        produce the same result.
    3.  Rows whose epoxy type has no parsable row at all keep a null '分子量'.

    Args:
        df: The input Polars DataFrame with string-typed '分子量' and '环氧'
            columns. It is not modified.

    Returns:
        A tuple ``(df_filled, epoxy_mw_map)``:

        * ``df_filled`` - ``df`` without the '环氧' column and with the cleaned
          '分子量' column (still strings) placed last.
        * ``epoxy_mw_map`` - one row per epoxy type that has at least one
          parsable row, with columns '环氧' and 'mw_mode_value'.

    Side effects:
        Prints the shape of ``df_filled``.
    """
    # Capture group 1 is the integer part of the first number in the text.
    numeric_pattern = r"(\d+)\.?\d*"
    mw_integer_part = pl.col("分子量").str.extract(numeric_pattern, 1)

    # --- Step 1: Create a mapping from epoxy type to its default molecular weight ---
    # Decimals are ignored when finding the mode (e.g., 392.16 -> 392).
    # NOTE(review): the saved notebook output shows this column as 'mw_fill_value';
    # re-run the notebook and confirm which name downstream consumers expect.
    epoxy_mw_map = (
        df.with_columns(mw_integer_part.alias("mw_numeric"))
        .filter(pl.col("mw_numeric").is_not_null())
        # maintain_order keeps the map's row order stable between runs.
        .group_by("环氧", maintain_order=True)
        # mode() may return several tied values in unspecified order; sorting
        # before first() makes the chosen fill value deterministic.
        .agg(pl.col("mw_numeric").mode().sort().first().alias("mw_mode_value"))
    )

    # --- Step 2: Apply the cleaning logic to the '分子量' column ---

    # Left join so every original row is kept, with a null fill value where the
    # epoxy type has no entry in the map.
    df_with_map = df.join(epoxy_mw_map, on="环氧", how="left")

    # Prefer the number extracted from the row itself; fall back to the
    # per-epoxy mode when nothing could be extracted.
    df_filled = (
        df_with_map.with_columns(
            pl.coalesce(mw_integer_part, pl.col("mw_mode_value")).alias("分子量_filled")
        )
        .drop(["分子量", "环氧", "mw_mode_value"])
        .rename({"分子量_filled": "分子量"})
    )
    print(df_filled.shape)

    return df_filled, epoxy_mw_map

In [4]:
# Returns the cleaned frame plus the per-epoxy-type table used to fill missing molecular weights.
clean_df, epoxy_map = clean_molecular_weight(df=df)

(3397, 17)


Filter out rows whose molecular weight is still missing

In [5]:
# Keep only the rows that ended up with a molecular weight (extracted or filled).
has_weight = ~pl.col("分子量").is_null()
df_filtered = clean_df.filter(has_weight)
df_filtered.shape

(2948, 17)

Drop molecular-weight groups with 2 or fewer samples

In [6]:
# Size of each molecular-weight group, broadcast back onto every row of that group.
rows_per_weight = pl.col("分子量").count().over("分子量")

# Keep only rows whose molecular weight occurs more than twice.
df_filtered_final = df_filtered.filter(rows_per_weight > 2)
df_filtered_final.shape

(2938, 17)

Check value counts for categories

In [7]:
# Frequency of each remaining molecular-weight value, most common first.
weight_column = df_filtered_final["分子量"]
weight_column.value_counts(sort=True, name="Molecular Weight Categories")

分子量,Molecular Weight Categories
str,u32
"""392""",1445
"""454""",632
"""340""",355
"""370""",84
"""1667""",24
…,…
"""220""",4
"""465""",4
"""550""",4
"""438""",3


Save dataframe

In [8]:
# Persist the filtered frame; the target directory and file name both come from the "clean data" entry of PM.
save_dataframe(df=df_filtered_final, save_dir=PM["clean data"].parent, filename=PM["clean data"].name)


🐉2025-09-25 14:51 - ml_tools - INFO - ✅ Saved dataset: 'cleaned_data.csv' with shape: (2938, 17)


Check mapping

In [9]:
# Display the epoxy-type -> molecular-weight fill table returned by clean_molecular_weight.
epoxy_map

环氧,mw_fill_value
str,str
"""双酚A型环氧树脂""","""483"""
"""耐高温环氧树脂 BH140""","""422"""
"""环氧树脂E51""","""340"""
"""CYD-128（E51）""","""392"""
"""环氧树脂 E44""","""454"""
…,…
"""双酚 A 环氧树脂（DGEBA）""","""392"""
"""双酚S四缩水甘油醚(SEP)""","""392"""
"""双酚Ａ型环氧树脂（DGEBA）""","""340"""
"""柚皮素环氧树脂""","""483"""


Save molecular weight map

In [10]:
# Persist the fill table; the target directory and file name both come from the "epoxy map" entry of PM.
save_dataframe(df=epoxy_map, save_dir=PM["epoxy map"].parent, filename=PM["epoxy map"].name)


🐉2025-09-25 14:51 - ml_tools - INFO - ✅ Saved dataset: 'epoxy_mw_map.csv' with shape: (86, 2)
