In [None]:
import json
import os
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Raw longitudinal table; the NaN views built in later cells are derived from `df`.
df = pd.read_csv(filepath_or_buffer="/content/cbio_longitudinal_v2.csv")

In [None]:
# Keep only the rows in which every column is populated (no NaN anywhere).
df_no_nan = df[df.notna().all(axis=1)]
df_no_nan

In [None]:
# Complement of the complete-row view: rows holding at least one NaN.
df_with_nan = df.loc[df.isnull().any(axis="columns")]
df_with_nan

In [None]:
# Names of the columns that contain at least one NaN in the raw frame.
cols_with_nan = [
    col_name
    for col_name, has_nan in zip(df_with_nan.columns, df.isna().any())
    if has_nan
]
print(cols_with_nan)

In [None]:
def csv_to_qwen_format(
    csv_path: str,
    target_columns = None,
    output_dir: str = "qwen_data",
    mask_ratio: float = 0.3,
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
    test_ratio: float = 0.15,
    system_prompt: str = "You are a data imputation assistant that predicts missing values based on available features.",
    random_seed: int = 42
):
    """Turn a CSV into chat-style JSONL splits for masked-value imputation.

    Rows containing any NaN are dropped. For every remaining row, a random
    subset of the target columns is replaced by ``[MASK]`` in the user
    message, and the assistant message carries the true values of the masked
    columns. Examples are split in file order (no shuffling) and written to
    ``<output_dir>/train.jsonl``, ``val.jsonl`` and ``test.jsonl``.

    Args:
        csv_path: Path of the CSV file to read.
        target_columns: Column names eligible for masking. A single string is
            treated as one column name. ``None`` makes every column eligible.
            Names that are not present in the CSV are ignored; if none is
            present, no examples are produced.
        output_dir: Directory that receives the JSONL files; it is created if
            it does not exist. An empty string means the current directory.
        mask_ratio: Fraction of the available target columns masked per row.
            At least one column is always masked and never more than are
            available.
        train_ratio: Fraction of the examples placed in the train split.
        val_ratio: Fraction of the examples placed in the validation split.
        test_ratio: Not used in the computation; the test split receives
            whatever remains after the train and validation splits.
        system_prompt: Content of the system message of every example.
        random_seed: Seed for the column-masking choice.

    Returns:
        dict with keys ``'train'``, ``'val'`` and ``'test'``, each mapping to
        a list of ``{"messages": [...]}`` examples.
    """
    # A private generator yields the same stream as random.seed(random_seed)
    # followed by random.sample(...), without reseeding the process-wide RNG.
    rng = random.Random(random_seed)

    # Only complete rows are used, so every masked cell has a ground-truth value.
    df = pd.read_csv(csv_path).dropna()
    all_columns = list(df.columns)

    if target_columns is None:
        requested = all_columns
    elif isinstance(target_columns, str):
        # Iterating a bare string would yield single characters.
        requested = [target_columns]
    else:
        requested = list(target_columns)

    # Column roles are the same for every row, so resolve them once.
    present = set(all_columns)
    requested_lookup = set(requested)
    target_cols_available = [col for col in requested if col in present]
    feature_cols = [col for col in all_columns if col not in requested_lookup]

    training_examples = []

    if target_cols_available:
        # At least one masked column, never more than are available
        # (random sampling raises when asked for more items than exist).
        n_mask = min(
            len(target_cols_available),
            max(1, int(len(target_cols_available) * mask_ratio)),
        )

        # to_dict(orient="records") keeps each column's own dtype, whereas
        # iterrows() upcasts a row to one dtype (ints would print as "1.0").
        for row_dict in df.to_dict(orient="records"):
            cols_to_mask = set(rng.sample(target_cols_available, n_mask))

            context_parts = [f"{col}: {row_dict[col]}" for col in feature_cols]
            target_parts = []

            for col in target_cols_available:
                if col in cols_to_mask:
                    context_parts.append(f"{col}: [MASK]")
                    target_parts.append(f"{col}: {row_dict[col]}")
                else:
                    context_parts.append(f"{col}: {row_dict[col]}")

            user_content = f"Complete the missing values: {', '.join(context_parts)}"
            assistant_content = ", ".join(target_parts)

            training_examples.append({
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": assistant_content}
                ]
            })

    n = len(training_examples)
    train_end = int(n * train_ratio)
    val_end = train_end + int(n * val_ratio)

    # The test split is the remainder, so no example is lost to rounding.
    splits = {
        'train': training_examples[:train_end],
        'val': training_examples[train_end:val_end],
        'test': training_examples[val_end:]
    }

    output_dir = output_dir or "."
    os.makedirs(output_dir, exist_ok=True)

    for split_name, split_data in splits.items():
        output_file = os.path.join(output_dir, f"{split_name}.jsonl")

        with open(output_file, 'w', encoding='utf-8') as f:
            for example in split_data:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')

        print(f"   ✅ {split_name:5s}: {len(split_data):5d} exemplos → {output_file}")

    return splits

In [None]:
# Build the train/val/test JSONL files, masking the columns that contain NaNs
# in the raw data (`cols_with_nan`).
qwen_splits = csv_to_qwen_format(
    csv_path="/content/cbio_longitudinal_v2.csv",
    target_columns=cols_with_nan,
    output_dir="qwen_data",
    mask_ratio=0.3,
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
    random_seed=42,
)

# Show only the split sizes: echoing the returned dict would dump every
# generated example into the cell output.
{split_name: len(examples) for split_name, examples in qwen_splits.items()}