# Feature engineering

In [5]:
import pandas as pd
import numpy as np

def quantile_normalize_across(df_list, meta_cols):
    """
    Quantile-normalize the feature columns of several DataFrames jointly.

    For every feature column, the values of each DataFrame are sorted, the
    sorted vectors are averaged rank-by-rank across the DataFrames to build a
    reference distribution, and each original value is replaced by the
    reference value at its rank. Values that are tied within one DataFrame
    all receive the mean of the reference values covered by their ranks, so
    equal inputs always map to equal outputs.

    NaN values are not treated specially: they sort last and propagate into
    the reference distribution.

    Parameters
    ----------
    df_list : sequence of pandas.DataFrame
        Tables to normalize. All must have the same number of rows and must
        contain the feature columns of the first table.
    meta_cols : list of str
        Columns copied through unchanged. Every column of ``df_list[0]`` that
        is not listed here is treated as a numeric feature.

    Returns
    -------
    list of pandas.DataFrame
        One table per input, in input order, holding ``meta_cols`` followed by
        the normalized feature columns (float64) on a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df_list`` is empty or the tables differ in row count.
    """
    if len(df_list) == 0:
        raise ValueError("❌ df_list must contain at least one DataFrame.")

    # Every non-meta column of the first table is a feature.
    feature_names = [c for c in df_list[0].columns if c not in meta_cols]
    n_features = len(feature_names)

    # Rank-wise averaging only makes sense when all tables have equal length.
    n_rows = [len(df) for df in df_list]
    if len(set(n_rows)) != 1:
        raise ValueError(f"❌ All DataFrames must have the same number of rows. Got: {n_rows}")

    n = n_rows[0]
    print(f"🧩 Performing quantile normalization on {len(df_list)} datasets, each with {n} rows and {n_features} features")

    # Work in float so that averaged reference values are never truncated
    # back to an integer (or lower-precision) input dtype.
    features = [df[feature_names].to_numpy(dtype=float) for df in df_list]

    # Reference distribution: mean over tables of the column-wise sorted values.
    reference = np.zeros((n, n_features))
    for X in features:
        reference += np.sort(X, axis=0)
    reference /= len(features)

    def apply_quantile_norm(X, reference):
        """Map each column of X onto the reference distribution by rank."""
        X_norm = np.empty(X.shape, dtype=float)
        for j in range(X.shape[1]):
            # Stable sort keeps tie handling deterministic.
            order = np.argsort(X[:, j], kind="stable")
            sorted_vals = X[order, j]
            # Group equal values and give each group the mean of the
            # reference values spanned by its ranks.
            _, group, counts = np.unique(
                sorted_vals, return_inverse=True, return_counts=True
            )
            group_sums = np.bincount(
                group, weights=reference[:, j], minlength=len(counts)
            )
            X_norm[order, j] = (group_sums / counts)[group]
        return X_norm

    normalized = [apply_quantile_norm(X, reference) for X in features]

    # Re-attach the untouched meta columns in front of the normalized features.
    dfs_qn = []
    for df, norm_values in zip(df_list, normalized):
        df_qn = pd.concat([
            df[meta_cols].reset_index(drop=True),
            pd.DataFrame(norm_values, columns=feature_names)
        ], axis=1)
        dfs_qn.append(df_qn)

    return dfs_qn


In [6]:
# Descriptive (non-feature) columns; they are carried through unchanged.
meta_cols = [
    "gene_name", "chr", "gene_start", "gene_end",
    "TSS_start", "TSS_end", "strand", "gex", "gex_rank",
]

# Load the two pruned feature tables (tab-separated).
X1 = pd.read_csv(
    "../preprocessed_data/CAGE-merged/4. features selection/X1_all_rank_features_pruned.tsv",
    sep="\t",
)
X2 = pd.read_csv(
    "../preprocessed_data/CAGE-merged/4. features selection/X2_all_rank_features_pruned.tsv",
    sep="\t",
)

# Normalize both tables against a shared reference distribution.
X1_qn, X2_qn = quantile_normalize_across([X1, X2], meta_cols)

# Write each normalized table next to its input, without the index column.
for table, out_path in (
    (X1_qn, "../preprocessed_data/CAGE-merged/4. features selection/X1_all_rank_features_pruned_qn.tsv"),
    (X2_qn, "../preprocessed_data/CAGE-merged/4. features selection/X2_all_rank_features_pruned_qn.tsv"),
):
    table.to_csv(out_path, sep="\t", index=False)


🧩 Performing quantile normalization on 2 datasets, each with 16284 rows and 277 features


In [9]:
# Show the normalized values of one feature in X2, sorted ascending, so the
# resulting distribution can be compared with the same feature in X1.
X2_qn.loc[:, "DNase_gene_z_std"].sort_values()

8043     0.000000
3951     0.000000
5433     0.000000
2617     0.000000
4531     0.000000
           ...   
15661    5.572742
13669    5.928350
9566     6.239169
3391     6.800296
8552     9.069464
Name: DNase_gene_z_std, Length: 16284, dtype: float64

In [10]:
# Show the normalized values of one feature in X1, sorted ascending, so the
# resulting distribution can be compared with the same feature in X2.
X1_qn.loc[:, "DNase_gene_z_std"].sort_values()


7208     0.000000
4569     0.000000
1220     0.000000
14020    0.000000
14015    0.000000
           ...   
6926     5.572742
14936    5.928350
5802     6.239169
15046    6.800296
998      9.069464
Name: DNase_gene_z_std, Length: 16284, dtype: float64