In [1]:
import gzip
import re
import sys
import os
import pandas as pd

In [2]:
def add_project_root_to_sys_path(target_file="config.py"):
    current_path = os.getcwd()
    while True:
        if target_file in os.listdir(current_path):
            # 找到包含 config.py 的目录，加入 sys.path
            if current_path not in sys.path:
                sys.path.append(current_path)
            break
        else:
            # 向上一级目录查找
            parent_path = os.path.dirname(current_path)
            if parent_path == current_path:
                # 到达根目录还没找到，停止
                raise FileNotFoundError(f"未找到包含 {target_file} 的目录")
            current_path = parent_path

add_project_root_to_sys_path()

In [3]:
from config import breast_expression_gene, breast_data_path, breast_raw_data_path

In [4]:
label_path = breast_raw_data_path + '\GSE70947_series_matrix.txt.gz'

expr_gene = pd.read_csv(breast_expression_gene, index_col=0)

  label_path = breast_raw_data_path + '\GSE70947_series_matrix.txt.gz'


In [5]:
# 读取 series matrix
with gzip.open(label_path, "rt", encoding="utf-8") as f:
    lines = f.readlines()
    
# 提取 GSM 样本ID
for line in lines:
    if line.startswith("!Sample_geo_accession"):
        gsm_ids = line.strip().split("\t")[1:]
        break

# 提取 tissue 标签
for line in lines:
    if line.startswith("!Sample_characteristics_ch1") and "tissue" in line:
        tissue_labels = [re.sub(r"tissue: ?", "", x) for x in line.strip().split("\t")[1:]]
        break

In [6]:
# 标签信息表
sample_info = pd.DataFrame({"Class": tissue_labels}, index=gsm_ids)
print(sample_info.head())

                 Class
"GSM1823702"  "normal"
"GSM1823703"  "normal"
"GSM1823704"  "normal"
"GSM1823705"  "normal"
"GSM1823706"  "normal"


In [7]:
print("表达矩阵样本数：", expr_gene.shape[0])
print("标签表样本数：", sample_info.shape[0])
print("交集样本数：", len(set(expr_gene.index) & set(sample_info.index)))

表达矩阵样本数： 296
标签表样本数： 296
交集样本数： 0


In [8]:
print("表达矩阵前10个样本名：", expr_gene.index[:10].tolist())
print("标签表前10个样本名：", sample_info.index[:10].tolist())

表达矩阵前10个样本名： ['GSM1823702', 'GSM1823703', 'GSM1823704', 'GSM1823705', 'GSM1823706', 'GSM1823707', 'GSM1823708', 'GSM1823709', 'GSM1823710', 'GSM1823711']
标签表前10个样本名： ['"GSM1823702"', '"GSM1823703"', '"GSM1823704"', '"GSM1823705"', '"GSM1823706"', '"GSM1823707"', '"GSM1823708"', '"GSM1823709"', '"GSM1823710"', '"GSM1823711"']


In [9]:
sample_info.index = sample_info.index.str.strip('"')

In [10]:
# 顺序和表达矩阵一致
sample_info = sample_info.reindex(expr_gene.index)
# 合并
expr_gene["Class"] = sample_info["Class"]

In [11]:
print(expr_gene[["Class"]].head(10))

               Class
GSM1823702  "normal"
GSM1823703  "normal"
GSM1823704  "normal"
GSM1823705  "normal"
GSM1823706  "normal"
GSM1823707  "normal"
GSM1823708  "normal"
GSM1823709  "normal"
GSM1823710  "normal"
GSM1823711  "normal"


In [12]:
# (可选) 把 Class 列放在第一列
cols = ["Class"] + [col for col in expr_gene.columns if col != "Class"]
expr_gene = expr_gene[cols]

In [14]:
# 保存
expr_gene.to_csv(breast_data_path, encoding="utf_8_sig")
print(f"完成！含 Class 的新表达矩阵已保存：{breast_data_path}")

完成！含 Class 的新表达矩阵已保存：C:\Users\26494\GA\data\Breast_Cancer/Breast_Cancer.csv
