In [1]:
import pandas as pd
from pandas import DataFrame
import os

In [2]:
# Load the tab-separated, GBK-encoded sample sheet.
# dtype=object disables pandas' type inference, so every cell is kept as read.
# on_bad_lines="skip" replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in 2.0; malformed rows are still dropped, not fatal.
df = pd.read_csv(os.path.join("..", "data", "微生态数据.txt"),
                 encoding="gbk",
                 delimiter="\t",
                 on_bad_lines="skip",
                 dtype=object)
print(df.shape)
df.head()

(520, 33)


Unnamed: 0,分中心,报告编号_检测地,姓名,性别,年龄,临床诊断,样本类型,检测项目,谷禾公司编号,收样时间,...,外送公司,外送日期,外送人,快递,快递单号,外送公司登记日期,外送公司编号,外送公司报告日期,备注,备注.1
0,承颉投资,BT1712210363LCRFX,黄彬,女,51,健康体检,粪便,微生态平衡检测（健康筛查）,78419523,2017/12/21,...,谷禾,,王政,圆通速递,,,,,无样本留存,
1,大同煤总中心,BT1712220362LCRFX,张春荣,女,90,乏力（待查）食欲不振胃肠功能紊乱,粪便,肠道菌群微生态平衡检测,17147677,2017/12/22,...,谷禾,,王政,圆通速递,,,,,,
2,廊坊人民中心,BT1712240427LCRFX,田喜茹,女,1900年2月22日,便秘,粪便,肠道菌群微生态平衡检测,36020309,2017/12/24,...,谷禾,,王政,圆通速递,,,,,,
3,南阳二院中心,BT1712240434HCRFX,高翠,女,63岁,undefined,粪便,肠道菌群微生态平衡检测,71395240,2017/12/24,...,谷禾,科研编号：BTC技术2017016,王政,圆通速递,,,,,,
4,南阳二院中心,BT1712240438HCRFX,石长文,男,71,undefined,粪便,肠道菌群微生态平衡检测,59365756,2017/12/24,...,谷禾,科研编号：BTC技术2017016,王政,圆通速递,,,,,,


In [3]:
# Replace the abbreviation "LADA" with its Chinese name in the diagnosis column.
# A boolean mask updates every matching row and is a no-op when nothing matches,
# so re-running this cell no longer raises IndexError (the old .index[0] lookup
# failed as soon as the value had already been replaced).
is_lada = df["临床诊断"] == "LADA"
df.loc[is_lada, "临床诊断"] = "成人隐匿迟发性自身免疫糖尿病"

In [4]:
# Number of patients per distinct diagnosis string; the displayed value is the
# number of distinct diagnoses.
diagnosis = df.loc[:, "临床诊断"].value_counts()
diagnosis.shape[0]

144

In [5]:
# Disease categories, in the order they are reported
types = ["高血压","结直肠癌","糖尿病","肥胖","胸痛胸闷","胆结石","心血管","便秘","胃肠炎","健康","帕金森","癫痫","对照"]

# Keywords searched for in the diagnosis text. Categories that are not listed
# here are matched by their own name only.
custom_keywords = {
    "结直肠癌": ["结肠", "直肠"],
    "胸痛胸闷": ["胸痛", "胸闷"],
    "胆结石": ["结石"],
    "心血管": ["心脏病", "后循环缺血", "血脂", "心力衰竭", "心功能不全",
            "心律失常", "心悸", "心肌梗死", "脑出血"],
    "胃肠炎": ["胃肠炎", "胃肠功能紊乱"],
    "癫痫": ["癫痫", "子痫"],
}

# Map every category to its keyword list, keeping the order of `types`
kw_dict = {disease: custom_keywords.get(disease, [disease]) for disease in types}
kw_dict

{'高血压': ['高血压'],
 '结直肠癌': ['结肠', '直肠'],
 '糖尿病': ['糖尿病'],
 '肥胖': ['肥胖'],
 '胸痛胸闷': ['胸痛', '胸闷'],
 '胆结石': ['结石'],
 '心血管': ['心脏病', '后循环缺血', '血脂', '心力衰竭', '心功能不全', '心律失常', '心悸', '心肌梗死', '脑出血'],
 '便秘': ['便秘'],
 '胃肠炎': ['胃肠炎', '胃肠功能紊乱'],
 '健康': ['健康'],
 '帕金森': ['帕金森'],
 '癫痫': ['癫痫', '子痫'],
 '对照': ['对照']}

In [6]:
# Per-category tallies: a match counter and the list of matching sample IDs
frq_count = dict.fromkeys(types, 0)
id_list = {disease: list() for disease in types}

def select_type(df: DataFrame, name: str, key: str) -> int:
    """
    Record the patients whose diagnosis mentions ``name`` under category ``key``.

    Rows of ``df`` whose "临床诊断" value contains ``name`` are selected and
    their "谷禾公司编号" values are appended to the module-level
    ``id_list[key]``.

    An ID that is already stored for ``key`` is skipped, so a patient whose
    diagnosis matches several keywords of the same category (for example
    "胸痛胸闷") is stored and counted only once.

    Args:
        df: Sample sheet with "临床诊断" and "谷禾公司编号" columns.
        name: Keyword to look for; matched literally, not as a regex.
        key: Category in ``id_list`` that receives the matching IDs.

    Returns:
        The number of IDs newly added to ``id_list[key]`` by this call.
    """
    # regex=False: keywords are plain text, so characters such as "." or "("
    # must not be interpreted as a pattern. Missing diagnoses never match.
    matches = df["临床诊断"].str.contains(name, na=False, regex=False)

    recorded = id_list[key]
    already_seen = set(recorded)
    added = 0
    for patient_id in df.loc[matches, "谷禾公司编号"].to_list():
        if patient_id in already_seen:
            continue
        already_seen.add(patient_id)
        recorded.append(patient_id)
        added += 1
    return added

# Add up, per category, what select_type reports for each of its keywords
for disease, keywords in kw_dict.items():
    frq_count[disease] += sum(select_type(df, keyword, disease) for keyword in keywords)

frq_count

{'高血压': 19,
 '结直肠癌': 56,
 '糖尿病': 93,
 '肥胖': 4,
 '胸痛胸闷': 41,
 '胆结石': 16,
 '心血管': 79,
 '便秘': 7,
 '胃肠炎': 12,
 '健康': 8,
 '帕金森': 0,
 '癫痫': 10,
 '对照': 0}

In [7]:
def check_format(fp: str, read_len: int = 151) -> bool:
    """
    Check whether the file at ``fp`` has the expected 4-line FASTQ layout.

    Within every group of four lines, the third line must be exactly "+" and
    the second and fourth lines must be ``read_len`` characters long once
    trailing whitespace is stripped.

    Unlike a bare line-by-line check, an empty file and a file that stops in
    the middle of a record are rejected. Trailing blank lines are tolerated.

    Args:
        fp: Path of the FASTQ file.
        read_len: Required length of the second and fourth line of each group.

    Returns:
        True if the layout is valid; False if it is not, if the file does not
        exist, or if it cannot be decoded as text.
    """
    non_blank_lines = 0
    try:
        with open(fp) as file:
            for line_num, line in enumerate(file):
                content = line.rstrip()
                position = line_num % 4  # 0-based position inside the record
                # the third line of every record is a lone plus sign
                if position == 2 and content != "+":
                    return False
                # second and fourth lines must have the expected length
                if position in (1, 3) and len(content) != read_len:
                    return False
                if content:
                    non_blank_lines += 1
    except (FileNotFoundError, UnicodeDecodeError):
        # a missing file or binary/corrupt content counts as invalid
        return False
    # at least one record, and no record cut short
    return non_blank_lines > 0 and non_blank_lines % 4 == 0

In [8]:
# Move two fastq files to folder only if both exist
# NO NEED TO RE-RUN
from shutil import copy

def move_fastq(l: list, folder: str,
               src_dir: str = os.path.join("..", "data", "fastq_baoteng"),
               dest_root: str = os.path.join("..", "result")) -> set:
    """
    Copy the paired FASTQ files of each sample ID into ``dest_root/folder``.

    For every ID the files ``<id>_1.fastq`` and ``<id>_2.fastq`` (ID
    left-padded with zeros to 9 characters) are looked up in ``src_dir``.
    They are copied only when both pass ``check_format``; otherwise the ID is
    reported as damaged or missing.

    The destination folder is created if it does not exist. Without that,
    ``shutil.copy`` either fails or writes a plain file named like the folder.

    Args:
        l: Sample IDs to process.
        folder: Sub-folder of ``dest_root`` that receives the copies.
        src_dir: Directory holding the source FASTQ files.
        dest_root: Directory under which ``folder`` is located.

    Returns:
        The set of IDs whose file pair was not copied.
    """
    dest_dir = os.path.join(dest_root, folder)
    os.makedirs(dest_dir, exist_ok=True)

    del_list = set()
    for id_num in l:
        fp1 = os.path.join(src_dir, f"{id_num:0>9}_1.fastq")  # first file of the pair
        fp2 = os.path.join(src_dir, f"{id_num:0>9}_2.fastq")  # second file of the pair

        if check_format(fp1) and check_format(fp2):
            copy(fp1, dest_dir)
            copy(fp2, dest_dir)
            print(f"文件复制成功: {id_num}")
        else:
            del_list.add(id_num)
            print(f"文件损坏或缺失： {id_num}")

    return del_list

In [9]:
# Copy the cardiovascular FASTQ pairs; keep the IDs that could not be copied
del_list_cvd = move_fastq(l=id_list["心血管"], folder="心血管\\")

文件复制成功: 64272008
文件复制成功: 16607900
文件复制成功: 80098432
文件复制成功: 1063714
文件复制成功: 35625116
文件复制成功: 74668125
文件复制成功: 20540465
文件复制成功: 68404683
文件复制成功: 47897243
文件复制成功: 48981332
文件复制成功: 6433716
文件复制成功: 77815645
文件复制成功: 212406565
文件复制成功: 290246588
文件复制成功: 275334254
文件复制成功: 260199619
文件复制成功: 282386914
文件复制成功: 226447529
文件复制成功: 247443188
文件复制成功: 205338649
文件复制成功: 294408542
文件复制成功: 284171308
文件复制成功: 285016962
文件复制成功: 207466091
文件复制成功: 28413823
文件复制成功: 58712861
文件复制成功: 73027283
文件复制成功: 51989304
文件复制成功: 3940838
文件复制成功: 83825407
文件复制成功: 73746752
文件复制成功: 67377699
文件复制成功: 20338609
文件复制成功: 225497908
文件复制成功: 201359904
文件复制成功: 86138562
文件复制成功: 46522778
文件复制成功: 63526139
文件复制成功: 296585974
文件复制成功: 75707577
文件复制成功: 230810038
文件复制成功: 243234323
文件复制成功: 52489767
文件复制成功: 7559814
文件复制成功: 13252989
文件复制成功: 13790645
文件复制成功: 16339160
文件复制成功: 7204448
文件复制成功: 48392795
文件复制成功: 30061790
文件复制成功: 88631081
文件复制成功: 55156639
文件复制成功: 53653910
文件复制成功: 91197625
文件复制成功: 50861904
文件复制成功: 87017625
文件复制成功: 66033110
文件复制成功: 276415474
文

In [10]:
# Drop the cardiovascular IDs that are listed in del_list_cvd
id_list["心血管"] = list(filter(lambda pid: pid not in del_list_cvd, id_list["心血管"]))
len(id_list["心血管"])

79

In [11]:
# Export diagnosis and sample ID of the retained cardiovascular patients
cvd_rows = df["谷禾公司编号"].isin(id_list["心血管"])
df.loc[cvd_rows, ["临床诊断", "谷禾公司编号"]].to_csv(
    "..\\result\\心血管.csv",
    encoding="gbk",
    index=False,
)

In [12]:
# List the IDs in del_list_cvd, one zero-padded 9-character ID per line
with open("..\\result\\fastq文件缺失.txt", 'w+') as f:
    f.writelines(f"{int(num):0>9}\n" for num in del_list_cvd)

In [13]:
# Diagnoses assigned to the control group; matched against the whole
# diagnosis string, not by substring
healthy_disease = {
    "健康查体", "健康体检", "子痫前期", "肺炎",
    "急性胰腺炎", "支气管肺炎", "急性支气管肺炎", "一般性医学检查",
    "疱疹性咽峡炎", "癫痫", "甲减肺炎哮喘", "睡眠障碍",
    "甲状腺功能减退症", "疖肿", "颈椎不稳定", "咳嗽",
    "呕吐2天.发烧一天", "体检", "头晕和眩晕", "未知",
    "无", "消化不良",
}
is_control = df["临床诊断"].isin(healthy_disease)
selected = df[is_control]
id_list["对照"] = selected["谷禾公司编号"].to_list()
len(id_list["对照"])

75

In [14]:
# Copy the control-group FASTQ pairs; keep the IDs that could not be copied
del_list_healthy = move_fastq(l=id_list["对照"], folder="对照\\")

文件复制成功: 78419523
文件复制成功: 97994914
文件复制成功: 23990796
文件复制成功: 95628714
文件复制成功: 89510985
文件复制成功: 79634669
文件复制成功: 10328212
文件复制成功: 77923690
文件复制成功: 49838963
文件复制成功: 48541647
文件复制成功: 88006179
文件复制成功: 50329624
文件复制成功: 37779913
文件复制成功: 33783588
文件复制成功: 39466366
文件复制成功: 77322595
文件复制成功: 82831840
文件复制成功: 40931638
文件复制成功: 20134530
文件复制成功: 80652682
文件复制成功: 11087318
文件复制成功: 81518468
文件复制成功: 66626534
文件复制成功: 68103971
文件复制成功: 26844805
文件复制成功: 58664581
文件复制成功: 58474301
文件复制成功: 61540721
文件复制成功: 28795968
文件复制成功: 4380559
文件复制成功: 5254362
文件复制成功: 79450463
文件复制成功: 44818711
文件复制成功: 75667750
文件复制成功: 28263878
文件复制成功: 74051281
文件复制成功: 24795759
文件复制成功: 92095935
文件复制成功: 72522653
文件复制成功: 83820369
文件复制成功: 39195460
文件复制成功: 30432977
文件复制成功: 38070812
文件复制成功: 80949288
文件复制成功: 3314627
文件复制成功: 78646901
文件复制成功: 85328708
文件复制成功: 84825914
文件复制成功: 48948354
文件复制成功: 46177168
文件损坏或缺失： 50200953
文件损坏或缺失： 20542857
文件损坏或缺失： 24341940
文件损坏或缺失： 25684689
文件损坏或缺失： 46358340
文件损坏或缺失： 79541007
文件损坏或缺失： 83421188
文件损坏或缺失： 78491631
文件损坏或缺失： 

In [15]:
# Number of control-group IDs that move_fastq reported as damaged or missing
len(del_list_healthy)

25

In [16]:
# Drop the control-group IDs that are listed in del_list_healthy
id_list["对照"] = list(filter(lambda pid: pid not in del_list_healthy, id_list["对照"]))
len(id_list["对照"])

50

In [17]:
# Export diagnosis and sample ID of the retained control-group patients
control_rows = df["谷禾公司编号"].isin(id_list["对照"])
df.loc[control_rows, ["临床诊断", "谷禾公司编号"]].to_csv(
    "..\\result\\对照.csv",
    encoding="gbk",
    index=False,
)