# Data Preparation
## Add "Diagnosis" as the class variable

In [1]:
import pandas as pd
from pandas import DataFrame
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Read genus.reabund.csv into a df.
# Forward slashes are used in the path because they work on Windows as well as
# on POSIX systems, whereas "..\\data\\..." only resolves on Windows.
# on_bad_lines="warn" skips malformed rows and reports each one; it replaces
# error_bad_lines=False, which was deprecated in pandas 1.3 and removed in 2.0.
df = pd.read_csv("../data/genus.reabund.csv",
                 on_bad_lines="warn")
df.head()

Unnamed: 0.1,Unnamed: 0,1063714,3314627,3940838,4380559,5254362,6433716,7204448,7559814,10328212,...,282386914,283608483,284171308,285016962,288075234,290246588,293150059,294408542,296585974,299673925
0,Enterobacteriaceae_unclassified,0.157034,10.143104,1.771959,28.06076,65.635131,3.940899,7.032138,0.506726,0.035817,...,5.499862,0.230341,2.846002,5.93297,0.131919,0.855014,39.73828,0.930422,8.508547,0.14602
1,Dorea,0.160174,0.393884,0.005998,0.003138,0.0,0.112734,0.111879,0.172646,0.0,...,0.0,0.558827,0.187399,0.009929,0.037102,0.128917,0.165019,0.02377,0.031713,0.026004
2,Betaproteobacteria_unclassified,1.290816,1.114063,0.123386,0.064861,0.019221,0.230266,0.286916,0.612108,0.0,...,0.004096,0.0,0.036912,0.172098,0.05634,0.015803,0.391096,0.320894,0.237846,0.552077
3,Ruminiclostridium_6,0.0,0.009323,0.017137,0.0,0.003844,2.281068,0.068571,0.0,0.0,...,0.0,0.0,0.0,0.0,0.449349,0.00499,0.688955,0.0,0.006343,0.0
4,Denitratisoma,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Label the first column "id", use it as the index, then transpose so the
# former column headers become the row index and the former "id" values
# become the column names.
df = (
    df.rename(columns={"Unnamed: 0": "id"})
    .set_index("id")
    .transpose()
)
df.head()

id,Enterobacteriaceae_unclassified,Dorea,Betaproteobacteria_unclassified,Ruminiclostridium_6,Denitratisoma,Howardella,Selenomonas,Fastidiosipila,Porphyromonadaceae_unclassified,Clostridiales_unclassified,...,Ruminococcaceae_NK4A214_group,Parabacteroides,Selenomonadales_unclassified,Dysgonomonas,Gordonia,Planococcaceae_unclassified,Tyzzerella,Prevotella_9,Lactobacillus,Desulfovibrio
1063714,0.157034,0.160174,1.290816,0.0,0.0,0.0,0.0,0.0,0.384209,0.097361,...,0.0,0.025125,0.0,0.0,0.0,0.0,0.0,10.19357,11.375509,0.112017
3314627,10.143104,0.393884,1.114063,0.009323,0.0,0.0,0.0,0.0,0.501095,0.41486,...,0.0,0.191115,0.0,0.0,0.0,0.0,0.0,23.798536,0.0,0.291335
3940838,1.771959,0.005998,0.123386,0.017137,0.0,0.0,0.0,0.0,0.200502,0.289614,...,0.006855,0.244201,0.0,0.0,0.005998,0.026562,0.0,12.349731,0.038558,0.005998
4380559,28.06076,0.003138,0.064861,0.0,0.0,0.0,0.0,0.0,0.008369,0.047077,...,0.0,1.021038,0.0,0.0,0.0,0.0,0.024061,0.112984,0.011508,0.0
5254362,65.635131,0.0,0.019221,0.003844,0.0,0.0,0.0,0.0,0.016658,0.00897,...,0.0,0.003844,0.0,0.0,0.0,0.0,0.0,0.421584,7.273286,0.0


In [4]:
# Read 微生态数据.txt (tab-separated, GBK-encoded) into a df.
# dtype=object keeps every column as read (no numeric inference), so the id
# column stays comparable with the string ids used elsewhere.
# Forward slashes keep the path portable (they also work on Windows).
# on_bad_lines="warn" skips malformed rows and reports each one; it replaces
# error_bad_lines=False, which was deprecated in pandas 1.3 and removed in 2.0.
table = pd.read_csv("../data/微生态数据.txt",
                    encoding="gbk",
                    delimiter="\t",
                    on_bad_lines="warn",
                    dtype=object)

In [5]:
# Keywords for cvd: substrings looked for in the clinical diagnosis text,
# grouped under the category key "心血管" (cardiovascular).
kw_dict = {
    "心血管": [
        "心脏病",      # heart disease
        "后循环缺血",  # posterior circulation ischemia
        "血脂",        # blood lipids
        "心力衰竭",    # heart failure
        "心功能不全",  # cardiac insufficiency
        "心律失常",    # arrhythmia
        "心悸",        # palpitations
        "心肌梗死",    # myocardial infarction
        "脑出血",      # cerebral hemorrhage
    ],
}

In [6]:
# Use the patient ids to find their diagnosis

ids = df.index

# Keep only the rows of `table` whose id ("谷禾公司编号") appears in df's index,
# with just the id and clinical-diagnosis ("临床诊断") columns. The .copy() makes
# an independent frame, so the column assignment below does not write into a
# slice of `table` (which would trigger SettingWithCopyWarning).
diagnosis = table.loc[
    table["谷禾公司编号"].isin(ids), ["谷禾公司编号", "临床诊断"]
].copy()


def _is_cvd(text):
    """Return 1 if the diagnosis text contains any cvd keyword, else 0.

    Non-string values (e.g. NaN for a missing diagnosis) are labelled 0
    instead of raising TypeError from the substring test.
    """
    if not isinstance(text, str):
        return 0
    return int(any(kw in text for kw in kw_dict["心血管"]))


# Map the diagnosis to 1 if it is cvd and 0 if not
diagnosis["临床诊断"] = diagnosis["临床诊断"].map(_is_cvd)

# Rename the id column to "id" and make it the index col
diagnosis = diagnosis.rename(columns={"谷禾公司编号": "id"}).set_index("id")

diagnosis.head()

Unnamed: 0_level_0,临床诊断
id,Unnamed: 1_level_1
78419523,0
97994914,0
23990796,0
75707577,1
95628714,0


In [7]:
# Attach the class label as a new "diagnosis" column. pandas aligns the label
# values to df's rows by index, so rows without a matching id get NaN.
df["diagnosis"] = diagnosis["临床诊断"]
print(df.shape)
df.head()

(129, 206)


id,Enterobacteriaceae_unclassified,Dorea,Betaproteobacteria_unclassified,Ruminiclostridium_6,Denitratisoma,Howardella,Selenomonas,Fastidiosipila,Porphyromonadaceae_unclassified,Clostridiales_unclassified,...,Parabacteroides,Selenomonadales_unclassified,Dysgonomonas,Gordonia,Planococcaceae_unclassified,Tyzzerella,Prevotella_9,Lactobacillus,Desulfovibrio,diagnosis
1063714,0.157034,0.160174,1.290816,0.0,0.0,0.0,0.0,0.0,0.384209,0.097361,...,0.025125,0.0,0.0,0.0,0.0,0.0,10.19357,11.375509,0.112017,1
3314627,10.143104,0.393884,1.114063,0.009323,0.0,0.0,0.0,0.0,0.501095,0.41486,...,0.191115,0.0,0.0,0.0,0.0,0.0,23.798536,0.0,0.291335,0
3940838,1.771959,0.005998,0.123386,0.017137,0.0,0.0,0.0,0.0,0.200502,0.289614,...,0.244201,0.0,0.0,0.005998,0.026562,0.0,12.349731,0.038558,0.005998,1
4380559,28.06076,0.003138,0.064861,0.0,0.0,0.0,0.0,0.0,0.008369,0.047077,...,1.021038,0.0,0.0,0.0,0.0,0.024061,0.112984,0.011508,0.0,0
5254362,65.635131,0.0,0.019221,0.003844,0.0,0.0,0.0,0.0,0.016658,0.00897,...,0.003844,0.0,0.0,0.0,0.0,0.0,0.421584,7.273286,0.0,0


In [8]:
# Write the complete table (features plus the "diagnosis" class column) to a file.
# Forward slashes keep the path portable: with "..\\data\\..." a POSIX system
# would create a file literally named "..\data\..." in the working directory.
df.to_csv("../data/genus.reabund_complete.csv")