In [16]:
import gzip


def unzip(input, output):
    """Decompress a gzip file into a regular file.

    Parameters
    ----------
    input : str or path-like
        Path of the gzip-compressed file to read.
    output : str or path-like
        Path of the decompressed file to write; it is opened in "wb" mode,
        so an existing file at that path is overwritten.

    Returns
    -------
    None
        The function only writes the output file and prints a confirmation.
    """
    # Imported locally so this cell does not depend on any other cell.
    import shutil

    # Copy in chunks instead of f_in.read(): the decompressed content never has
    # to fit in memory all at once.
    with gzip.open(input, "rb") as f_in, open(output, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

    print("file unzipped and save at", output)

# Decompress the GSE44667 series matrix into the same directory as the archive.
unzip(
    input="/data/LSY/z_preparing_and_parts/commontrait_Preeclampsia/GSE44667_series_matrix.txt.gz",
    output="/data/LSY/z_preparing_and_parts/commontrait_Preeclampsia/GSE44667_series_matrix.txt",
)

file unzipped and save at /data/LSY/z_preparing_and_parts/commontrait_Preeclampsia/GSE44667_series_matrix.txt


In [3]:
import warnings

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, roc_auc_score,roc_curve
from sklearn.model_selection import GridSearchCV
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")



# Location of the training table, plus a directory path that the visible cells
# define but do not use.
inputfilepath = "/data/LSY/z_preparing_and_parts/commontrait_Preeclampsia/data_train.txt"
outputfilepath = "/data/LSY/z_preparing_and_parts/commontrait_Preeclampsia/"

# load data
# Tab-separated text: the first row holds the column names and the first column
# becomes the row index.
datafile = pd.read_table(inputfilepath, sep="\t", header=0, index_col=0)


In [8]:

import pandas as pd



# read matrix
# Read the whole series-matrix file into a list, one entry per line (newlines kept).
with open(
    "/data/LSY/gse_matrix_raw/commontrait_Preeclampsia/GSE100197_series_matrix.txt", "r"
) as file:
    lines = list(file)

# find metadata
# The slice of interest runs from the first blank line up to (not including) the
# "!series_matrix_table_begin" marker, or to the end of the file when no marker exists.
first_empty_line_index = lines.index("\n")
try:
    table_begin_index = [
        text.startswith("!series_matrix_table_begin") for text in lines
    ].index(True)
except ValueError:
    table_begin_index = None
end_index = len(lines) if table_begin_index is None else table_begin_index
content_before_table = lines[first_empty_line_index:end_index]


# read metadata
# One list per "!"-prefixed line: strip surrounding whitespace, drop the leading
# "!" and split the remainder on tabs.
metadata_lines = [
    text.strip()[1:].split("\t")
    for text in content_before_table
    if text.startswith("!")
]

# Transpose so the first row holds the field names, promote that row to the
# column labels and then remove it from the data.
metadata_df = pd.DataFrame(metadata_lines).T
metadata_df.columns = metadata_df.iloc[0]
metadata_df = metadata_df.drop(index=0)

# Discard every row that has a missing value in any column.
metadata_df.dropna(axis=0, how="any", inplace=True)


In [41]:
# mask = metadata_df.iloc[:, 7] == '"Peripheral blood"'

# metadata_df = metadata_df[mask]



In [9]:
# List the distinct values found in the metadata column at position 10; the
# status labelling in the next cell is derived from these strings.
print(metadata_df.iloc[:,10].unique())

['"pathology group: EOPE"' '"pathology group: IUGR"'
 '"pathology group: LOPE"' '"pathology group: PreT"'
 '"pathology group: REPLICATE"' '"pathology group: Term"']


In [10]:

# find status
# Binary label taken from the metadata column at position 10: strings containing
# "PE" map to 1, strings containing "Term" map to 0, everything else to None.
metadata_df["status"] = metadata_df.iloc[:, 10].apply(
    lambda label: (1 if "PE" in label else 0)
    if ("PE" in label or "Term" in label)
    else None
)
# Re-index the metadata by sample accession.
metadata_df = metadata_df.set_index(keys="Sample_geo_accession")

In [11]:

# Reduce the metadata to its numeric "status" labels. The name metadata_df is
# reused for the resulting Series, so the column lookup is guarded: without the
# guard, running this cell a second time raises KeyError on the Series.
if isinstance(metadata_df, pd.DataFrame):
    metadata_df = metadata_df["status"]
# Convert the whole Series in one call; labels that cannot be parsed (including
# None) become NaN.
metadata_df = pd.to_numeric(metadata_df, errors="coerce")

In [12]:
# Inspect the labels; metadata_df was rebound to the "status" Series in the cell above.
print(metadata_df)

Sample_geo_accession
"GSM2674413"    1.0
"GSM2674414"    1.0
"GSM2674415"    1.0
"GSM2674416"    1.0
"GSM2674417"    1.0
               ... 
"GSM2674510"    0.0
"GSM2674511"    0.0
"GSM2674512"    0.0
"GSM2674513"    0.0
"GSM2674514"    0.0
Name: status, Length: 102, dtype: float64


In [13]:


# read data
# Keep the table rows only: skip "!"-prefixed metadata/marker lines and blank lines.
# Fields are split on tabs rather than on arbitrary whitespace, so an empty cell
# keeps its own position (and later becomes NaN) instead of collapsing and shifting
# the remaining values of that row into the wrong sample.
data_lines = [
    line.rstrip("\r\n").split("\t")
    for line in lines
    if line.strip() and not line.startswith("!")
]
data_df = pd.DataFrame(data_lines).T

# After the transpose, row 0 holds the first field of every line (the "ID_REF"
# header cell followed by the row identifiers); use it as column labels.
data_df.columns = data_df.iloc[0]
data_df = data_df.drop(0, axis=0)

# Blank lines were filtered out above, so there is no placeholder column left to
# drop after moving the sample accessions into the index.
data_df = data_df.set_index('"ID_REF"')
data_df.columns = [col.strip('"') for col in data_df.columns]

# Convert every column to numbers; anything unparseable (including empty cells)
# becomes NaN.
data_df = data_df.apply(pd.to_numeric, errors="coerce")





In [10]:
# Preview the numeric table built above (one row per sample accession).
print(data_df)

              cg00000029  cg00000108  cg00000109  cg00000165  cg00000236  \
"ID_REF"                                                                   
"GSM2674413"    0.073305    0.923194    0.843182    0.706948    0.791106   
"GSM2674414"    0.070529    0.930251    0.757784    0.566484    0.801352   
"GSM2674415"    0.055583    0.931459    0.874118    0.532276    0.808321   
"GSM2674416"    0.050189    0.942209    0.878689    0.469923    0.846638   
"GSM2674417"    0.043204    0.927757    0.857130    0.743771    0.815431   
...                  ...         ...         ...         ...         ...   
"GSM2674510"    0.049234    0.919641    0.789199    0.649706    0.827407   
"GSM2674511"    0.049540    0.932819    0.858778    0.679322    0.841147   
"GSM2674512"    0.052620    0.939848    0.854971    0.667784    0.861020   
"GSM2674513"    0.046779    0.942366    0.890987    0.676946    0.836106   
"GSM2674514"    0.084674    0.946180    0.890475    0.631085    0.847811   

           

In [9]:

def m_to_beta(M):
    """Map M to ``1 / (1 + 2 ** (-M))``.

    Going by the function name, this converts M-values to beta-values. The
    expression is applied as written, so ``M`` may be a scalar or any object
    that supports negation, exponentiation and arithmetic with numbers.
    """
    inverse_power = 2 ** (-M)
    return 1 / (1 + inverse_power)


In [10]:
# data_df = data_df.apply(m_to_beta)

In [14]:

# merge metadata and data
# Attach the labels as a "status" column; assigning a Series aligns it on the index.
data_df["status"] = metadata_df
# drop other disease sample
# Keep only the rows that received a label (status is not NaN).
data_df = data_df.dropna(subset=["status"])

In [15]:
# Display the labelled table; "status" was added as the last column in the cell above.
data_df

Unnamed: 0_level_0,cg00000029,cg00000108,cg00000109,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,cg00000622,...,ch.9.941347R,ch.9.945770F,ch.9.96055087R,ch.9.98463211R,ch.9.98937537R,ch.9.98957343R,ch.9.98959675F,ch.9.98989607R,ch.9.991104F,status
"""ID_REF""",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""GSM2674413""",0.073305,0.923194,0.843182,0.706948,0.791106,0.579048,0.601619,0.607691,0.271943,0.01679,...,0.040178,0.045171,0.135043,0.041653,0.025009,0.035932,0.240965,0.021371,0.187828,1.0
"""GSM2674414""",0.070529,0.930251,0.757784,0.566484,0.801352,0.663254,0.57846,0.521605,0.314578,0.013469,...,0.026982,0.055064,0.167966,0.028627,0.031673,0.026992,0.266654,0.015337,0.132078,1.0
"""GSM2674415""",0.055583,0.931459,0.874118,0.532276,0.808321,0.626985,0.670363,0.506756,0.306765,0.014368,...,0.033826,0.034061,0.114206,0.035298,0.024196,0.03414,0.184527,0.014313,0.130744,1.0
"""GSM2674416""",0.050189,0.942209,0.878689,0.469923,0.846638,0.65461,0.573392,0.478174,0.373154,0.014567,...,0.040933,0.037049,0.102602,0.026733,0.016482,0.032005,0.237158,0.014505,0.053833,1.0
"""GSM2674417""",0.043204,0.927757,0.85713,0.743771,0.815431,0.665747,0.604347,0.360131,0.323337,0.012779,...,0.022829,0.012649,0.11714,0.027661,0.011753,0.028743,0.347082,0.014088,0.140766,1.0
"""GSM2674418""",0.086117,0.934962,0.877392,0.613682,0.846824,0.777238,0.658283,0.503792,0.216306,0.018645,...,0.030275,0.032996,0.111348,0.055365,0.02828,0.041129,0.282191,0.016814,0.082007,1.0
"""GSM2674419""",0.045667,0.936764,0.869118,0.610987,0.825903,0.663243,0.647315,0.515948,0.223175,0.014109,...,0.022056,0.007282,0.146927,0.02625,0.017352,0.024685,0.215919,0.01381,0.092025,1.0
"""GSM2674420""",0.036728,0.93605,0.881453,0.711432,0.865391,0.657535,0.730051,0.540177,0.336616,0.012916,...,0.024654,0.032672,0.090715,0.025872,0.015678,0.029149,0.229544,0.01511,0.073772,1.0
"""GSM2674421""",0.055055,0.943748,0.862869,0.562096,0.846492,0.536064,0.658529,0.487405,0.195156,0.017106,...,0.029638,0.042172,0.131012,0.045436,0.022322,0.042201,0.222355,0.017898,0.086751,1.0
"""GSM2674422""",0.054155,0.937986,0.873617,0.720605,0.85849,0.689254,0.735657,0.472255,0.449223,0.018107,...,0.028655,0.00416,0.084008,0.033762,0.021238,0.030698,0.189232,0.016004,0.074496,1.0


In [17]:
# Keep the first dataset reachable under a second name. This binds the same object
# (no copy is made); data1 still refers to the first dataset later on because the
# next "read data" cell rebinds data_df to a new DataFrame.
data1=data_df

In [18]:
# Check the labels carried by the first dataset.
data1['status']

"ID_REF"
"GSM2674413"    1.0
"GSM2674414"    1.0
"GSM2674415"    1.0
"GSM2674416"    1.0
"GSM2674417"    1.0
"GSM2674418"    1.0
"GSM2674419"    1.0
"GSM2674420"    1.0
"GSM2674421"    1.0
"GSM2674422"    1.0
"GSM2674423"    1.0
"GSM2674424"    1.0
"GSM2674425"    1.0
"GSM2674426"    1.0
"GSM2674427"    1.0
"GSM2674428"    1.0
"GSM2674429"    1.0
"GSM2674430"    1.0
"GSM2674431"    1.0
"GSM2674432"    1.0
"GSM2674433"    1.0
"GSM2674434"    1.0
"GSM2674446"    1.0
"GSM2674447"    1.0
"GSM2674448"    1.0
"GSM2674449"    1.0
"GSM2674450"    1.0
"GSM2674451"    1.0
"GSM2674452"    1.0
"GSM2674453"    1.0
"GSM2674454"    1.0
"GSM2674455"    1.0
"GSM2674456"    1.0
"GSM2674457"    1.0
"GSM2674458"    1.0
"GSM2674459"    1.0
"GSM2674460"    1.0
"GSM2674461"    1.0
"GSM2674462"    1.0
"GSM2674463"    1.0
"GSM2674496"    0.0
"GSM2674497"    0.0
"GSM2674498"    0.0
"GSM2674499"    0.0
"GSM2674500"    0.0
"GSM2674501"    0.0
"GSM2674502"    0.0
"GSM2674503"    0.0
"GSM2674504"    0.0
"GSM2674505

In [19]:

# read matrix
# Read the whole series-matrix file into a list, one entry per line (newlines kept).
with open(
    "/data/LSY/z_preparing_and_parts/commontrait_Preeclampsia/GSE44667_series_matrix.txt", "r"
) as file:
    lines = list(file)

# find metadata
# The slice of interest runs from the first blank line up to (not including) the
# "!series_matrix_table_begin" marker, or to the end of the file when no marker exists.
first_empty_line_index = lines.index("\n")
try:
    table_begin_index = [
        text.startswith("!series_matrix_table_begin") for text in lines
    ].index(True)
except ValueError:
    table_begin_index = None
end_index = len(lines) if table_begin_index is None else table_begin_index
content_before_table = lines[first_empty_line_index:end_index]


# read metadata
# One list per "!"-prefixed line: strip surrounding whitespace, drop the leading
# "!" and split the remainder on tabs.
metadata_lines = [
    text.strip()[1:].split("\t")
    for text in content_before_table
    if text.startswith("!")
]

# Transpose so the first row holds the field names, promote that row to the
# column labels and then remove it from the data.
metadata_df = pd.DataFrame(metadata_lines).T
metadata_df.columns = metadata_df.iloc[0]
metadata_df = metadata_df.drop(index=0)

# Discard every row that has a missing value in any column.
metadata_df.dropna(axis=0, how="any", inplace=True)


In [23]:
# Show the metadata column at position 10; the status labelling in the next cell
# is derived from these strings.
print(metadata_df.iloc[:,10])

1       "condition: EOPET"
2       "condition: EOPET"
3       "condition: EOPET"
4       "condition: EOPET"
5       "condition: EOPET"
6       "condition: EOPET"
7       "condition: EOPET"
8       "condition: EOPET"
9       "condition: EOPET"
10      "condition: EOPET"
11      "condition: EOPET"
12      "condition: EOPET"
13      "condition: EOPET"
14      "condition: EOPET"
15      "condition: EOPET"
16      "condition: EOPET"
17      "condition: EOPET"
18      "condition: EOPET"
19      "condition: EOPET"
20      "condition: EOPET"
21    "condition: Control"
22    "condition: Control"
23    "condition: Control"
24    "condition: Control"
25    "condition: Control"
26    "condition: Control"
27    "condition: Control"
28    "condition: Control"
29    "condition: Control"
30    "condition: Control"
31    "condition: Control"
32    "condition: Control"
33    "condition: Control"
34    "condition: Control"
35    "condition: Control"
36    "condition: Control"
37    "condition: Control"
3

In [24]:

# find status
# Binary label taken from the metadata column at position 10: strings containing
# "EOPET" map to 1, strings containing "Control" map to 0, everything else to None.
metadata_df["status"] = metadata_df.iloc[:, 10].apply(
    lambda label: (1 if "EOPET" in label else 0)
    if ("EOPET" in label or "Control" in label)
    else None
)
# Re-index the metadata by sample accession.
metadata_df = metadata_df.set_index(keys="Sample_geo_accession")

# From here on metadata_df is just the numeric "status" Series; values that
# cannot be parsed as numbers become NaN.
metadata_df = pd.to_numeric(metadata_df["status"], errors="coerce")


In [26]:
# Inspect the labels; metadata_df now holds the "status" Series built in the cell above.
metadata_df

Sample_geo_accession
"GSM1088602"    1
"GSM1088603"    1
"GSM1088604"    1
"GSM1088605"    1
"GSM1088606"    1
"GSM1088607"    1
"GSM1088608"    1
"GSM1088609"    1
"GSM1088610"    1
"GSM1088611"    1
"GSM1088612"    1
"GSM1088613"    1
"GSM1088614"    1
"GSM1088615"    1
"GSM1088616"    1
"GSM1088617"    1
"GSM1088618"    1
"GSM1088619"    1
"GSM1088620"    1
"GSM1088621"    1
"GSM1088622"    0
"GSM1088623"    0
"GSM1088624"    0
"GSM1088625"    0
"GSM1088626"    0
"GSM1088627"    0
"GSM1088628"    0
"GSM1088629"    0
"GSM1088630"    0
"GSM1088631"    0
"GSM1088632"    0
"GSM1088633"    0
"GSM1088634"    0
"GSM1088635"    0
"GSM1088636"    0
"GSM1088637"    0
"GSM1088638"    0
"GSM1088639"    0
"GSM1088640"    0
"GSM1088641"    0
Name: status, dtype: int64

In [27]:


# read data
# Keep the table rows only: skip "!"-prefixed metadata/marker lines and blank lines.
# Fields are split on tabs rather than on arbitrary whitespace, so an empty cell
# keeps its own position (and later becomes NaN) instead of collapsing and shifting
# the remaining values of that row into the wrong sample.
data_lines = [
    line.rstrip("\r\n").split("\t")
    for line in lines
    if line.strip() and not line.startswith("!")
]
data_df = pd.DataFrame(data_lines).T

# After the transpose, row 0 holds the first field of every line (the "ID_REF"
# header cell followed by the row identifiers); use it as column labels.
data_df.columns = data_df.iloc[0]
data_df = data_df.drop(0, axis=0)

# Blank lines were filtered out above, so there is no placeholder column left to
# drop after moving the sample accessions into the index.
data_df = data_df.set_index('"ID_REF"')
data_df.columns = [col.strip('"') for col in data_df.columns]

# Convert every column to numbers; anything unparseable (including empty cells)
# becomes NaN.
data_df = data_df.apply(pd.to_numeric, errors="coerce")





In [28]:

# merge metadata and data
# Attach the labels as a "status" column; assigning a Series aligns it on the index.
data_df["status"] = metadata_df
# drop other disease sample
# Keep only the rows that received a label (status is not NaN).
data_df = data_df.dropna(subset=["status"])

In [30]:
# NOTE(review): concatenated_data is assigned by the cell that FOLLOWS this one in
# the file (execution count 29 vs. 30 here), so a top-to-bottom run on a fresh
# kernel raises NameError at this line. The two cells should swap places.
print(concatenated_data['status'])

"ID_REF"
"GSM2674413"    1.0
"GSM2674414"    1.0
"GSM2674415"    1.0
"GSM2674416"    1.0
"GSM2674417"    1.0
               ... 
"GSM1088637"    0.0
"GSM1088638"    0.0
"GSM1088639"    0.0
"GSM1088640"    0.0
"GSM1088641"    0.0
Name: status, Length: 99, dtype: float64


In [29]:
# Stack the first dataset (data1) and the current data_df row-wise, keeping the
# existing index labels and the union of the two column sets.
concatenated_data = pd.concat(
    [data1, data_df], axis=0, join="outer", ignore_index=False
)

In [31]:
# NOTE(review): this import sits mid-notebook instead of in the import cell near the top.
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed (random_state=42).
# NOTE(review): the split is taken from data_df, which at this point holds only the
# GSE44667 samples; concatenated_data from the previous cell is not used anywhere in
# the visible cells. Confirm which table was meant to be split.
train_data, test_data = train_test_split(data_df, test_size=0.2, random_state=42)


# save data
# Training split, written tab-separated with index and header. This is the same
# path that the earlier "load data" cell reads as inputfilepath.
train_data.to_csv(
    "/data/LSY/z_preparing_and_parts/commontrait_Preeclampsia/data_train.txt",
    index=True,
    header=True,
    sep="\t",
)


# save data
# Held-out split, written in the same format next to the training file.
test_data.to_csv(
    "/data/LSY/z_preparing_and_parts/commontrait_Preeclampsia/data_test.txt",
    index=True,
    header=True,
    sep="\t",
)
