In [None]:
import json
import pandas as pd
import pickle
import itertools
import numpy as np

In [None]:
# Load the hsa00001 hierarchy export as nested dicts/lists.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default.
with open("../rawdata/hsa00001_20250311.json", encoding="utf-8") as f:
    rawdata = json.load(f)

In [None]:
def parse_protein_str(protein_str):
    """Split one gene entry string into its ID, symbol and description.

    Only the text before the first tab is parsed. That text is read as
    ``"<gene_id> <gene_symbol>; <description>"``.

    Parameters
    ----------
    protein_str : str
        Raw entry text.

    Returns
    -------
    tuple of (str, str, str)
        ``(gene_id, gene_symbol, gene_description)``; the description is
        stripped of surrounding whitespace.

    Raises
    ------
    IndexError
        If the entry has no ``;`` separator, or no space-separated symbol
        after the gene ID.
    """
    gene_field = protein_str.split('\t')[0]

    # Split on the first ';' only, so a description that itself contains
    # a semicolon is kept whole instead of being truncated.
    gene_info = gene_field.split(';', 1)

    id_and_symbol = gene_info[0].split(' ')
    gene_id = id_and_symbol[0]
    gene_symbol = id_and_symbol[1]
    gene_description = gene_info[1].strip()

    return gene_id, gene_symbol, gene_description

In [None]:
# Flatten the nested hierarchy (level 1 -> level 2 -> level 3 -> gene entries)
# into one record per (level-3 node, gene entry) pair.
records = []
unparsed_entries = []  # gene entries that could not be parsed, kept for inspection

for level1_node in rawdata['children']:
    level1 = level1_node['name']
    for level2_node in level1_node['children']:
        level2 = level2_node['name']
        for level3_node in level2_node['children']:
            # A level-3 node without a name or without children contributes
            # no gene records, so it is skipped.
            if 'name' not in level3_node or 'children' not in level3_node:
                continue
            level3 = level3_node['name']

            for protein in level3_node['children']:
                # Handle failures per entry: one malformed gene string must
                # not discard the remaining genes of the same level-3 node.
                try:
                    gene_id, gene_symbol, gene_description = parse_protein_str(protein['name'])
                except (KeyError, IndexError):
                    unparsed_entries.append(protein)
                    continue

                records.append({
                    'Level 1': level1,
                    'Level 2': level2,
                    'Level 3': level3,
                    'Gene ID': gene_id,
                    'Gene Symbol': gene_symbol,
                    'Gene Description': gene_description
                })

if unparsed_entries:
    print(f"Skipped {len(unparsed_entries)} gene entries that could not be parsed")

data = (
    # Explicit columns keep the frame well-formed even when no record was found.
    pd.DataFrame(
        records,
        columns=['Level 1', 'Level 2', 'Level 3',
                 'Gene ID', 'Gene Symbol', 'Gene Description'],
    )
    # Pull the pathway ID out of the "[PATH:hsa…]" tag in the level-3 name;
    # names without such a tag yield NaN.
    .assign(KEGGID=lambda df: df['Level 3'].str.extract(r'\[PATH:(hsa\d+)\]', expand=False))
    # Drops every row with a missing value, including rows without a pathway ID.
    .dropna()
    .assign(Levels=lambda df: df['Level 1'] + '---' + df['Level 2'])
)

In [None]:
# Give every distinct pathway ID an integer node index, assigned in sorted
# order of the IDs.
# NOTE(review): 7854 is presumably the first index after the already-existing
# nodes (e.g. those in Target_dict) — confirm before changing upstream data.
KEGG_list = sorted(data['KEGGID'].unique())
KEGG_dict = {
    kegg_id: node_index
    for node_index, kegg_id in enumerate(KEGG_list, start=7854)
}

# Persist the pathway -> node index mapping.
with open('../preprocessed_data/KEGG_dict.pkl', 'wb') as kegg_dict_file:
    pickle.dump(KEGG_dict, kegg_dict_file)

# Lookup used below to map the 'Gene Symbol' column.
# Unpickling executes arbitrary code; only load files produced by this project.
Target_dict = pd.read_pickle('../preprocessed_data/Target_dict.pkl')

In [None]:
# Number of distinct pathway IDs that received a node index.
len(KEGG_list)

In [None]:
# Edge table linking genes to pathways.
# node1: gene symbol mapped through Target_dict; node2: pathway ID mapped
# through KEGG_dict. Rows whose symbol or pathway has no mapping are dropped.
data_kegg_protein = (
    pd.DataFrame({
        'node1': data['Gene Symbol'].map(Target_dict),
        'node2': data['KEGGID'].map(KEGG_dict),
    })
    .dropna()
    # Unmapped symbols introduce NaN during the mapping, so node1 is cast to
    # int once those rows are gone.
    .astype({'node1': int})
    # 12 is the interaction code used for these gene -> pathway edges.
    .assign(interaction=12)
)

# data_kegg_protein.to_csv('../preprocessed_data/kegg_protein.csv', index=None)

In [None]:
# First and last node index stored in KEGG_dict, in insertion order.
kegg_indices = list(KEGG_dict.values())
start, end = kegg_indices[0], kegg_indices[-1]
# nodes = list(range(start, end + 1))

# edges = list(itertools.combinations(nodes, 2))

# df = pd.DataFrame(edges, columns=['node1', 'node2'])

# data['KEGG_id'] = data['KEGGID'].map(KEGG_dict)
# keggid_to_level = dict(zip(data['KEGG_id'], data['Levels']))

# df["level1"] = df["node1"].map(keggid_to_level)
# df["level2"] = df["node2"].map(keggid_to_level)
# df = df.dropna()
# df["Type1_KEGG"] = (df["level1"] == df["level2"]).astype(int)
# df = df.drop(columns=["level1", "level2"])

In [None]:
# data_clean = data.dropna(subset=["KEGG_id", "Gene Symbol"])

# keggid_to_symbol = (
#     data_clean.groupby("KEGG_id")["Gene Symbol"]
#     .apply(set)        # collect the Gene Symbols of each KEGG_id into a set
#     .to_dict()         # convert to a dict
# )

# df["symbol1"] = df["node1"].map(lambda x: keggid_to_symbol.get(x, set()))
# df["symbol2"] = df["node2"].map(lambda x: keggid_to_symbol.get(x, set()))

# df["interaction"] = df.apply(
#     lambda row: 13 if (row["symbol1"] & row["symbol2"]) else 0,
#     axis=1
# )

# df = df.drop(columns=["symbol1", "symbol2"])

# df = df[df['interaction'] == 13]

In [None]:
# One self-loop edge (interaction code 14) per node index from start to end.
num_nodes = len(KEGG_dict)
print(num_nodes)

loop_node_ids = range(start, end + 1)
self_loops = pd.DataFrame({
    'node1': loop_node_ids,
    'node2': loop_node_ids,
    # An explicit list of length num_nodes makes construction fail if the
    # start..end range and the size of KEGG_dict ever disagree.
    'interaction': [14] * num_nodes,
})

# Independent copy with a fresh 0..n-1 index.
df_expanded = self_loops.reset_index(drop=True)

In [None]:
# Write the self-loop edges without the row index.
df_expanded.to_csv('../preprocessed_data/kegg.csv', index=False)

In [None]:
# Edge tables to combine: the previously built graph, the gene -> pathway
# edges, and the self-loops.
kg1 = pd.read_csv('../preprocessed_data/kg_v1.csv')
kg2 = data_kegg_protein.copy()
kg3 = df_expanded.copy()

# NOTE(review): only kg1 and kg2 are stacked here; kg3 (the self-loops) is
# built but never added to kg — confirm this is intentional.
kg = pd.concat([kg1, kg2])

In [None]:
# Load the pickled interaction lookup and display it for reference.
# NOTE(review): presumably maps interaction names to the integer codes used
# in the 'interaction' column — confirm against the file that produces it.
# Unpickling executes arbitrary code; only load files produced by this project.
interaction_to_idx = pd.read_pickle('../preprocessed_data/interaction_to_idx.pkl')
interaction_to_idx

In [None]:
def add_reverse_edges(edges, forward_types, reverse_type=None):
    """Append reversed copies of selected edges to an edge table.

    Parameters
    ----------
    edges : pandas.DataFrame
        Edge table with at least the columns 'node1', 'node2' and
        'interaction'.
    forward_types : list of int
        Interaction codes whose edges should get a reversed copy.
    reverse_type : int, optional
        Interaction code assigned to the reversed copies. When None, the
        reversed copies keep the interaction code of the edge they mirror.

    Returns
    -------
    pandas.DataFrame
        ``edges`` followed by the reversed copies, with a fresh 0..n-1 index.
        The input frame is not modified.
    """
    reversed_edges = edges[edges['interaction'].isin(forward_types)].copy()
    # Swap the endpoints; going through a plain array avoids pandas
    # re-aligning the values by column name, which would undo the swap.
    reversed_edges[['node1', 'node2']] = reversed_edges[['node2', 'node1']].to_numpy()
    if reverse_type is not None:
        reversed_edges['interaction'] = reverse_type
    return pd.concat([edges, reversed_edges], ignore_index=True)


# (forward interaction codes, code given to the reversed edges).
# None means the reversed edges keep their original code.
REVERSE_EDGE_RULES = [
    ([3], 8),
    ([5], 9),
    ([6], 10),
    ([7], 11),
    ([12], 13),
    ([1, 2, 4], None),
]

# Rules are applied in order, each on the table produced by the previous one.
# Re-running this cell appends the reversed edges again; re-run the cell that
# builds `kg` first.
for forward_types, reverse_type in REVERSE_EDGE_RULES:
    kg = add_reverse_edges(kg, forward_types, reverse_type)

In [None]:
# Number of edges per interaction code, to check the expanded graph.
kg['interaction'].value_counts()

In [None]:
# Write the expanded edge table without the row index.
kg.to_csv('../preprocessed_data/kg_v2.csv', index=False)