In [9]:
import os
import time
import re
from itertools import product
from itertools import repeat
import concurrent.futures
import pandas as pd
import statistics
import warnings
from multiprocessing import Process, Pool
from pathlib import *
from Bio.PDB import *
from Bio.PDB.MMCIF2Dict import *
from Bio.PDB.PDBExceptions import PDBConstructionWarning 

warnings.simplefilter('ignore', PDBConstructionWarning) # Ignore PDBConstructionWarning (e.g. "WARNING: Chain B is discontinuous at line <line number>.")

def ramification(fileName, output_csv='/home/douglas/carboanalysis/carboanalysis/pdb/dataframes/ramifications.csv'):
    """Append the bracketed branched entities of one mmCIF file to a CSV.

    Reads the ``_entity`` category of ``fileName``, keeps the rows whose type
    is ``"branched"`` and whose description contains ``[`` or ``]``, and
    appends them (no header, no index) to ``output_csv``.

    Parameters
    ----------
    fileName : str
        Path of the mmCIF file to parse.
    output_csv : str, optional
        CSV file the matching rows are appended to. Defaults to the path the
        notebook has always used.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the file lacks one of the ``_entity`` items read below.
    ValueError
        If pandas cannot build a frame from those items (e.g. the lists have
        different lengths).

    Notes
    -----
    Rows are written with four columns: id, type, description,
    number_of_molecules.
    NOTE(review): the cell that reads this CSV names a fifth column, 'entry',
    which this function does not write (it would be read back as NaN).
    Confirm whether an entry identifier should be appended here.
    NOTE(review): the progress message below mentions B-factors although this
    function does not read any; it is left unchanged because it is runtime
    output.
    """
    print("B - factor values: " + fileName)

    mmcif_dict = MMCIF2Dict(fileName)

    # 'description' must be part of the frame: the filter below depends on it.
    entity_df = pd.DataFrame(data={
        "id": mmcif_dict['_entity.id'],
        "type": mmcif_dict['_entity.type'],
        "description": mmcif_dict['_entity.pdbx_description'],
        "number_of_molecules": mmcif_dict['_entity.pdbx_number_of_molecules'],
    })

    branched_df = entity_df.loc[entity_df['type'] == "branched"]

    # Literal (non-regex) search for either square bracket in the description.
    # Presumably brackets mark a side branch in the entity description -- TODO confirm.
    descriptions = branched_df['description'].astype(str)
    has_bracket = (descriptions.str.contains('[', regex=False)
                   | descriptions.str.contains(']', regex=False))
    ramified_df = branched_df.loc[has_bracket]

    # Append all matching rows in one write; skip the write entirely when
    # nothing matched so no empty file is created.
    if not ramified_df.empty:
        ramified_df.to_csv(output_csv, mode='a', header=False, index=False)


B - factor values: 2wmg.cif
  chain1_id comp_seq_id_1 vaa atom_symbol atom_label bfactors
0         B           NAG   1           B        GAL        2
1         B           NAG   1           B        FUC        4
2         B           GAL   2           B        FUC        3


In [8]:
# Count oligosaccharides
from SIC2023_resolution import *
from tqdm import tqdm
import os
import pandas as pd
import warnings

# Silence PDBConstructionWarning for the rest of the session.
warnings.simplefilter('ignore', PDBConstructionWarning)
def oligo_count(fileName):
    """Return the total number of branched-entity molecules in one mmCIF file.

    Parameters
    ----------
    fileName : str
        Path of the mmCIF file to parse.

    Returns
    -------
    float
        Sum of ``_entity.pdbx_number_of_molecules`` over the entities whose
        ``_entity.type`` is ``"branched"``; ``0.0`` when there are none.

    Raises
    ------
    KeyError
        If the file lacks ``_entity.type`` or
        ``_entity.pdbx_number_of_molecules``.
    ValueError
        If a branched entity's molecule count cannot be converted to float,
        or if the two items have different lengths.
    """
    mmcif_dict = MMCIF2Dict(fileName)

    # Only the two columns the result depends on are loaded.
    entity_df = pd.DataFrame(data={
        "type": mmcif_dict['_entity.type'],
        "number_of_molecules": mmcif_dict['_entity.pdbx_number_of_molecules'],
    })

    # Select and convert in a single expression instead of assigning a column
    # on a filtered slice, which triggers pandas' SettingWithCopyWarning.
    is_branched = entity_df['type'] == "branched"
    return entity_df.loc[is_branched, 'number_of_molecules'].astype(float).sum()

# Work from the directory holding the unzipped files; presumably the
# filenames listed in the entries file are relative to it -- TODO confirm.
os.chdir("/home/douglas/carboanalysis/data/unzipped")

# One entry filename per line, no header row.
df = pd.read_csv(
    "/home/douglas/carboanalysis/carboanalysis/pdb/dataframes/SIC/SIC2023_carbo_entrys_res_owab_filtered.txt",
    names = ['entry_filename'],
)
fileNames = df['entry_filename'].values

# Add up the per-file counts, with a progress bar over the file list.
total_count = sum(oligo_count(entry_file) for entry_file in tqdm(fileNames))
print(total_count)

100%|██████████| 6622/6622 [23:34<00:00,  4.68it/s]  

8250.0





In [10]:
# Load the rows previously appended to ramifications.csv. The file has no
# header row, so the column names are supplied here.
import pandas as pd
ramification_df = pd.read_csv(
    '/home/douglas/carboanalysis/carboanalysis/pdb/dataframes/ramifications.csv',
    header=None,
    names=['id', 'type', 'description', 'number_of_molecules', 'entry'],
)

In [None]:
# Branched vs. linear: total number of molecules across the ramified rows.
molecule_counts = ramification_df['number_of_molecules'].astype(float)
ramification_df['number_of_molecules'] = molecule_counts
print(molecule_counts.sum())

In [None]:
# Collect the glycosidic linkage data
def find_linkages(fileName, output_csv="/home/douglas/carboanalysis/carboanalysis/pdb/dataframes/SIC/all_linkages_v2.csv"):
    """Append the branch-link records of one mmCIF file to a ';'-separated CSV.

    Reads the ``_pdbx_entity_branch_link`` category of ``fileName``, adds the
    entry id and the number of molecules of each link's entity, and appends
    the rows (no header, no index) to ``output_csv``.

    Written columns, in order: entry_id, link_id, entity_id, branch_1_id,
    comp_1_id, atom_1_id, leaving_atom_1_id, branch_2_id, comp_2_id,
    atom_2_id, leaving_atom_2_id, order, num_of_molecules.

    Parameters
    ----------
    fileName : str
        Path of the mmCIF file to parse.
    output_csv : str, optional
        CSV file the rows are appended to. Defaults to the path the notebook
        has always used.

    Returns
    -------
    None
        Always. A file for which a required item is missing (``KeyError``) or
        whose values cannot be assembled/converted (``ValueError``) is skipped
        silently and nothing is written for it.
    """
    # Build a dictionary from the .cif file
    mmcif_dict = MMCIF2Dict(fileName)

    try:
        print('Linking: ' + fileName)

        # Number of molecules declared for each entity id. Values are kept as
        # read and converted to int only for entities that own a link, so an
        # unparsable count on an unrelated entity does not discard the file.
        molecules_by_entity = dict(zip(mmcif_dict['_entity.id'],
                                       mmcif_dict['_entity.pdbx_number_of_molecules']))

        # The entry id is a single value while every link column has one value
        # per link. Passing it as a one-element list makes pandas reject the
        # frame whenever there is more than one link, so unwrap it and let
        # pandas broadcast the scalar over all rows.
        entry_id = mmcif_dict['_entry.id']
        if isinstance(entry_id, (list, tuple)):
            entry_id = entry_id[0]

        # Collect the linkage records
        linkage_df = pd.DataFrame(data={
            "entry_id": entry_id,
            "link_id": mmcif_dict['_pdbx_entity_branch_link.link_id'],
            "entity_id": mmcif_dict['_pdbx_entity_branch_link.entity_id'],
            "branch_1_id": mmcif_dict['_pdbx_entity_branch_link.entity_branch_list_num_1'],
            "comp_1_id": mmcif_dict["_pdbx_entity_branch_link.comp_id_1"],
            "atom_1_id": mmcif_dict['_pdbx_entity_branch_link.atom_id_1'],
            "leaving_atom_1_id": mmcif_dict['_pdbx_entity_branch_link.leaving_atom_id_1'],
            "branch_2_id": mmcif_dict['_pdbx_entity_branch_link.entity_branch_list_num_2'],
            "comp_2_id": mmcif_dict["_pdbx_entity_branch_link.comp_id_2"],
            "atom_2_id": mmcif_dict['_pdbx_entity_branch_link.atom_id_2'],
            "leaving_atom_2_id": mmcif_dict['_pdbx_entity_branch_link.leaving_atom_id_2'],
            "order": mmcif_dict['_pdbx_entity_branch_link.value_order'],
        })

        # Take the number of molecules of each link's entity into account
        linkage_df['num_of_molecules'] = [
            int(molecules_by_entity[entity_id]) for entity_id in linkage_df['entity_id']
        ]

        # Append the linkage rows to the .csv file
        linkage_df.to_csv(path_or_buf=output_csv, mode='a', index=False, header=False, sep=";")

    except (KeyError, ValueError):
        # Deliberate best effort: files without the required categories (or
        # with unusable values) are skipped instead of aborting a batch run.
        return None
