In [43]:
# Mount Google Drive at /content/drive so later cells can read files stored
# there (the structure directory used below lives under this mount point).
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [89]:
def getChainIds(lines):
  """Return the distinct chain identifiers found in PDB coordinate records.

  Args:
    lines: iterable of PDB-format text lines.

  Returns:
    List of one-character chain IDs (PDB column 22, i.e. index 21) in order
    of first appearance. Only ATOM/HETATM records are inspected.
  """
  chainIds = []
  for l in lines:
    # Column 22 is a chain ID only on coordinate records; on header/remark
    # lines it holds unrelated text, and short lines such as "END" or a
    # trailing blank line would raise IndexError on l[21].
    if len(l) <= 21 or not l.startswith(('ATOM', 'HETATM')):
      continue
    if l[21] not in chainIds:
      chainIds.append(l[21])

  return chainIds

def getProteinChain(chainId, lines):
  """Collect the coordinates of the CA atoms belonging to one chain.

  Args:
    chainId: one-character chain identifier to select (PDB column 22).
    lines: iterable of PDB-format text lines.

  Returns:
    Dict mapping residue sequence number (int) to an (x, y, z) tuple of
    floats. When several matching records share a residue number, the last
    one read overwrites the earlier ones.

  Raises:
    ValueError: if a matching record has a non-numeric residue number or
      coordinate field.
  """
  atoms = {}
  for l in lines:
    # Only coordinate records have the fixed-width layout parsed below;
    # short lines would also fail on l[21].
    if len(l) <= 21 or not l.startswith(('ATOM', 'HETATM')):
      continue
    if l[21] == chainId and l[13:15] == 'CA':
      resSeq = int(l[22:26])
      # x, y, z are three adjacent 8-character fields (1-based columns
      # 31-38, 39-46, 47-54). Starting y/z one character late would drop a
      # leading '-' or digit of values that fill the whole field.
      x = float(l[30:38])
      y = float(l[38:46])
      z = float(l[46:54])
      atoms[resSeq] = (x, y, z)
  return atoms

def getRNAChain(chainId, lines):
  """Collect one coordinate triple per nucleotide residue of one chain.

  Args:
    chainId: one-character chain identifier to select (PDB column 22).
    lines: iterable of PDB-format text lines.

  Returns:
    Dict mapping residue sequence number (int) to an (x, y, z) tuple of
    floats, for residues named A, U, C or G. No atom-name filter is applied,
    so every atom of a residue overwrites the previous one and the stored
    coordinates are those of the last atom record read for that residue.
    NOTE(review): if a specific representative atom was intended, add an
    atom-name test here — confirm with the analysis that consumes this.

  Raises:
    ValueError: if a matching record has a non-numeric residue number or
      coordinate field.
  """
  atoms = {}
  nucleotides = ('A', 'U', 'C', 'G')
  for l in lines:
    # Only coordinate records have the fixed-width layout parsed below;
    # short lines would also fail on l[21].
    if len(l) <= 21 or not l.startswith(('ATOM', 'HETATM')):
      continue
    # The residue name occupies 1-based columns 18-20; read the whole field
    # so the one-letter name matches however it is padded.
    if l[21] == chainId and l[17:20].strip() in nucleotides:
      resSeq = int(l[22:26])
      # x, y, z are three adjacent 8-character fields (1-based columns
      # 31-38, 39-46, 47-54). Starting y/z one character late would drop a
      # leading '-' or digit of values that fill the whole field.
      x = float(l[30:38])
      y = float(l[38:46])
      z = float(l[46:54])
      atoms[resSeq] = (x, y, z)
  return atoms

In [90]:
import os

# Directory on the mounted Drive whose files readFiles() lists and parses.
DATABASE_DIR = '/content/drive/MyDrive/structures'

def readFiles(database_dir=None):
  """Parse every two-chain structure file found in a directory.

  Args:
    database_dir: directory containing the structure files. Defaults to the
      module-level DATABASE_DIR when omitted, which keeps readFiles()
      working exactly as before.

  Returns:
    Tuple (proteinChains, RNAChains). Each is a dict keyed by the first four
    characters of the file name, holding the non-empty result of
    getProteinChain / getRNAChain for that file. Files whose chain count is
    not exactly 2 are ignored. If both chains of a file yield a non-empty
    result of the same kind, the later chain replaces the earlier one under
    the same key.
  """
  if database_dir is None:
    database_dir = DATABASE_DIR

  proteinChains = {}
  RNAChains = {}
  for filename in os.listdir(database_dir):
    path = os.path.join(database_dir, filename)
    # os.listdir also returns sub-directories; open() on one would raise.
    if not os.path.isfile(path):
      continue

    # Read everything first so the file is closed before parsing starts.
    with open(path, 'r') as fr:
      lines = fr.readlines()

    structCode = filename[:4]
    chainIDs = getChainIds(lines)
    if len(chainIDs) != 2:
      continue

    for chainID in chainIDs:
      proteinChain = getProteinChain(chainID, lines)
      if proteinChain:
        proteinChains[structCode] = proteinChain
      RNAChain = getRNAChain(chainID, lines)
      if RNAChain:
        RNAChains[structCode] = RNAChain

  return proteinChains, RNAChains

# Parse the structure directory once; both dicts are keyed by the 4-character
# structure code taken from each file name.
proteinChains, RNAChains = readFiles()

In [86]:
import pandas as pd

# One [structure code, number of stored residues] row per parsed RNA chain.
plot_rna_data = [[structure, len(chain)] for structure, chain in RNAChains.items()]

rna_df = pd.DataFrame(
  data=plot_rna_data,
  columns=['Structure', 'Number of nucleotides'],
)
print(rna_df)

    Structure  Number of nucleotides
0        1ekz                     30
1        1zbn                     28
2        1x1l                    130
3        1ull                     35
4        1rmv                      3
..        ...                    ...
193      7zap                     28
194      7yyn                     55
195      7yym                     55
196      7yn9                     38
197      7zpi                     55

[198 rows x 2 columns]


In [88]:
# One [structure code, number of stored residues] row per parsed protein chain.
plot_protein_data = [[structure, len(chain)] for structure, chain in proteinChains.items()]

protein_df = pd.DataFrame(
  data=plot_protein_data,
  columns=['Structure', 'Number of amino acids'],
)
print(protein_df)

    Structure  Number of amino acids
0        1ekz                     76
1        1zbn                     17
2        1x1l                    292
3        1ull                     17
4        1rmv                    156
..        ...                    ...
193      7zap                     97
194      7yyn                    705
195      7yym                   1281
196      7yn9                   1514
197      7zpi                    705

[198 rows x 2 columns]
