In [36]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
def getChainIds(lines):
  chainIds = []
  for l in lines:
    if l[21] not in chainIds:
      chainIds.append(l[21])

  return chainIds

def getProteinChain(chainId, lines):
  atoms = {}
  for l in lines:
    if l[21] == chainId and l[13:15] == 'CA':
      resSeq = int(l[22:26])
      x = float(l[30:38])
      y = float(l[39:46])
      z = float(l[47:54])
      atoms[resSeq] = (x, y, z)
  return atoms

def getRNAChain(chainId, lines):
  atoms = {}
  nucleotides = ['A', 'U', 'C', 'G']
  for l in lines:
    if l[21] == chainId and l[18:20].strip() in nucleotides:
      resSeq = int(l[22:26])
      x = float(l[30:38])
      y = float(l[39:46])
      z = float(l[47:54])
      atoms[resSeq] = (x, y, z)
  return atoms

In [38]:
import os

DATABASE_DIR = '/content/drive/MyDrive/structures'

def readFiles():
  proteinChains = {}
  RNAChains = {}
  for filename in os.listdir(DATABASE_DIR):
    with open(f'{DATABASE_DIR}/{filename}', 'r') as fr:
      lines = fr.readlines()
      structCode = filename[:4]
      chainIDs = getChainIds(lines)
      if len(chainIDs) == 2:
        for chainID in chainIDs:
          proteinChain = getProteinChain(chainID, lines)
          if proteinChain != {}:
            proteinChains[structCode] = proteinChain
          RNAChain = getRNAChain(chainID, lines)
          if RNAChain != {}:
            RNAChains[structCode] = RNAChain

  return proteinChains, RNAChains

proteinChains, RNAChains = readFiles()

In [39]:
import pandas as pd

plot_rna_data = []

for key, values in RNAChains.items():
  plot_rna_data.append([key, len(values)])


rna_df = pd.DataFrame(plot_rna_data, columns=['Structure', 'Number of nucleotides'])
print(rna_df)

    Structure  Number of nucleotides
0        1a4t                     15
1        1biv                     28
2        1aud                     30
3        1exy                     33
4        1emi                    161
..        ...                    ...
193      8fti                     97
194      8e28                     15
195      8e2a                     15
196      8as6                     14
197      8acc                      5

[198 rows x 2 columns]


In [40]:
plot_protein_data = []

for key, values in proteinChains.items():
  plot_protein_data.append([key, len(values)])

protein_df = pd.DataFrame(plot_protein_data, columns=['Structure', 'Number of amino acids'])
print(protein_df)

    Structure  Number of amino acids
0        1a4t                     19
1        1biv                     17
2        1aud                    101
3        1exy                     16
4        1emi                    136
..        ...                    ...
193      8fti                    737
194      8e28                    690
195      8e2a                    686
196      8as6                   1566
197      8acc                    231

[198 rows x 2 columns]


In [41]:
df = pd.merge(protein_df, rna_df, on = "Structure")

df

Unnamed: 0,Structure,Number of amino acids,Number of nucleotides
0,1a4t,19,15
1,1biv,17,28
2,1aud,101,30
3,1exy,16,33
4,1emi,136,161
...,...,...,...
193,8fti,737,97
194,8e28,690,15
195,8e2a,686,15
196,8as6,1566,14


In [47]:
import numpy as np
from google.colab import autoviz

def aminoacids_scatter_plot(df, x_colname, y_colname, figscale=1, alpha=.8):
  from matplotlib import pyplot as plt
  plt.figure(figsize=(6 * figscale, 6 * figscale))
  df.plot(kind='scatter', x=x_colname, y=y_colname, s=(32 * figscale), alpha=alpha)
  plt.gca().spines[['top', 'right',]].set_visible(False)
  plt.tight_layout()
  return autoviz.MplChart.from_current_mpl_state()

chart = aminoacids_scatter_plot(_df_3, *['index', 'Number of amino acids'], **{})
chart

<Figure size 600x600 with 0 Axes>

In [48]:
import numpy as np
from google.colab import autoviz

def aminoacids_histogram(df, colname, num_bins=20, figscale=1):
  from matplotlib import pyplot as plt
  df[colname].plot(kind='hist', bins=num_bins, title=colname, figsize=(8*figscale, 4*figscale))
  plt.gca().spines[['top', 'right',]].set_visible(False)
  plt.tight_layout()
  return autoviz.MplChart.from_current_mpl_state()

chart = aminoacids_histogram(_df_1, *['Number of amino acids'], **{})
chart

In [49]:
import numpy as np
from google.colab import autoviz

def nucleotides_scatter_plot(df, x_colname, y_colname, figscale=1, alpha=.8):
  from matplotlib import pyplot as plt
  plt.figure(figsize=(6 * figscale, 6 * figscale))
  df.plot(kind='scatter', x=x_colname, y=y_colname, s=(32 * figscale), alpha=alpha)
  plt.gca().spines[['top', 'right',]].set_visible(False)
  plt.tight_layout()
  return autoviz.MplChart.from_current_mpl_state()

chart = nucleotides_scatter_plot(_df_3, *['index', 'Number of nucleotides'], **{})
chart

<Figure size 600x600 with 0 Axes>

In [50]:
import numpy as np
from google.colab import autoviz

def nucleotides_histogram(df, colname, num_bins=20, figscale=1):
  from matplotlib import pyplot as plt
  df[colname].plot(kind='hist', bins=num_bins, title=colname, figsize=(8*figscale, 4*figscale))
  plt.gca().spines[['top', 'right',]].set_visible(False)
  plt.tight_layout()
  return autoviz.MplChart.from_current_mpl_state()

chart = nucleotides_histogram(_df_2, *['Number of nucleotides'], **{})
chart

In [46]:
import numpy as np
from google.colab import autoviz

def scatter_plot(df, x_colname, y_colname, figscale=1, alpha=.8):
  from matplotlib import pyplot as plt
  plt.figure(figsize=(6 * figscale, 6 * figscale))
  df.plot(kind='scatter', x=x_colname, y=y_colname, s=(32 * figscale), alpha=alpha)
  plt.gca().spines[['top', 'right',]].set_visible(False)
  plt.tight_layout()
  return autoviz.MplChart.from_current_mpl_state()

chart = scatter_plot(_df_4, *['Number of amino acids', 'Number of nucleotides'], **{})
chart

<Figure size 600x600 with 0 Axes>