In [15]:
import os
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from collections import Counter

DATABASE_DIR = './alpha-database'

aminoacids = [
    'ALA',
    'ARG',
    'ASN',
    'ASP',
    'CYS',
    'GLN',
    'GLU',
    'GLY',
    'HIS',
    'LEU',
    'ILE',
    'LYS',
    'MET',
    'PHE',
    'PRO',
    'SER',
    'THR',
    'TRP',
    'TYR',
    'VAL',
]

nucleotides = [
    'A',
    'U',
    'C',
    'G',
]

In [16]:
def getChainIdsAlpha(lines):
  chainIds = []
  for l in lines:
    lineInfos = l.split()
    if lineInfos[6] not in chainIds:
      chainIds.append(lineInfos[6])

  return chainIds

def getProteinChainAlpha(chainId, lines):
  atoms = {}
  for l in lines:
    lineInfos = l.split()
    if lineInfos[6] == chainId and lineInfos[3] == 'CA':
      resSeq = int(lineInfos[8])
      x = float(lineInfos[10])
      y = float(lineInfos[11])
      z = float(lineInfos[12])
      atoms[resSeq] = {
          'name': lineInfos[5],
          'position': (x, y, z),
      }
  return atoms

def getRNAChainAlpha(chainId, lines):
  atoms = {}
  nucleotides = ['A', 'U', 'C', 'G']
  for l in lines:
    lineInfos = l.split()
    if lineInfos[6] == chainId and lineInfos[5] in nucleotides:
      resSeq = int(lineInfos[8])
      x = float(lineInfos[10])
      y = float(lineInfos[11])
      z = float(lineInfos[12])
      atoms[resSeq] = {
          'name': lineInfos[5],
          'position': (x, y, z),
      }
  return atoms

def getProteinAminoacidsAlpha(chainId, lines):
  atoms = {}
  for l in lines:
    lineInfos = l.split()
    if lineInfos[6] == chainId and lineInfos[3] == 'CA':
      resSeq = int(lineInfos[8])
      atoms[resSeq] = lineInfos[5]
  return atoms

def getRNANucleotidesAlpha(chainId, lines):
  atoms = {}
  nucleotides = ['A', 'U', 'C', 'G']
  for l in lines:
    lineInfos = l.split()
    if lineInfos[6] == chainId and lineInfos[5] in nucleotides:
      resSeq = int(lineInfos[8])
      atoms[resSeq] = lineInfos[5]
  return atoms

def dist(a1, a2):
  p1 = a1['position']
  p2 = a2['position']
  x = 0
  y = 1
  z = 2
    
  return math.sqrt((p1[x] - p2[x])**2 + (p1[y] - p2[y])**2 + (p1[z] - p2[z])**2)

def distMap(proteinAtoms, RNAAtoms):
  m = []
  i = 0
  for a1 in proteinAtoms:
    for a2 in RNAAtoms:
      distance = dist(proteinAtoms[a1], RNAAtoms[a2])

      if distance < 5:
        m.append({
            'from': proteinAtoms[a1]['name'],
            'to': RNAAtoms[a2]['name'],
            'dist': distance
        })
        i+=1
          
  return m

def getList(aminoacid):
  resp = []
  for i in interactions:
    if i['from'] == aminoacid:
      resp.append(i['to'])

  return resp

In [17]:
def readAlphaFiles():
  proteinChains = {}
  RNAChains = {}
    
  for dir_name in os.listdir(DATABASE_DIR):
    if dir_name != '.DS_Store' and dir_name != 'cleaner.py':
        aux = 0
        for filename in os.listdir(os.path.join(DATABASE_DIR, dir_name)):
            with open(os.path.join(DATABASE_DIR, dir_name, filename), 'r') as fr:
              lines = fr.readlines()
              structCode = f'{filename[5:9]}-{aux}'
              aux += 1
              chainIDs = getChainIdsAlpha(lines)
              if len(chainIDs) == 2:
                for chainID in chainIDs:
                  proteinChain = getProteinChainAlpha(chainID, lines)
                  if proteinChain != {}:
                    proteinChains[structCode] = proteinChain
                      
                  RNAChain = getRNAChainAlpha(chainID, lines)
                  if RNAChain != {}:
                    RNAChains[structCode] = RNAChain

  return proteinChains, RNAChains

proteinChainsAlpha, RNAChainsAlpha = readAlphaFiles()

In [18]:
interactionsAlpha = {}
all_interactions = []

for key, values in proteinChainsAlpha.items():
  d = distMap(proteinChainsAlpha[key], RNAChainsAlpha[key])
  interactionsAlpha[key] = d

  for val in d:
    all_interactions.append(val)

In [19]:
aminoacid_interaction_stats = {}

for val in all_interactions:
  aminoacid_interaction_stats[val['from']] = getList(val['from'])

frequency = getAminoacidFreq(aminoacid_interaction_stats)
frequency_df = mountFrequencyDf(frequency)
frequency_df.sort_values(by=['Aminoacid'])

NameError: name 'interactions' is not defined

In [None]:
def plotNucleotideFrequency(frequency_df):
  for indices, row in frequency_df.iterrows():
    fig1, ax1 = plt.subplots()
    plt.title(row['Aminoacid'])
    ax1.pie([
            row[nucleotides[0]],
            row[nucleotides[1]],
            row[nucleotides[2]],
            row[nucleotides[3]]
          ],
          labels=nucleotides,
          autopct='%1.1f%%')

    ax1.axis('equal')
      
    plt.savefig(f"imgs/general/db-alpha-nucleotides-frequency/{row['Aminoacid']}.png", bbox_inches='tight')
      
plotNucleotideFrequency(frequency_df)

In [None]:
nucleotides_interaction_stats = {}

for val in all_interactions:
  nucleotides_interaction_stats[val['to']] = getListNumclAmin(val['to'])

frequencyNumclAmin = getNucleotideFreq(nucleotides_interaction_stats)
frequency_df = mountNucleotideFrequencyDf(frequencyNumclAmin)
frequency_df.sort_values(by=['Nucleotide'])

In [None]:
def plotNucleotideFreq(frequency_df, img_dir = 'db-aminoacids-frequency'):
  for indices, row in frequency_df.iterrows():
    fig1, ax1 = plt.subplots()
    plt.title(row['Nucleotide'])
    ax1.pie([
            row[aminoacids[0]],
            row[aminoacids[1]],
            row[aminoacids[2]],
            row[aminoacids[3]],
            row[aminoacids[4]],
            row[aminoacids[5]],
            row[aminoacids[6]],
            row[aminoacids[7]],
            row[aminoacids[8]],
            row[aminoacids[9]],
            row[aminoacids[10]],
            row[aminoacids[11]],
            row[aminoacids[12]],
            row[aminoacids[13]],
            row[aminoacids[14]],
            row[aminoacids[15]],
            row[aminoacids[16]],
            row[aminoacids[17]],
            row[aminoacids[18]],
            row[aminoacids[19]]
          ],
          labels=aminoacids,
          autopct='%1.1f%%')

    ax1.axis('equal')
        
    plt.savefig(f"imgs/general/db-alpha-aminoacids-frequency/{row['Nucleotide']}.png", bbox_inches='tight')

plotNucleotideFreq(frequency_df)

In [None]:
def plotHeatMap(df, file_dir):
    ax = sns.heatmap(df, annot=True, fmt="g", xticklabels=True, yticklabels=True)

    ax.set_xticklabels(ax.get_xticklabels(), fontsize=9)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=9)

    plt.savefig(file_dir)
    plt.clf()

In [None]:
for aminoacid, interactions in interactionsPDB.items():
    if interactions:
        df = pd.DataFrame(0, aminoacids, columns=nucleotides)
        
        for interaction in interactions:
            df.loc[interaction['from'],interaction['to']] = df.loc[interaction['from'],interaction['to']] + 1
        
        plotHeatMap(df, f"imgs/db-pdb/{aminoacid}.png")

In [None]:
for aminoacid, interactions in interactionsAlpha.items():
    if interactions:
        df = pd.DataFrame(0, aminoacids, columns=nucleotides)
        aminoacid_name = aminoacid.split('-')
        
        for interaction in interactions:
            df.loc[interaction['from'],interaction['to']] = df.loc[interaction['from'],interaction['to']] + 1

        file_dir = f"imgs/db-alpha/{aminoacid_name[0]}"
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        
        plotHeatMap(df, f"{file_dir}/{aminoacid_name[1]}.png")