# -*- coding: utf-8 -*-
# Created by Arunabh Athreya, December 9, 2024

# This script performs the following analyses:
    1. Membrane-proximity trajectory of the protein's centre of mass (COM) with respect to the lipid bilayer.
    2. Contribution of individual residues to the protein-bilayer interaction.
    3. Specificity of the interaction between each residue and each type of lipid.
    4. Magnitude of penetration into the bilayer at a global level.


In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so that files stored there can be opened by path
drive.mount('/content/drive')

# Function to get the file path from the user
def get_file_path():
    """Prompt repeatedly until the user enters the path of an existing file.

    Returns:
        str: The entered path with surrounding whitespace removed.
    """
    while True:
        # Strip the entry: a stray space or newline pasted with the path would
        # otherwise make a valid path look missing.
        file_path = input("Enter the path to your file in Google Drive (e.g., /content/drive/MyDrive/your_file.txt): ").strip()
        # isfile() rather than exists(): a directory also "exists", but the
        # result is meant to be opened as a file, which fails for a directory.
        if os.path.isfile(file_path):
            return file_path
        print("File not found. Please enter a valid path.")

# Ask the user for the path of the input file (re-prompts until the path exists)
file_path = get_file_path()

# Read the whole file into memory as text.
# NOTE(review): file_contents is not used by the cells visible below, which open
# the trajectory through the separately entered `name` -- confirm whether this
# read is still needed.
with open(file_path, 'r') as f:
  file_contents = f.read()
  # Placeholder: process file_contents here

In [27]:
# Comma-separated lipid IDs as they appear in the PDB trajectory; the string is
# split on "," further down to build the list of lipid types.
b=input("""enter the IDs of the lipids that were used in the simulation
and is seen in the PDB trajectory. For example, \n POPE,POPC,POPG,PMCL \n input:\n""")
# Name (path) of the PDB trajectory file; it is opened exactly as entered.
name=input("Enter filename:\t")

enter the IDs of the lipids that were used in the simulation
and is seen in the PDB trajectory. For example, 
 POPE,POPC,POPG,PMCL 
 input:
POPE,POPG,PMCL
Enter filename:	t5.pdb


In [29]:
import numpy as np
import math
from scipy import stats

In [30]:
"""
Block 1:
    This block identifies the residues in the protein molecule (and in the lipids)
    and counts their atoms; the coordinates are added to numpy arrays in later
    blocks. The residue scan runs on only the first frame, stopping at the first
    ENDMDL record; the frames of the trajectory are then counted.
"""
# Scan the first frame (all records up to the first ENDMDL) to list the residue
# numbers of the protein (chain A) and of the lipids and to count their atoms,
# then count the MODEL records of the whole file to get the number of frames.
reslist=[]  #residue numbers (as strings) of all residues in the protein.
prot_atoms=0
lipid_atoms=0
lipidcount=0
nframes=0   #frames of trajectory
# Read inside a context manager so the file handle is closed as soon as the
# lines are in memory.
with open(str(name),'r') as pdb_file:
    content=pdb_file.readlines()
# Strip whitespace around each ID and drop empty entries: an empty string (e.g.
# from a trailing comma) is a substring of every line and would turn every atom
# into a lipid atom.
lipids=[lipid_id.strip() for lipid_id in b.split(",") if lipid_id.strip()]
lipidlist=[]    #all residue numbers (as strings) assigned to lipids
for line in content:
    if line.startswith("MODEL"):
        print(line)
    elif line.startswith("ATOM"):
        resnum=line[22:26].strip()
        if line[21:22] == "A":  #protein atoms are taken to be those of chain A
            prot_atoms=prot_atoms+1
            if resnum not in reslist:
                reslist.append(resnum)
        # Count each record once, even if more than one lipid ID occurs in it,
        # so that lipid_atoms equals the number of lipid atom records.
        # NOTE(review): the ID is searched for in the whole line, not only in
        # the residue-name field -- confirm no other field can contain it.
        if any(lipid_id in line for lipid_id in lipids):
            lipid_atoms=lipid_atoms+1
            if resnum not in lipidlist:
                lipidlist.append(resnum)
    elif line.startswith("ENDMDL"):
        break   #only the first frame is needed for the residue/atom inventory
print("Number of protein atoms are: ", prot_atoms,'\n')
print("number of lipid atoms are: ",lipid_atoms,'\n')
# Every MODEL record marks the start of one frame.
for line in content:
    if line.startswith("MODEL"):
        nframes=nframes+1
        print("processed frame no.",nframes,'... \n')

MODEL        1

Number of protein atoms are:  6852 

number of lipid atoms are:  41856 

processed frame no. 1 ... 

processed frame no. 2 ... 

processed frame no. 3 ... 

processed frame no. 4 ... 

processed frame no. 5 ... 

processed frame no. 6 ... 

processed frame no. 7 ... 

processed frame no. 8 ... 

processed frame no. 9 ... 

processed frame no. 10 ... 

processed frame no. 11 ... 

processed frame no. 12 ... 

processed frame no. 13 ... 

processed frame no. 14 ... 

processed frame no. 15 ... 

processed frame no. 16 ... 

processed frame no. 17 ... 

processed frame no. 18 ... 

processed frame no. 19 ... 

processed frame no. 20 ... 

processed frame no. 21 ... 

processed frame no. 22 ... 

processed frame no. 23 ... 

processed frame no. 24 ... 

processed frame no. 25 ... 

processed frame no. 26 ... 

processed frame no. 27 ... 

processed frame no. 28 ... 

processed frame no. 29 ... 

processed frame no. 30 ... 

processed frame no. 31 ... 

processed frame no. 3



    The arrays will have the following columns:
    column    class
    ---

    0         frame no.
    1         atom no.
    2         x-coord
    3         y-coord
    4         z-coord
    5         residue number
    6         atom-ID
    7         lipid type (lipid_array only)
    8         lipid_array only: whether the atom is a phosphorus atom (1) or not (0)

In [31]:
# Allocate one object-dtype table per molecule class, indexed as
# (frame, atom, column). Object arrays start out filled with None.
# Protein rows have 7 columns, lipid rows 9.
prot_array=np.empty((nframes,prot_atoms,7),dtype=object)
print(f"creating array in {prot_array.ndim} dimensions for protein atoms.")
print(prot_array.shape)

lipid_array=np.empty((nframes,lipid_atoms,9),dtype=object)
print(f"creating array in {lipid_array.ndim} dimensions for lipids.")
print(lipid_array.shape)

creating array in 3 dimensions for protein atoms.
(76, 6852, 7)
creating array in 3 dimensions for lipids.
(76, 41856, 9)



This block does the following:
1.   Adds the coordinates of all protein atoms to prot_array.
2.   Adds the coordinates of all lipid atoms to lipid_array.
3.   Records, for every row of lipid_array, whether it belongs to a phosphorus atom.

It also stores the frame number in every row, so that frames can still be told
apart later, when filtering the array for P atoms reduces its dimensions.

In [32]:
# Copy every ATOM record into prot_array / lipid_array: one slab per MODEL, one
# row per atom. Atoms are classified by the same criteria used when the atoms
# were counted (chain A for protein, a lipid ID occurring in the line for
# lipids). Classifying by residue number instead would misassign atoms whenever
# a lipid and a protein residue share a number.
for line in content:
    # startswith(): only a real MODEL record starts a frame; other records that
    # merely contain the text "MODEL" must not reset the counters.
    if line.startswith("MODEL"):
        rcount = 0  #next free protein row in this frame
        lcount = 0  #next free lipid row in this frame
        model=int(line[6:14].strip())
        print(model)
    if line.startswith("ATOM"):
        if line[21:22] == "A":
            prot_row=prot_array[model-1,rcount]     #view: writes go into prot_array
            prot_row[0]=model                   #frame no.
            prot_row[1]=line[5:11].strip()      #atom no.
            prot_row[2]=line[27:38].strip()     #x-coord
            prot_row[3]=line[38:46].strip()     #y-coord
            prot_row[4]=line[46:54].strip()     #z-coord
            prot_row[5]=line[22:26].strip()     #residue number
            prot_row[6]=line[11:16].strip()     #atom-ID
            rcount = rcount + 1
        matched=[lipid for lipid in lipids if lipid in line]
        if matched:
            lipid_row=lipid_array[model-1,lcount]   #view: writes go into lipid_array
            lipid_row[0]=model
            lipid_row[1]=line[5:11].strip()
            lipid_row[2]=line[27:38].strip()
            lipid_row[3]=line[38:46].strip()
            lipid_row[4]=line[46:54].strip()
            lipid_row[5]=line[22:26].strip()
            lipid_row[6]=line[11:16].strip()
            lipid_row[7]=matched[-1]    #lipid type (last matching ID, as before)
            # Phosphorus marker, now written for every row (True/False) instead
            # of leaving non-phosphorus rows as None.
            lipid_row[8]='P' in line[12:17].strip()
            lcount = lcount + 1
print(prot_array)
print(lipid_array)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
[[[1 '1' '56.090' ... '161.907' '1' 'N']
  [1 '2' '56.520' ... '161.417' '1' 'HT1']
  [1 '3' '55.260' ... '162.397' '1' 'HT2']
  ...
  [1 '6850' '66.350' ... '163.467' '415' 'C']
  [1 '6851' '67.020' ... '164.507' '415' 'OT1']
  [1 '6852' '66.660' ... '162.467' '415' 'OT2']]

 [[2 '1' '48.450' ... '167.278' '1' 'N']
  [2 '2' '48.050' ... '167.158' '1' 'HT1']
  [2 '3' '47.930' ... '167.938' '1' 'HT2']
  ...
  [2 '6850' '69.980' ... '161.738' '415' 'C']
  [2 '6851' '70.740' ... '161.568' '415' 'OT1']
  [2 '6852' '70.260' ... '161.458' '415' 'OT2']]

 [[3 '1' '58.730' ... '168.559' '1' 'N']
  [3 '2' '58.160' ... '168.119' '1' 'HT1']
  [3 '3' '58.250' ... '168.549' '1' 'HT2']
  ...
  [3 '6850' '78.480' ... '160.239' '415' 'C']
  [3 '6851' '79.350' ... '160.589' '415' 'OT1']


This block has two functions. The first is to determine the limits of the bilayer along the Z-axis, as follows:

1.   Find the minimum and maximum Z-coordinate of the lipid atoms
2.   Repeat for every frame

This lets the later steps calculate protein-lipid interactions only in frames where at least one protein atom is near the boundary of the lipid bilayer, saving computational time.

The second part of the code block generates a distance matrix (diff_matrix) holding the distance between every protein atom near the bilayer boundary and every lipid atom, for every frame.






In [None]:
"""Iterate over all frames to find interacting atoms (between protein and lipid,
not lipid-lipid or intraprotein), and cumulatively store the distance matrix
(diff_matrix) of each frame in a list.
"""
# STEP-BY-STEP EXPLANATION:
# 1. Reading individual frames:
# all_diff_matrices: list holding one protein-lipid distance matrix per frame,
#   in frame order (an empty matrix for frames without candidate protein atoms).
# lipid_count: dictionary with the cumulative number of contacts per lipid type.
all_diff_matrices = []
lipid_count = {}
contact_cutoff = 4  # Å; used for the z pre-filter and for the contact test
for frame in range(1,nframes+1):
    print(f"Processing frame {frame}...")
    # Rows tagged with `frame` in column 0 were all written to slab frame-1, so
    # only that slab is searched instead of masking the whole 3-D array. The
    # comparison still drops rows that were never filled (column 0 is None).
    frame_lipids = lipid_array[frame-1]
    frame_protein = prot_array[frame-1]
    lipid_subarray = frame_lipids[frame_lipids[:,0] == frame] #isolated all lipid data for one frame
    protein_subarray = frame_protein[frame_protein[:,0] == frame] #isolated all protein data for one frame
    zcoord = lipid_subarray[:,4].astype(float) #get z-coordinates list for lipid array
    zmax = np.max(zcoord)
    zmin = np.min(zcoord)
    print(f"ZMax and Zmin are: '{zmax}','{zmin}'")
    # Keep protein atoms whose z-coordinate (5th column) lies within the cutoff
    # of either bilayer limit.
    # NOTE(review): atoms lying deeper inside the bilayer than the cutoff
    # (zmin + cutoff <= z <= zmax - cutoff) are not selected -- confirm this is
    # intended for the penetration analysis.
    prot_z = protein_subarray[:,4].astype(float)
    diffa=np.abs(prot_z-zmin)
    diffb=np.abs(prot_z-zmax)
    diff_filter=protein_subarray[np.logical_or(diffa < contact_cutoff, diffb < contact_cutoff)]
    print(f"There are {len(diff_filter)} protein atoms close to the bilayer in frame {frame}.")
    print("-"*20)
    if len(diff_filter) > 0:
        px = diff_filter[:, 2].astype(float) #Protein dataset's 3rd column, i.e., the X-coord.
        py = diff_filter[:, 3].astype(float) #Y-coord
        pz = diff_filter[:, 4].astype(float) #Z-coord
        lx = lipid_subarray[:, 2].astype(float) #lipid atom's X-coord
        ly = lipid_subarray[:, 3].astype(float) #lipid atom's Y-coord
        lz = lipid_subarray[:, 4].astype(float) #lipid atom's Z-coord
        # Pairwise Euclidean distances; broadcasting the protein coordinates
        # as a column against the lipid coordinates as a row gives shape
        # (number of filtered protein atoms, number of lipid atoms).
        diff_matrix = np.sqrt(np.power(px[:, np.newaxis] - lx, 2) +
                      np.power(py[:, np.newaxis] - ly, 2) +
                      np.power(pz[:, np.newaxis] - lz, 2))
        print(f"diff_matrix is: '{diff_matrix}'")
        # This will fetch the indices where diff_matrix values are <4Å
        row_indices, col_indices = np.where(diff_matrix < contact_cutoff)
        print(f"row_indices are: '{row_indices}'")
        print(f"col_indices are: '{col_indices}'")
        # To extract the corresponding residue numbers from protein_array:
        prot_res_no = diff_filter[row_indices,5]
        lipi_res_no = lipid_subarray[col_indices,5]
        for row,col in zip(row_indices, col_indices):
            print(f"The interactions are: Residue {diff_filter[row,5]} and {lipid_subarray[col,7]} whose atoms {diff_filter[row,6]} and {lipid_subarray[col,6]} respectively are {diff_matrix[row, col]} Å apart.")
        print(prot_res_no,lipi_res_no)
        print(diff_matrix.shape) #should be (x,y), where x= number of protein atoms that
        # come close to bilayer, and y= total number of lipid atoms
        # To find which lipids interact the most: every entry of diff_matrix
        # below the cutoff counts as one contact for the lipid type (column 7)
        # of the lipid atom involved.
        for lipid_type in lipid_subarray[col_indices, 7]:
            lipid_count[lipid_type] = lipid_count.get(lipid_type, 0) + 1
    else:
        # No candidate atoms in this frame: store an empty matrix rather than
        # re-appending the previous frame's matrix (or failing on the first
        # frame), so that all_diff_matrices[i] always belongs to frame i+1.
        diff_matrix = np.empty((0, len(lipid_subarray)))
    #print lipid counts here for each lipid type.
    # The counts are deliberately not reset between frames: they are reported
    # as cumulative and remain available after the loop.
    print(f"cumulative lipid counts till frame {frame}:")
    for lipid_type, count in lipid_count.items():
        print(f"{lipid_type}: {count}")
    all_diff_matrices.append(diff_matrix)
    print("=" * 20)  # Separator between frames

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [ 34.93912563  35.95834396  34.32704619 ...  99.68267703 100.6690285
  101.05741784]]'
row_indices are: '[  8  10  10  11  11  11  11  11  11  12  13  13  13  18  18  49  50  50
  50  52  52  54  54  54  56  56  57  57  57  57  58  59  60  99  99  99
 100 100 100 100 101 101 101 102 107 107 107 108 108 108 108 108 108 108
 108 109 109 109 109 110 112 113 113 144 144 145 145 145 146 146 147 147
 147 147 147 148 150 150 196 366 367 367 420]'
col_indices are: '[ 3979  3981  3983  3978  3979  3981  3982  3983  3984  3983  3981  3982
  3983  3979  3981  4608  4607  4608  4609  4608  4609  3978  3979  3980
  5104 35647  4609  4611 34618 34619  4609  4609  4609   702   703   704
   702   703   704   705   702   703   704 10240   705   711   715   699
   702   703   705   710   711   714   715   711   714   715   718   715
   711   711   715   704   705 12113 12114 12115 35004 35009 35003 35004
 35007 35008 35009 35009 35008 350