# DensiTree
## Protein Density Prediction
DensiTree allows for the prediction of protein density (in g cm<sup>-3</sup>) via the use of pre-trained random forest regressors. DensiTree is housed in the [DensiTree GitHub repo](https://github.com/Degiacomi-Lab/DensiTree). These random forest regressors have been trained (see the [README.md](https://github.com/Degiacomi-Lab/DensiTree?tab=readme-ov-file#densitree)) on different features selected from data derived from molecular dynamics simulations of a dataset of 260 proteins at two separate temperatures (300 K and 310.15 K), including two regressors trained purely on the amino acid compositions of the proteins in the dataset.

## Using DensiTree
DensiTree can predict protein densities from Protein Data Bank .pdb files ([PDB](https://www.rcsb.org/) files), [FASTA files](https://en.wikipedia.org/wiki/FASTA_format), or strings (of [one-letter or three-letter amino acid codes](https://www.ebi.ac.uk/pdbe/docs/roadshow_tutorial/msdtarget/AAcodes.html)). However, regressors trained on structure-derived features can only be used with PDB files.

In [None]:
#@title Install conda for colab
#@markdown Kernel will restart automatically, then you can continue to the next cell.
#@markdown <br> (Don't worry if you get a 'Your session crashed for an unknown reason.' error, this is the expected behaviour.)
# Install the condacolab package only when the COLAB_RELEASE_TAG environment
# variable is set to a non-empty value.
!if [ -n "$COLAB_RELEASE_TAG" ]; then pip install condacolab; fi
# NOTE(review): the import below is unconditional, so it presumably fails
# when the pip install above was skipped and condacolab is not already
# available — confirm.
import condacolab
# Per the form note above, the kernel restarts after this call.
condacolab.install()

In [None]:
#@title Get density prediction code from GitHub repository
%%capture
# NOTE(review): condacolab is imported here but not otherwise used in this cell.
import condacolab
# Change to the directory the repository is cloned into
# (presumably ~/../content resolves to /content on Colab — TODO confirm).
%cd ~/../content

# Remove any earlier checkout so the clone below starts from a clean directory.
!rm -rf DensiTree
!git clone "https://github.com/Degiacomi-Lab/DensiTree.git"


In [None]:
#@title Install dependencies and enable widgets
%%capture
# Update the base conda environment from the environment file in the cloned
# DensiTree directory.
!mamba env update -n base -f DensiTree/environment.yml

# The custom widget manager is currently disabled (both lines are commented out).
#from google.colab import output
#output.enable_custom_widget_manager()


# Upload protein structures or sequences as appropriate below.
Multiple PDB files, FASTA files and sequence strings can be uploaded.

In [None]:
# @title Upload Protein Structure PDB File(s)
from google.colab import files
import ipywidgets as widgets
from IPython.display import display, HTML
import os
import shutil

# Ensure correct cwd
%cd /content

# Create the folder that uploaded PDB files are moved into, if it is missing.
if not os.path.exists('structure_data'):
    os.makedirs('structure_data')

# filename variable
# Reset to None each time this cell runs; the upload callback below stores
# the path of the last saved file here.
uploaded_filename = None

# file upload function
def file_upload(b):
    """Button callback: prompt for files and store them in 'structure_data'.

    ``b`` is the argument supplied by ``on_click`` and is not used. Every
    name returned by ``files.upload()`` is moved into the 'structure_data'
    folder, and the global ``uploaded_filename`` ends up holding the path
    of the last file moved.
    """
    global uploaded_filename
    received = files.upload()
    for fname in received:
        saved_path = os.path.join('structure_data', fname)
        # Move the file from its relative path into the 'structure_data' folder.
        shutil.move(fname, saved_path)
        uploaded_filename = saved_path
        print(f"✅ Uploaded file saved to: {uploaded_filename}")

# Upload button for PDB files, styled as a large 'success' button.
btn = widgets.Button(
    description='📁 Upload PDB File(s)',
    button_style='success',
    style=dict(font_weight='bold', font_size='16px'),
    layout=widgets.Layout(height='75px', width='300px'),
)

# Show the heading, route clicks to the upload callback, then show the button.
display(HTML("<h3 style='font-family:sans-serif;'>Upload Protein Structure PDB File</h3>"))
btn.on_click(file_upload)
display(btn)



In [None]:
# @title Upload Protein Sequences as FASTA file(s)

# Ensure correct cwd
%cd /content

# Create the folder that uploaded FASTA files are moved into, if it is missing.
# NOTE(review): os is not imported in this cell, so it presumably relies on
# an import made by an earlier cell — confirm the cells are run in order.
if not os.path.exists('sequence_data'):
    os.makedirs('sequence_data')

# filename variable
# Reset to None each time this cell runs; the upload callback below stores
# the path of the last saved file here.
uploaded_filename = None

# file upload function
def file_upload(b):
    """Button callback: prompt for files and store them in 'sequence_data'.

    ``b`` is the argument supplied by ``on_click`` and is not used. Every
    name returned by ``files.upload()`` is moved into the 'sequence_data'
    folder, and the global ``uploaded_filename`` ends up holding the path
    of the last file moved.
    """
    global uploaded_filename
    received = files.upload()
    for fname in received:
        saved_path = os.path.join('sequence_data', fname)
        # Move the file from its relative path into the 'sequence_data' folder.
        shutil.move(fname, saved_path)
        uploaded_filename = saved_path
        print(f"✅ Uploaded file saved to: {uploaded_filename}")

# Upload button for FASTA files, styled as a large 'success' button.
btn = widgets.Button(
    description='📁 Upload FASTA File(s)',
    button_style='success',
    layout=widgets.Layout(width='300px', height='75px'),
    style={'font_weight': 'bold', 'font_size': '16px'}
)

# Clicks are routed to the FASTA upload callback defined in this cell.
btn.on_click(file_upload)
# The heading names FASTA sequence files; it previously repeated the PDB
# cell's heading, which mislabelled this upload button.
display(HTML("<h3 style='font-family:sans-serif;'>Upload Protein Sequence FASTA File(s)</h3>"))
display(btn)

In [None]:
#@title Add Protein Sequences as Text
#@markdown Protein amino acid sequences in either one- or three-letter codes, separated by commas.

# Free-text input for one or more comma-separated sequences.
textarea = widgets.Textarea(
    placeholder='Add comma-separated protein sequences',
    description='Sequence(s):',
    layout=widgets.Layout(width='600px', height='100px'),
    style=dict(font_weight='bold', font_size='16px'),
    disabled=False,
)

# Submit button, placed in a box that centres its children in a column
# at half width.
button = widgets.Button(button_style='success', description='Submit')
box_layout = widgets.Layout(
    display='flex',
    flex_flow='column',
    align_items='center',
    width='50%',
)
box = widgets.HBox(children=[button], layout=box_layout)

# Text captured by the Submit button; stays empty until the button is clicked.
sequences_text = ""


def on_button_click(b):
    """Button callback: copy the textarea content into ``sequences_text``.

    ``b`` is the argument supplied by ``on_click`` and is not used. The
    captured text is printed and also returned.
    """
    global sequences_text
    sequences_text = textarea.value
    print(f"Sequence(s) = {sequences_text}")
    return sequences_text


button.on_click(on_button_click)

display(textarea, box)

In [None]:
#@title Choose Random Forest Regressor options
#@markdown <br>'Sequence & Structure' estimates protein density using **both** the protein structure-derived characteristic features and the protein sequence (valid if protein structures in PDB format have been uploaded).
#@markdown <br>'20 best features' refers to the most important features when using structure-derived features.
import ipywidgets as widgets
from IPython.display import display

options = ["Sequence & Structure?"]

# One checkbox per option; a box starts ticked only when its label
# contains the word "recommended".
checkboxes = []
for option in options:
    checkbox = widgets.Checkbox(description=option, value="recommended" in option)
    checkboxes.append(checkbox)

# Stack the checkboxes vertically and show them.
checks1 = widgets.VBox(checkboxes)
display(checks1)

# Temperature selector; the first entry is the pre-selected value.
temp_options = ["300 K (recommended)", "310.15 K"]
dropdown1 = widgets.Dropdown(
    description='Temperature:',
    options=temp_options,
    value=temp_options[0],
    disabled=False,
)
display(dropdown1)

# Feature-set selector; the first entry is the pre-selected value.
feature_options = ["20 best features (recommended)", "all features"]
dropdown2 = widgets.Dropdown(
    description='Features:',
    options=feature_options,
    value=feature_options[0],
    disabled=False,
)
display(dropdown2)




# Calculate predicted density results below
Results will be printed to the console and saved as comma-separated lines to a numbered file in the results folder (results/predictions_1.txt, results/predictions_2.txt, …).

In [None]:
#@title Calculate results and save to file (predictions.txt) in results folder

%cd ~/../content/DensiTree

import DensiTree.DensiTree as DT

structure_data_files = [file for file in os.listdir("../structure_data") if file.endswith(".pdb")]

if (checkboxes[0].value == True) and not (os.path.exists("../sequence_data")):
  !mkdir "../sequence_data"
  !cp ../structure_data/* ../sequence_data/

#ensure hidden files not added
sequence_data_files = []
for file in os.listdir("../sequence_data"):
    if not file.startswith('.'):
        sequence_data_files.append(file)


for data_file in structure_data_files:
  if not data_file.endswith(".pdb"):
    print(f"{data_file} is not a PDB file, please only upload files with the '.pdb' extension to the data folder.")

if not os.path.exists('results'):
    os.makedirs('results')

if not os.path.exists('results/predictions_1.txt'):
  results_file = 'results/predictions_1.txt'
else:
  results_files = os.listdir('results')
  results = [int(results_file.split('_')[1].split('.')[0]) for results_file in results_files if results_file.endswith(".txt")]
  results.sort()
  results_file = f"results/predictions_{results[-1] + 1}.txt"

temp = dropdown1.value.strip("(recommended)").rstrip()

if dropdown2.value == "20 best features (recommended)":
  important_features = True
else:
  important_features = False

with open(results_file, "w") as w_file:
  for data_file in sequence_data_files:
    seq = DT.Sequence(f"../sequence_data/{data_file}", temp=temp)
    prediction, feats = seq.predict()
    print(seq.sequence.split("/")[-1], prediction)
    w_file.write(f"{seq.sequence.split('/')[-1]}, {prediction}, {temp}, sequence\n")

  for data_file in structure_data_files:
    if not data_file.endswith(".pdb"): continue
    struct = DT.Structure(f"../structure_data/{data_file}", temp=temp, important_features=important_features)
    prediction, feats = struct.predict()
    print(struct.structure.split("/")[-1], prediction)
    w_file.write(f"{struct.structure.split('/')[-1]}, {prediction}, {temp}, structure\n")
    if checkboxes[0].value == True:
      seq = DT.Sequence(f"../structure_data/{data_file}", temp=temp)
      prediction, feats = seq.predict()
      print(seq.sequence.split("/")[-1], prediction)
      w_file.write(f"{seq.sequence.split('/')[-1]}, {prediction}, {temp}, sequence\n")
  try:
    if len(sequences_text) != 0 and sequences_text != 'Add comma-separated protein sequences':
      for sequence in sequences_text.split(","):
        sequence = sequence.strip()
        seq = DT.Sequence(sequence, temp=temp)
        prediction, feats = seq.predict()
        print(seq.sequence, prediction)
        w_file.write(f"{seq.sequence}, {prediction}, {temp}, sequence\n")
  except:
    print("No sequence text added.")




In [None]:
# @title Download results
from google.colab import files
import ipywidgets as widgets
from IPython.display import display, HTML
import os
import shutil

# Ensure correct cwd
%cd ~/../content/DensiTree

# filename variable
# Paths of every entry in 'results' whose name starts with "predictions".
# The list is built once, when this cell runs, so files written afterwards
# are not included until the cell is run again.
download_filenames = [os.path.join("results", file) for file in os.listdir("results") if file.startswith("predictions")]

# file download function
def file_download(b):
    """Button callback: download every path listed in ``download_filenames``.

    ``b`` is the argument supplied by ``on_click`` and is not used.
    """
    for result_path in download_filenames:
        files.download(result_path)

# Download button for the results files, styled as a large 'success' button.
btn = widgets.Button(
    description='📁 Download results.txt File(s)',
    button_style='success',
    style=dict(font_weight='bold', font_size='16px'),
    layout=widgets.Layout(height='75px', width='300px'),
)

# Show the heading, route clicks to the download callback, then show the button.
display(HTML("<h3 style='font-family:sans-serif;'>Download protein density results file(s)</h3>"))
btn.on_click(file_download)
display(btn)

