# Standard packages

In [1]:
import os
import pandas as pd
import time

# Specific installs

In [2]:
# Name of the folding method; used further down as the prefix of the results file name
method_name = "MXfold2"

In [3]:
# install conda on colab
!pip install -q condacolab
import condacolab
condacolab.install()

✨🍰✨ Everything looks OK!


In [None]:
!conda create --name mxfold2 python=3.8

In [5]:
# Download the pre-built MXfold2 0.1.1 wheel; its cp38 / linux_x86_64 tags match the Python 3.8 environment named mxfold2
!wget -q https://github.com/keio-bioinformatics/mxfold2/releases/download/v0.1.1/mxfold2-0.1.1-cp38-cp38-linux_x86_64.whl

In [6]:
# Activate the mxfold2 environment and install the wheel within the same shell command,
# so that pip3 runs after (and inside) the activation
!source activate mxfold2 && pip3 -q install mxfold2-0.1.1-cp38-cp38-linux_x86_64.whl 

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.2/76.2 KB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m881.9/881.9 MB[0m [31m81.9 MB/s[0m eta [36m0:00:01[0mtcmalloc: large alloc 1102422016 bytes == 0x561e29aa2000 @  0x7f5d83475615 0x561df27755dc 0x561df284aeab 0x561df2777de2 0x561df27bafe9 0x561df27f9393 0x561df27dfea3 0x561df27e1288 0x561df27f9393 0x561df27dfea3 0x561df27e1288 0x561df27f9393 0x561df27dfea3 0x561df27e1288 0x561df27f9393 0x561df27dfea3 0x561df27e1288 0x561df27f9393 0x561df27dfea3 0x561df27e1288 0x561df27f9393 0x561df27e10b6 0x561df27f9393 0x561df27dfea3 0x561df27e1288 0x561df27fa0c6 0x561df27dfea3 0x561df27e1288 0x561df27f9393 0x561df27dfea3 0x561df27e1288
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m881.9/881.9 MB[0m [31m1.8 MB/s[0m eta 

In [7]:
# For a local install (tested on Ubuntu 18.04), run the following commands in a terminal:

#conda create --name mxfold2 python=3.8
#conda activate mxfold2
#wget -q https://github.com/keio-bioinformatics/mxfold2/releases/download/v0.1.1/mxfold2-0.1.1-cp38-cp38-linux_x86_64.whl
#pip3 -q install mxfold2-0.1.1-cp38-cp38-linux_x86_64.whl 

# S. cerevisiae (sce) 18 long non-coding RNA dataset
Data source: https://genie.weizmann.ac.il/pubs/PARS10/data/sce_genes_folded.tab.gz

In [8]:
# Base URL of the data folder in the project repository
gh_path = "https://raw.githubusercontent.com/sinc-lab/lncRNA-folding/master/data/"

# Tab-separated table without a header row; the first column ("Gene ID")
# becomes the index, so rows can be looked up by gene identifier.
sce = pd.read_csv(
    f"{gh_path}sce_genes_folded.tab",
    sep="\t",
    header=None,
    names=["Gene ID", "sequence", "PARS-assisted folding"],
    index_col=0,
)

In [9]:
# Gene IDs of the 18 sequences to fold, listed in the order they are processed
yeast18lnc = [
    "snR81", "snR34", "snR43",
    "snR44", "snR31", "snR10",
    "snR63", "snR11", "snR82",
    "snR17b", "snR17a", "snR37",
    "SCR1", "SRG1", "snR19",
    "snR30", "LSR1", "TLC1",
]

# Compute structures

In [18]:
def run_folding(fasta_name, dot_name="tmp.dot"):
    """Run ``mxfold2 predict`` on a FASTA file and save its output.

    The command is executed inside the ``mxfold2`` conda environment and
    whatever the tool writes to standard output is stored in ``dot_name``.

    Parameters
    ----------
    fasta_name : str
        Path of the FASTA file handed to ``mxfold2 predict``.
    dot_name : str, optional
        Path of the file receiving the tool's output. Defaults to
        ``"tmp.dot"``, the file name this function has always written to.

    Returns
    -------
    str
        ``dot_name``, the path of the file holding the prediction.

    Raises
    ------
    RuntimeError
        If the command exits with a non-zero status (for example when the
        environment or the tool is missing). Previously such failures went
        unnoticed and left an empty output file behind.
    """
    import shlex
    import subprocess
    import sys

    # `source` is a bash builtin, so the command is run through bash explicitly
    # rather than through whatever shell the notebook's `!` happens to use.
    # Quoting keeps file names with spaces or shell metacharacters intact.
    command = ("source activate mxfold2 && mxfold2 predict "
               + shlex.quote(str(fasta_name)))

    with open(dot_name, "w") as dot_file:
        result = subprocess.run(["bash", "-c", command],
                                stdout=dot_file,
                                stderr=subprocess.PIPE,
                                text=True)

    if result.returncode != 0:
        raise RuntimeError(
            f"mxfold2 failed on {fasta_name!r} "
            f"(exit status {result.returncode}):\n{result.stderr}")

    # Keep diagnostics of a successful run visible, as they were with `!`
    if result.stderr:
        print(result.stderr, file=sys.stderr, end="")

    return dot_name

In [19]:
# Fold every sequence in yeast18lnc, one at a time, and collect all
# predictions in a single results file named after the method.
out_fasta_name = method_name + "_yeast18"
out_path = out_fasta_name + ".fasta"

# Start from a clean results file: each prediction below is appended to it
if os.path.exists(out_path):
    os.remove(out_path)

lnc_ids = yeast18lnc
print("   \t lnc \t len \t time")
for i, lnc in enumerate(lnc_ids):

    # perf_counter is a monotonic clock intended for measuring elapsed time
    start_time = time.perf_counter()
    # One .loc lookup (row label, column) instead of chained indexing
    seq = sce.loc[lnc, "sequence"]
    print(f"{i+1}/{len(lnc_ids)}\t{lnc} \t {len(seq)}", end='\t')

    # Write a one-sequence fasta
    with open("tmp.fasta", "w") as ofile:
        ofile.write(f">{lnc}\n{seq}\n")

    dot_file_name = run_folding("tmp.fasta")

    # Concatenate outputs: copy the prediction byte-for-byte onto the end of
    # the results file (replaces the unchecked `cat ... >> ...` shell call)
    with open(dot_file_name, "rb") as dot_file:
        prediction = dot_file.read()
    if not prediction.strip():
        # An empty prediction means the folding step failed; stop here rather
        # than silently leaving this sequence out of the results file.
        raise RuntimeError(f"{method_name} produced no output for {lnc}")
    with open(out_path, "ab") as out_file:
        out_file.write(prediction)

    print(f"{time.perf_counter() - start_time: .1f} s")

   	 lnc 	 len 	 time
1/18	snR81 	 201	 3.1 s
2/18	snR34 	 203	 3.0 s
3/18	snR43 	 209	 3.2 s
4/18	snR44 	 211	 3.2 s
5/18	snR31 	 225	 3.6 s
6/18	snR10 	 245	 3.9 s
7/18	snR63 	 255	 4.3 s
8/18	snR11 	 258	 4.3 s
9/18	snR82 	 268	 4.5 s
10/18	snR17b 	 332	 6.4 s
11/18	snR17a 	 333	 6.5 s
12/18	snR37 	 386	 8.3 s
13/18	SCR1 	 522	 14.4 s
14/18	SRG1 	 551	 15.5 s
15/18	snR19 	 568	 16.7 s
16/18	snR30 	 606	 19.0 s
17/18	LSR1 	 1175	 72.6 s
18/18	TLC1 	 1301	 90.5 s
