# TSV Mutation Map Generator

### This notebook converts the generated CSV prediction and mutation-map files into TSV files, with minor modifications (column selection, renaming and reordering, plus added transition/label columns)

In [1]:
import os
import pandas as pd

In [119]:
# Directory holding the ExoCNN prediction CSVs that are listed and converted below.
path = "/media/pgdrive/sharif/exosomians/predictions/ExoCNN/"

In [5]:
# Output directory: every TSV produced by this notebook is written here.
results_path = "/media/pgdrive/sharif/exosomians/final/"

In [132]:
# Create the output directory; exist_ok=True keeps the cell safe to re-run.
os.makedirs(results_path, exist_ok=True)

In [133]:
# List everything in the predictions directory and display it for inspection.
filenames = os.listdir(path)
filenames

['ExoCNN.ic.extreme.99.probabilities.csv',
 'gggg.predictions.csv',
 'ExoCNN.ev.extreme.90.hc50.cl1.mo3.probabilities.csv',
 'ExoCNN.ev.extreme.90.probabilities.csv',
 'ExoCNN.ev.extreme.99.probabilities.csv',
 'ExoCNN.ic.extreme.probabilities.csv',
 'ExoCNN.ic.extreme.90.probabilities.csv',
 'ExoCNN.final.probabilities.csv',
 'ExoCNN.ev.extreme.probabilities.csv',
 'ExoCNN.ev.extreme.90.hc50.cl1.mo1.probabilities.csv']

In [123]:
# Select the four extreme-threshold prediction files by name.
# Picking them by position in the os.listdir() result is unsafe: the listing
# order is arbitrary, and re-running the cell would index into the list that
# was already filtered.
selected_filenames = [
    'ExoCNN.ic.extreme.99.probabilities.csv',
    'ExoCNN.ev.extreme.90.probabilities.csv',
    'ExoCNN.ev.extreme.99.probabilities.csv',
    'ExoCNN.ic.extreme.90.probabilities.csv',
]

# Fail early with a clear message instead of silently converting other files.
missing_filenames = [
    name for name in selected_filenames
    if not os.path.isfile(os.path.join(path, name))
]
if missing_filenames:
    raise FileNotFoundError(f"Missing prediction files in {path}: {missing_filenames}")

filenames = selected_filenames
filenames

['ExoCNN.ic.extreme.99.probabilities.csv',
 'ExoCNN.ev.extreme.90.probabilities.csv',
 'ExoCNN.ev.extreme.99.probabilities.csv',
 'ExoCNN.ic.extreme.90.probabilities.csv']

In [124]:
# Columns kept from each probabilities CSV; 'yes' is renamed to
# 'secretion_prob' by the conversion loop.
columns_to_preserve = ['id', 'seq', 'yes']

In [129]:
# Convert each selected probabilities CSV into a tab-separated "seqs" file
# containing only id, seq and secretion_prob (the renamed 'yes' column).
for filename in filenames:
    df = pd.read_csv(os.path.join(path, filename))

    # rename() returns a new frame and keeps the column order, so nothing is
    # assigned on a slice of the original frame.
    df = df[columns_to_preserve].rename(columns={'yes': 'secretion_prob'})

    # Drop the first dot-separated token (the "ExoCNN" model tag) and swap the
    # suffix, e.g. "ExoCNN.ic.extreme.99.probabilities.csv" -> "ic.extreme.99.seqs.tsv".
    output_name = ".".join(
        filename.replace("probabilities.csv", "seqs.tsv").split(".")[1:]
    )
    print(output_name)

    df.to_csv(os.path.join(results_path, output_name), index=False, sep="\t")

ic.extreme.99.seqs.tsv
ev.extreme.90.seqs.tsv
ev.extreme.99.seqs.tsv
ic.extreme.90.seqs.tsv


In [27]:
# Show the current contents of the output directory.
os.listdir(results_path)

['ExoCNN.ev.extreme.90.tsv',
 'ExoCNN.ic.extreme.90.csv',
 'ExoCNN.ic.extreme.99.csv',
 'ExoCNN.ic.extreme.99.tsv',
 'ExoCNN.ev.extreme.99.csv',
 'ExoCNN.ev.extreme.90.csv',
 'ExoCNN.ev.extreme.99.tsv',
 'ExoCNN.ic.extreme.90.tsv']

In [2]:
# Directory holding the mutation-map files (CSV and NPY) for ExoCNN.
mutated_path = "/media/pgdrive/sharif/exosomians/MutationMaps/ExoCNN"

In [3]:
# List everything in the mutation-map directory and display it for inspection.
filenames = os.listdir(mutated_path)
filenames

['ExoCNN.ev.extreme.90.unique.probabilities.mutation.map.2.npy',
 'ic.random.99.mutation.map.1.csv',
 'ExoCNN.ic.extreme.90.unique.probabilities.mutation.map.1.csv',
 'random.extreme.99.mutation.map.1.csv',
 'ev.random.99.mutation.map.1.csv',
 'ev.random.99.mutation.map.2.csv',
 'ev.extreme.99.mutation.map.2.csv',
 'npys',
 'motifs',
 'ExoCNN.ic.extreme.90.unique.probabilities.mutation.map.2.npy',
 'ic.extreme.99.mutation.map.2.csv',
 'ev.random.90.mutation.map.2.csv',
 'ic.extreme.99.mutation.map.1.csv',
 'ev.random.90.mutation.map.2.npy',
 'ev.random.90.mutation.map.1.csv',
 'random.extreme.99.mutation.map.1.npy',
 'ev.extreme.90.mutation.map.2.csv',
 'ExoCNN.ev.extreme.90.unique.probabilities.mutation.map.1.npy',
 'ic.random.99.mutation.map.2.csv',
 'ic.random.99.mutation.map.2.npy',
 'ExoCNN.ev.extreme.90.unique.probabilities.mutation.map.1.csv',
 'ExoCNN.ic.extreme.90.unique.probabilities.mutation.map.2.csv',
 'ev.extreme.90.mutation.map.1.csv',
 'ic.extreme.90.mutation.map.1.csv'

In [7]:
# Replace the directory listing with the explicit mutation-map CSVs to convert.
# The digit before ".csv" is read by the conversion loop as the number of
# mutated positions per row.
filenames = [
    "ExoCNN.ic.extreme.90.unique.probabilities.mutation.map.1.csv",
    "ExoCNN.ev.extreme.90.unique.probabilities.mutation.map.1.csv",
    "ExoCNN.ic.extreme.90.unique.probabilities.mutation.map.2.csv",
    "ExoCNN.ev.extreme.90.unique.probabilities.mutation.map.2.csv"
]

In [8]:
# Convert each mutation-map CSV into a TSV with renamed and reordered columns
# and an added 'transition' column.
# The transition label is looked up from the first two letters of the source
# tag in the filename ("ev", "ic", or "ra" for "random").
transition_dict = {"ev": "EV->IC", "ic": "IC->EV", "ra": "IC->EV"}
for filename in filenames:
    if not filename.endswith(".csv"):
        continue

    # Parse the name by dot-separated tokens rather than fixed character
    # offsets, so both "ExoCNN.ic.extreme..." and unprefixed names such as
    # "random.extreme.99.mutation.map.1.csv" resolve to the right source tag.
    name_parts = filename.split(".")
    if name_parts[0] == "ExoCNN":
        name_parts = name_parts[1:]
    source_tag = name_parts[0]

    # The token just before the "csv" extension is the number of mutated
    # positions stored per row (e.g. "...mutation.map.2.csv" -> 2).
    n_positions = int(name_parts[-2])
    positions = range(1, n_positions + 1)

    # Input column name -> output column name.
    column_dict = {
        "id": "id",
        "seq": "seq",
        "mutated_seq": "mutated_seq",
        "Score": "secretion_prob_after_mutation",
        "prob_yes": "secretion_prob",
    }
    for i in positions:
        column_dict[f"pos_{i}"] = f"mutated_pos_{i}"
        column_dict[f"mutation_{i}"] = f"mutation_alt_{i}"

    # Output layout: identifiers, all positions, all alternative bases, then
    # the two probabilities.
    columns_order = (
        ['id', 'seq', 'mutated_seq']
        + [f'mutated_pos_{i}' for i in positions]
        + [f'mutation_alt_{i}' for i in positions]
        + ['secretion_prob', 'secretion_prob_after_mutation']
    )

    df = pd.read_csv(os.path.join(mutated_path, filename))
    df = df.drop(columns=['prob_no']).rename(columns=column_dict)

    # NOTE(review): 'Score' is added to the original probability here, so it is
    # presumably stored as a change relative to 'prob_yes'; the reason for the
    # abs() is not visible in this notebook — confirm.
    df['secretion_prob_after_mutation'] = (
        df['secretion_prob_after_mutation'] + df['secretion_prob']
    ).abs()

    # Take an explicit copy so the column assignments below do not act on a
    # slice of the renamed frame.
    df = df[columns_order].copy()

    df['transition'] = transition_dict[source_tag[:2]]

    # For random sequences the direction is taken from the post-mutation
    # probability: rows ending below 0.5 are relabelled EV->IC.
    if source_tag.startswith("random"):
        df.loc[df['secretion_prob_after_mutation'] < 0.5, 'transition'] = 'EV->IC'

    df.to_csv(os.path.join(results_path, filename.replace(".csv", ".tsv")), index=False, sep='\t')

In [54]:
# Directory holding the random-sequence CSVs and their prediction files.
random_path = "/media/pgdrive/sharif/exosomians/RandomSequences/"

In [56]:
# List everything in the random-sequence directory and display it for inspection.
filenames = os.listdir(random_path)
filenames

['ic.extreme.99.sequences.3.csv',
 'ev.extreme.random.90.3.predictions.csv',
 'ic.extreme.random.99.3.predictions.csv',
 'ic.extreme.90.sequences.3.csv',
 'ev.extreme.90.sequences.3.csv',
 'ev.extreme.random.99.3.predictions.csv',
 'ev.extreme.99.sequences.3.csv',
 'ic.extreme.random.90.3.predictions.csv']

In [110]:
# Gather the confidently predicted rows from every "predictions" file in the
# random-sequence directory and stack them into a single frame.
dfs = []
for filename in filenames:
    if "predictions" not in filename:
        continue
    predictions = pd.read_csv(os.path.join(random_path, filename))
    # NOTE(review): the 'no' cutoff (0.999) is stricter than the 'yes' cutoff
    # (0.99) — confirm this asymmetry is intended.
    is_confident = (predictions['yes'] >= 0.99) | (predictions['no'] >= 0.999)
    df = predictions.loc[is_confident]
    dfs.append(df)
random_df = pd.concat(dfs, axis=0)
random_df.head()

Unnamed: 0,seq,no,yes
1,CCCCUUGAUAUUACCAAAUAGGGAAGACUUU,0.004143,0.995857
3,GUUUUUCUUAAACCAGUGCAAAGAGAGAGACACCUAUGUCUA,0.99905,0.00095
10,CUGGAGCAUGAUCUAGGAGUUUGCUGCGACUGGCAGAG,0.999771,0.000229
11,CUGGGCAGCUUGGCGCUAUUCUGAGAUGAGAGACAGUG,0.999581,0.000419
19,CGGUAGUUGAGUCCCUGACAGUUA,0.999946,5.4e-05


In [111]:
# Keep one row per sequence; when a sequence occurs more than once, the last
# occurrence in the concatenated frame is retained.
random_df.drop_duplicates(subset='seq', keep="last", inplace=True)

In [112]:
# Display the (rows, columns) of the de-duplicated frame.
random_df.shape

(20041, 3)

In [113]:
# Remove the 'no' probability column; only 'yes' is kept going forward.
random_df.drop(['no'], axis=1, inplace=True)

In [114]:
# Rename 'yes' to 'secretion_prob' (the "seq" -> "seq" entry is a no-op).
random_df.rename(columns={"seq": "seq", "yes": "secretion_prob"}, inplace=True)

In [115]:
# Label every row 'IC' by default, then mark rows with secretion_prob >= 0.5 as 'EV'.
random_df['label'] = 'IC'
random_df.loc[random_df['secretion_prob'] >= 0.5, 'label'] = 'EV'

In [116]:
# Preview the first rows after renaming and labelling.
random_df.head()

Unnamed: 0,seq,secretion_prob,label
1,CCCCUUGAUAUUACCAAAUAGGGAAGACUUU,0.995857,EV
3,GUUUUUCUUAAACCAGUGCAAAGAGAGAGACACCUAUGUCUA,0.00095,IC
10,CUGGAGCAUGAUCUAGGAGUUUGCUGCGACUGGCAGAG,0.000229,IC
11,CUGGGCAGCUUGGCGCUAUUCUGAGAUGAGAGACAGUG,0.000419,IC
19,CGGUAGUUGAGUCCCUGACAGUUA,5.4e-05,IC


In [117]:
# Count how many sequences received each label.
random_df['label'].value_counts()

IC    17144
EV     2897
Name: label, dtype: int64

In [118]:
# Write the labelled random sequences to the results directory as a TSV, without the index.
random_df.to_csv(os.path.join(results_path, 'random.extreme.99.seqs.tsv'), sep='\t', index=False)

In [131]:
# Export only the sequence column next to the random-sequence inputs.
# `header` is passed explicitly because the default for Series.to_csv has
# differed between pandas releases, so the output no longer depends on the
# installed version.
# NOTE(review): header=True matches the current pandas default; switch to
# header=False if the consumer of this file expects no header line.
random_df['seq'].to_csv(
    os.path.join(random_path, "random.extreme.99.seqs.csv"),
    index=False,
    header=True,
)

  """Entry point for launching an IPython kernel.
