## Imports

In [1]:
import pandas as pd
import numpy as np

import os

from sklearn.model_selection import StratifiedKFold, train_test_split

import torch

# Floating-point dtype kept as a notebook-level constant.
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
dtype = torch.float
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Tensors created without an explicit `device=` argument are allocated on this device from now on.
torch.set_default_device(device)
# Last expression of the cell: displays the device that is now the default.
torch.get_default_device()



device(type='cuda', index=0)

# Data Load

In [3]:
def augmentation_bernoulli(seq, prob=0.005):
    """Return a copy of ``seq`` with randomly chosen positions replaced by "N".

    Each position is masked independently with probability ``prob``; the
    draws come from ``torch.bernoulli`` (one draw per character).

    Args:
        seq: Sequence of characters, typically a nucleotide string.
        prob: Per-position probability of writing "N" over the character.

    Returns:
        A new string with the same length as ``seq``.
    """
    # One 0/1 draw per position; a 1 marks a position to be masked.
    draws = torch.bernoulli(prob * torch.ones(len(seq)))
    masked = draws.bool().tolist()

    return "".join("N" if hit else base for base, hit in zip(seq, masked))

def sequences_augmentation(data, level, cat, n):
    """Create augmented clones of one category so that it can reach ``n`` rows.

    The rows where ``data[level] == cat`` are replicated, every replica gets
    random positions of its ``truncated_sequence`` replaced by "N" (via
    ``augmentation_bernoulli`` with ``prob=0.002``) and replicas whose
    sequences collide with each other are dropped. This repeats until ``n``
    distinct sequences exist.

    Args:
        data: DataFrame with at least the ``level`` and
            ``truncated_sequence`` columns.
        level: Name of the column holding the category labels.
        cat: Category whose rows are cloned.
        n: Target number of rows for the category, originals included
            (an integer).

    Returns:
        DataFrame with the same columns as ``data`` containing
        ``n - <rows of cat>`` augmented clones; the original rows themselves
        are not part of the result. The frame is empty when the category has
        no rows or already has ``n`` or more rows.

    Notes:
        * A replica that receives no mask is identical to its source row, so
          the result may contain sequences that also exist in ``data``.
        * Rows kept from an earlier pass are masked again on every later
          pass, so their share of "N" grows with the number of passes.
        * The loop only stops once ``n`` distinct sequences exist; with very
          short sequences this may take many passes.
    """
    to_copy = data.loc[data[level] == cat]
    n_clones = n - to_copy.shape[0]

    # Nothing to clone from, or nothing missing: return an empty frame with the
    # right columns. (Previously an empty category raised IndexError and a
    # non-positive ``n_clones`` became a negative slice bound that returned
    # clones instead of none.)
    if to_copy.empty or n_clones <= 0:
        return to_copy.iloc[0:0].copy()

    new_data = to_copy.iloc[0:0]

    while new_data.shape[0] < n:
        missing = n - new_data.shape[0]
        # Ceiling division: number of full copies needed to cover what is missing.
        qnt = -(-missing // to_copy.shape[0])

        # Fresh copies go first so that, on collisions, they win over older rows.
        frames = [to_copy] * qnt
        if not new_data.empty:
            frames.append(new_data)

        new_data = pd.concat(frames)
        new_data["truncated_sequence"] = new_data["truncated_sequence"].apply(augmentation_bernoulli, prob=0.002)
        new_data = new_data.drop_duplicates(subset=["truncated_sequence"])

    return new_data[:n_clones]

def data_augmentation(data, level, lower, upper):
    """Oversample under-represented categories with augmented clones.

    Every category of ``data[level]`` whose row count ``c`` satisfies
    ``lower <= c < upper`` receives clones produced by
    ``sequences_augmentation`` with ``upper`` as the target size.

    Args:
        data: DataFrame with at least the ``level`` and
            ``truncated_sequence`` columns.
        level: Name of the column holding the category labels.
        lower: Minimum row count (inclusive) for a category to be augmented.
        upper: Row count (exclusive) below which a category is augmented; also
            the target passed to ``sequences_augmentation``.

    Returns:
        A new DataFrame with the original rows followed by the clones of each
        selected category. When no category qualifies, an unchanged copy of
        ``data`` is returned (this case used to raise IndexError).
    """
    class_count = data.groupby(level)[level].count()
    in_range = (class_count >= lower) & (class_count < upper)
    cats = class_count[in_range].index.to_list()

    if not cats:
        return data.copy()

    # Collect every clone frame first and concatenate once, instead of
    # re-copying the accumulated frame on each iteration.
    clones = [sequences_augmentation(data, level, cat, upper) for cat in cats]

    return pd.concat([data, *clones])


# Load and filter the data from csv
def load_data(dataset, level, minimun_entries, min_length=900, random_state=42):
    """Filter the raw sequence table down to clean rows for one taxonomy level.

    Steps, in order:
      1. keep rows that have a label at ``level``;
      2. keep rows whose ``truncated_sequence`` has at least ``min_length``
         characters, then shuffle the rows reproducibly;
      3. drop every sequence that appears under more than one label;
      4. drop repeated (label, sequence) pairs, keeping the first occurrence;
      5. drop classes left with fewer than ``minimun_entries`` rows.

    Args:
        dataset: DataFrame with a ``truncated_sequence`` column and a column
            named ``level``. It is not modified.
        level: Name of the taxonomy column used as the label.
        minimun_entries: Minimum number of rows a class must have to be kept.
        min_length: Minimum sequence length (defaults to the previously
            hard-coded 900).
        random_state: Seed of the shuffle (defaults to the previously
            hard-coded 42).

    Returns:
        A new, shuffled DataFrame with the rows that survive every filter; the
        original index labels are preserved.
    """
    # Labelled rows with a long enough sequence, in a reproducible random order
    labelled = dataset.loc[dataset[level].notna()]
    long_enough = labelled["truncated_sequence"].str.len() >= min_length
    data = labelled.loc[long_enough].sample(frac=1, random_state=random_state)

    # Remove sequences classified in more than one class
    labels_per_sequence = data.groupby("truncated_sequence")[level].nunique()
    ambiguous = labels_per_sequence[labels_per_sequence > 1].index
    data = data.loc[~data["truncated_sequence"].isin(ambiguous)]

    # Remove duplicates on current level (reassigned rather than dropped
    # in place, because `data` is a filtered view of the caller's frame)
    data = data.drop_duplicates(subset=[level, "truncated_sequence"])

    # Remove entries from classes with less than "minimun_entries" datapoints.
    # The counts Series is used directly so the filter does not depend on the
    # column names produced by `value_counts().reset_index()`, which changed
    # in pandas 2.0.
    class_sizes = data[level].value_counts()
    selected_classes = class_sizes[class_sizes >= minimun_entries].index

    return data.loc[data[level].isin(selected_classes)]

In [4]:
# Reference map used to encode IUPAC nucleotide codes.
# Every code is described by the canonical bases it may stand for and by the
# weight written into the channel of each of those bases; all other channels
# receive 0.0.
_BASE_ORDER = "ATGC"  # channel order of every encoded row
_IUPAC_CODES = {
    # unambiguous bases
    "A": ("A", 1.0),
    "T": ("T", 1.0),
    "G": ("G", 1.0),
    "C": ("C", 1.0),
    # two-base ambiguity codes
    "W": ("AT", 0.5),
    "S": ("GC", 0.5),
    "M": ("AC", 0.5),
    "K": ("TG", 0.5),
    "R": ("AG", 0.5),
    "Y": ("TC", 0.5),
    # three-base ambiguity codes (the weight is 0.3, so these rows sum to 0.9)
    "B": ("TGC", 0.3),
    "D": ("ATG", 0.3),
    "H": ("ATC", 0.3),
    "V": ("AGC", 0.3),
    # any base
    "N": ("ATGC", 0.25),
}

base_map = {
    code: [weight if base in members else 0.0 for base in _BASE_ORDER]
    for code, (members, weight) in _IUPAC_CODES.items()
}

def encode_sequence(sequence):
    """Encode a nucleotide sequence as a 2-D tensor with one row per symbol.

    Each symbol is replaced by its 4-value row from ``base_map`` (channels in
    the order A, T, G, C).

    Args:
        sequence: Iterable of IUPAC symbols (upper-case keys of ``base_map``).

    Returns:
        Tensor of shape ``(len(sequence), 4)``. An empty sequence yields an
        empty tensor of shape ``(0, 4)``.

    Raises:
        KeyError: If the sequence contains a symbol that is not a key of
            ``base_map`` (for example a lower-case base).
    """
    try:
        encoded_seq = [base_map[base] for base in sequence]
    except KeyError as err:
        # Same exception type as before, but say which symbol was rejected.
        raise KeyError(
            f"unsupported symbol {err.args[0]!r} in sequence; "
            f"expected one of {''.join(base_map)}"
        ) from err

    # torch.tensor([]) would be 1-D with shape (0,); keep the result 2-D so
    # empty sequences have the same rank as every other encoding.
    # 4 is the width of every base_map row.
    if not encoded_seq:
        return torch.empty((0, 4))

    return torch.tensor(encoded_seq)

In [5]:
# Load the base dataset, reading only the taxonomy ranks and the sequence column
taxonomy_columns = [
    'domain',
    'supergroup',
    'division',
    'subdivision',
    'class',
    'order',
    'family',
    'genus',
    'species',
]
csv = pd.read_csv(
    "./data/cleaned_sequences.csv",
    usecols=taxonomy_columns + ['truncated_sequence'],
)
csv.head(1)

Unnamed: 0,domain,supergroup,division,subdivision,class,order,family,genus,species,truncated_sequence
0,Eukaryota,Amoebozoa,,,,UI13E03-lineage,,,,GATAAGCCATGCAAATTTAAATTTAAGCCGGTTTCGGCGAAATTGT...


# Data Export 

In [6]:
# Base path to export the generated data
# NOTE(review): no other visible cell references `base_path`; later cells write to
# "./datas/..." and read from "../new_data/..." — confirm which directory is intended.
base_path = "./new_datas"

In [7]:
# Taxonomy levels to filter, listed from "domain" down to "species".
# NOTE(review): "supergroup", "division" and "subdivision" are read from the CSV
# but are not listed here — confirm that skipping them is intentional.
levels = ["domain", "class", "order", "family", "genus", "species"]

# # Format the row to the content format of the taxonomy file
# def taxonomy_format(row, target_level):
#     tax = []
#     for level in levels:
#         if level in row.index:
#             tax.append(str(level[0])+"__"+("" if pd.isna(row[level]) else row[level]))
#             if level == target_level:
#                 break
#     row["taxonomy"] = "; ".join(tax)
#     return row

# # Export data to a taxonomy file
# def taxonomy_generate(df, target_level, name, path):
#     tsv = df.apply(taxonomy_format, axis=1, args=(target_level,)).reset_index(names="seq_id")
#     tsv[["seq_id", "taxonomy"]].to_csv(path+"/"+name+"_taxonomy.txt", sep="\t", header=False, index=False, )


In [24]:
# # Generate the fasta file with the dataset data
# def fasta_generate(df, name, path):
#     with open(path+"/"+name+".fasta", "w+") as fasta:
#         for index, row in df.iterrows():
#             fasta.write(">"+str(index)+"\n")
#             fasta.write(row["truncated_sequence"]+"\n")
                
#         fasta.close()

In [25]:
# prop = 0.10        # Train size
# k_min = 10          # Minimum n of entries per class
# k_splits = k_min    # N of clusters for StratifiedSplit with KFold

# def StratifiedSplit(data, level, rand=42):
#     _, (X, y) = next(enumerate(StratifiedKFold(n_splits=k_splits, shuffle=True, random_state=rand).split(data.index, data[level])))
#     return (data.iloc[X], data.iloc[y])

# def StratifiedSplit2(data, level, rand=42):
#     return train_test_split(data, test_size=prop, stratify=data[level], random_state=rand)

# def RandomSplit(data, level=None, rand=42):
#     test_data = data.sample(frac=prop, random_state=rand)
#     return (data.drop(test_data.index), test_data)

In [9]:
# Split functions to be executed
# splitters = [
#     # StratifiedSplit, 
#     StratifiedSplit2,
#     RandomSplit,
#     ]

# For each selected taxonomy level, filter the base dataset and print its size.
# NOTE: the export and split helpers are commented out in this notebook, so this
# loop does not write any file; it only reports the filtered shapes.
for target_level in ["class", "order", "family", "genus", "species"]:

    # Load and filter the data for this level; with a minimum of 1 entry per
    # class, no class is discarded for being too small.
    dataset = load_data(csv, target_level, 1)
    
    # Remove subsequent levels (currently disabled)
    # for l in levels[levels.index(target_level)+1:]:
    #     dataset[l] = np.nan
    # dataset=dataset.dropna(subset=levels[:levels.index(target_level)])

    # (rows, columns) left after filtering for this level
    print(dataset.shape)
    
    


(123696, 10)
(101709, 10)
(92766, 10)
(95802, 10)
(66870, 10)


In [11]:
# Reload the same CSV with every column (the earlier read kept only the taxonomy
# ranks and the sequence).
# Note: this rebinds `dataset`, which the per-level loop above also assigns.
dataset = pd.read_csv("./data/cleaned_sequences.csv")
dataset

Unnamed: 0,domain,supergroup,division,subdivision,class,order,family,genus,species,truncated_sequence,start,end,label,gene,reference_sequence,taxo_id,seq_id,ambiguities,gb_taxonomy,silva_taxonomy
0,Eukaryota,Amoebozoa,,,,UI13E03-lineage,,,,GATAAGCCATGCAAATTTAAATTTAAGCCGGTTTCGGCGAAATTGT...,1,1543,U,18S_rRNA,1.0,2409,12004,0,,Eukaryota;Amorphea;Amoebozoa;Discosea;Flabelli...
1,Eukaryota,Amoebozoa,,,Lobosa-G1,,,,,GTTAAAACTCGTAATCGGAGTGTTCGGATAGAGGATAAAATGAATT...,1,1177,U,18S_rRNA,,2890,8962,0,Eukaryota; Amoebozoa; unclassified Amoebozoa,Eukaryota;Amorphea;Amoebozoa;Discosea;Flabelli...
2,Eukaryota,Amoebozoa,,,Lobosa-G1,,,,,CCTTGGAATAGNATAGNTAGTACGTTTAGAAGCTCGTGATCGGAGT...,1,1024,U,18S_rRNA,,2890,69164,3,Eukaryota; Amoebozoa; unclassified Amoebozoa,Eukaryota;Amorphea;Amoebozoa;Discosea;Flabelli...
3,Eukaryota,Amoebozoa,,,Lobosa-G1,,,,,AGACAGTTAAAAAGCTCGTAGTCGGAGTTGTCCGGATAGAGGATAA...,1,1210,U,18S_rRNA,,2890,144724,6,Eukaryota; Amoebozoa; unclassified Amoebozoa,Eukaryota;Amorphea;Amoebozoa;Discosea;Flabelli...
4,Eukaryota,Amoebozoa,Discosea,,Centramoebia,Acanthopodida,Acanthamoebidae,Acanthamoeba,Acanthamoeba_astronyxis,TCATATGCTTGTCTCAAAGATTAAGCCATGCATGTCTAAGTATAAA...,1,2559,U,18S_rRNA,,2834,68573,0,Eukaryota; Amoebozoa; Discosea; Longamoebia; C...,Eukaryota;Amorphea;Amoebozoa;Discosea;Longamoe...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156676,Eukaryota,TSAR,Telonemia,,,,Telonemia-Group-2,,,TACCTGGTTGATCCTGCCAGTAGTCATATGCTTGTCTCAAAGATTA...,1,1791,U,18S_rRNA,,10354,166134,0,Eukaryota; environmental samples,Eukaryota;Incertae Sedis;Telonema;uncultured e...
156677,Eukaryota,TSAR,Telonemia,,,,Telonemia-Group-2,,,AACCTGGTTGATCCTGCCAGTAGTCATATGCTTGTCTCAAAGATTA...,1,1788,U,18S_rRNA,,10354,166198,0,Eukaryota; environmental samples,Eukaryota;Incertae Sedis;Telonema;uncultured e...
156678,Eukaryota,TSAR,Telonemia,,,,Telonemia-Group-2,,,TACCTGGTTGATCCTGCCAGTAGTCATATGCTTGTCTCAAAGATTA...,1,1791,U,18S_rRNA,,10354,166276,0,Eukaryota; environmental samples,Eukaryota;Incertae Sedis;Telonema;uncultured e...
156679,Eukaryota,TSAR,Telonemia,,,,Telonemia-Group-2,,,TACCTGGTTGATCCTGCCAGTAGTCATATGCTTGTCTCAAAGATTA...,1,1795,U,18S_rRNA,,10354,166324,0,Eukaryota; environmental samples,Eukaryota;Incertae Sedis;Telonema;uncultured e...


In [12]:
# Only the first taxonomy level ("class") is inspected here: keep the rows
# that have a label at that level.
target_level = "class"
d = dataset.dropna(subset=[target_level])

d

Unnamed: 0,domain,supergroup,division,subdivision,class,order,family,genus,species,truncated_sequence,start,end,label,gene,reference_sequence,taxo_id,seq_id,ambiguities,gb_taxonomy,silva_taxonomy
1,Eukaryota,Amoebozoa,,,Lobosa-G1,,,,,GTTAAAACTCGTAATCGGAGTGTTCGGATAGAGGATAAAATGAATT...,1,1177,U,18S_rRNA,,2890,8962,0,Eukaryota; Amoebozoa; unclassified Amoebozoa,Eukaryota;Amorphea;Amoebozoa;Discosea;Flabelli...
2,Eukaryota,Amoebozoa,,,Lobosa-G1,,,,,CCTTGGAATAGNATAGNTAGTACGTTTAGAAGCTCGTGATCGGAGT...,1,1024,U,18S_rRNA,,2890,69164,3,Eukaryota; Amoebozoa; unclassified Amoebozoa,Eukaryota;Amorphea;Amoebozoa;Discosea;Flabelli...
3,Eukaryota,Amoebozoa,,,Lobosa-G1,,,,,AGACAGTTAAAAAGCTCGTAGTCGGAGTTGTCCGGATAGAGGATAA...,1,1210,U,18S_rRNA,,2890,144724,6,Eukaryota; Amoebozoa; unclassified Amoebozoa,Eukaryota;Amorphea;Amoebozoa;Discosea;Flabelli...
4,Eukaryota,Amoebozoa,Discosea,,Centramoebia,Acanthopodida,Acanthamoebidae,Acanthamoeba,Acanthamoeba_astronyxis,TCATATGCTTGTCTCAAAGATTAAGCCATGCATGTCTAAGTATAAA...,1,2559,U,18S_rRNA,,2834,68573,0,Eukaryota; Amoebozoa; Discosea; Longamoebia; C...,Eukaryota;Amorphea;Amoebozoa;Discosea;Longamoe...
5,Eukaryota,Amoebozoa,Discosea,,Centramoebia,Acanthopodida,Acanthamoebidae,Acanthamoeba,Acanthamoeba_astronyxis,TCATATGCTTGTCTCAAAGATTAAGCCATGCATGTCTAAGTATAAA...,1,2682,U,18S_rRNA,,2834,92137,2,Eukaryota; Amoebozoa; Discosea; Longamoebia; C...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156328,Eukaryota,TSAR,Stramenopiles,,Stramenopiles_X-Group-7,,,,,TCGTAGTTGAATTTCTGGTGACGCGGTTTGGCCTGTTGCTTAATTG...,1,968,U,18S_rRNA,,47902,109321,0,Eukaryota; Sar; Stramenopiles; environmental s...,Eukaryota;SAR;Stramenopiles;Ochrophyta;Diatome...
156329,Eukaryota,TSAR,Stramenopiles,,Stramenopiles_X-Group-7,,,,,AAAAAGCTCGTAGTTGAATTTCTGGTGACGCGGTTTGGCCTGTTGC...,1,975,U,18S_rRNA,,47902,135392,0,Eukaryota; Sar; Stramenopiles; environmental s...,Eukaryota;SAR;Stramenopiles;Ochrophyta;Diatome...
156330,Eukaryota,TSAR,Stramenopiles,,Stramenopiles_X-Group-8,,,,,CGGTAATTCCAGCTCCAATAGCGTATATTAAAGTTGTTGCAGTTAA...,1,1080,U,18S_rRNA,,47904,47657,0,Eukaryota; environmental samples,Eukaryota;SAR;Stramenopiles;Incertae Sedis;Pir...
156331,Eukaryota,TSAR,Stramenopiles,,Stramenopiles_X-Group-8,,,,,CGGTAATTCCAGCTCCAATAGCGTATATTAAAGTTGTTGCAGTTAA...,1,1081,U,18S_rRNA,,47904,115942,0,Eukaryota; environmental samples,Eukaryota;SAR;Stramenopiles;Incertae Sedis;Pir...


False

# Divisões geradas (generated train/test splits)

In [2]:
# Identifier of the experiment whose summarized CNN results are analysed below.
experiment_id = "1734322688"
# Summary table of that experiment; the next cell reads its "level" and "splitter" columns.
data = pd.read_csv("../CNN/results/summarized/"+experiment_id+"_models_train_test_400.csv")
data.head(2)

Unnamed: 0.1,Unnamed: 0,id,start_time,end_time,level,splitter,augmentation,batch_size,epochs,model,...,optimizer,mat_mul,obs,reserved_memory,error,best_epoch,train_acc_best_epoch,train_loss_best_epoch,test_acc_best_epoch,test_loss_best_epoch
0,0,0,1734323000.0,1734328000.0,class,prop_0-1/min_5/RandomSplit_0,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,...,"AdamW (params: {'weight_decay': 1.0, 'amsgrad'...",False,9:1 _ min:5,18446.0,,301,1.0,0.002273,0.98924,0.057505
1,1,1,1734329000.0,1734334000.0,class,prop_0-1/min_5/RandomSplit_14,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,...,"AdamW (params: {'weight_decay': 1.0, 'amsgrad'...",False,9:1 _ min:5,18494.0,,309,1.0,0.002153,0.989726,0.062389


In [6]:
# Display values for the three "/"-separated components of a split identifier
# such as "prop_0-1/min_5/RandomSplit_0": proportion label, minimum-entries
# value and splitter label (the text after "_" in the last component is the seed).
props = {"prop_0-2": "80:20", "prop_0-1": "90:10", "prop_0-05": "95:05"}
mins = {"min_5": 5, "min_10": 10}
splitters = {"RandomSplit": "Aleatória Simples", "StratifiedSplit2": "Estratificada"}


def _count_rows(csv_path):
    """Return the number of data rows of the CSV file at ``csv_path``."""
    return pd.read_csv(csv_path).shape[0]


# One summary record per experiment run: how the split was made and how many
# rows ended up in its train and test files.
divs = []

for _, run in data.iterrows():
    level = run["level"]
    split_id = run["splitter"]

    parts = split_id.split("/")
    prop_label = props[parts[0]]
    min_entries = mins[parts[1]]
    name_and_seed = parts[2].split("_")

    # Directory holding the exported datasets of this split and level
    split_dir = "../new_data/" + split_id + "/" + level

    divs.append({
        "level": level,
        "prop": prop_label,
        "min": min_entries,
        "splitter": splitters[name_and_seed[0]],
        "seed": name_and_seed[1],
        "train": _count_rows(split_dir + "/train_dataset.csv"),
        "test": _count_rows(split_dir + "/test_dataset.csv"),
    })

pd.DataFrame(divs).to_csv("./datas/divisoes.csv", index=False)



    

# CNN Results

In [13]:
# Load the summarized CNN runs of the experiment selected earlier through
# `experiment_id`, rather than repeating the identifier as a literal here.
cnn = pd.read_csv("../CNN/results/summarized/"+experiment_id+"_models_train_test_400.csv", index_col=0)

# "splitter" looks like "prop_0-1/min_5/RandomSplit_0": break it into its parts.
cnn[["Prop", "Min", "Splitter_Seed"]] = cnn["splitter"].str.split("/", expand=True)

# The seed is the text after the LAST underscore. Splitting from the right with
# n=1 always yields exactly two columns, even if a splitter name contains "_".
cnn[["Splitter","Seed"]] = cnn["Splitter_Seed"].str.rsplit("_", n=1, expand=True)
cnn

Unnamed: 0,id,start_time,end_time,level,splitter,augmentation,batch_size,epochs,model,loss_function,...,best_epoch,train_acc_best_epoch,train_loss_best_epoch,test_acc_best_epoch,test_loss_best_epoch,Prop,Min,Splitter_Seed,Splitter,Seed
0,0,1.734323e+09,1.734328e+09,class,prop_0-1/min_5/RandomSplit_0,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,CrossEntropyLoss (<class 'torch.nn.modules.los...,...,301,1.000000,0.002273,0.989240,0.057505,prop_0-1,min_5,RandomSplit_0,RandomSplit,0
1,1,1.734329e+09,1.734334e+09,class,prop_0-1/min_5/RandomSplit_14,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,CrossEntropyLoss (<class 'torch.nn.modules.los...,...,309,1.000000,0.002153,0.989726,0.062389,prop_0-1,min_5,RandomSplit_14,RandomSplit,14
2,2,1.734334e+09,1.734340e+09,class,prop_0-1/min_5/RandomSplit_56,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,CrossEntropyLoss (<class 'torch.nn.modules.los...,...,305,1.000000,0.002795,0.990454,0.048591,prop_0-1,min_5,RandomSplit_56,RandomSplit,56
3,3,1.734340e+09,1.734346e+09,class,prop_0-1/min_5/RandomSplit_84,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,CrossEntropyLoss (<class 'torch.nn.modules.los...,...,309,0.999991,0.002693,0.992072,0.042495,prop_0-1,min_5,RandomSplit_84,RandomSplit,84
4,4,1.734346e+09,1.734352e+09,class,prop_0-1/min_5/RandomSplit_92,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,CrossEntropyLoss (<class 'torch.nn.modules.los...,...,299,1.000000,0.002376,0.991506,0.046833,prop_0-1,min_5,RandomSplit_92,RandomSplit,92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,1.735814e+09,1.735815e+09,species,prop_0-05/min_10/StratifiedSplit2_84,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,CrossEntropyLoss (<class 'torch.nn.modules.los...,...,603,0.998015,0.010183,0.948630,0.361868,prop_0-05,min_10,StratifiedSplit2_84,StratifiedSplit2,84
396,396,1.735815e+09,1.735816e+09,species,prop_0-05/min_10/StratifiedSplit2_92,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,CrossEntropyLoss (<class 'torch.nn.modules.los...,...,627,0.999098,0.006085,0.933219,0.339491,prop_0-05,min_10,StratifiedSplit2_92,StratifiedSplit2,92
397,397,1.735816e+09,1.735817e+09,species,prop_0-05/min_10/StratifiedSplit2_101,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,CrossEntropyLoss (<class 'torch.nn.modules.los...,...,613,0.998195,0.007373,0.943493,0.373032,prop_0-05,min_10,StratifiedSplit2_101,StratifiedSplit2,101
398,398,1.735817e+09,1.735818e+09,species,prop_0-05/min_10/StratifiedSplit2_105,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,CrossEntropyLoss (<class 'torch.nn.modules.los...,...,608,0.997925,0.009456,0.950342,0.344410,prop_0-05,min_10,StratifiedSplit2_105,StratifiedSplit2,105


In [19]:
# Keep only the columns that go into the exported results table, in this order
result_columns = [
    'level',
    'Splitter',
    'Prop',
    'Min',
    'Seed',
    'epochs',
    'best_epoch',
    'test_acc_best_epoch',
]
cnn = cnn.loc[:, result_columns]
cnn


Unnamed: 0,level,Splitter,Prop,Min,Seed,epochs,best_epoch,test_acc_best_epoch
0,class,RandomSplit,prop_0-1,min_5,0,700,301,0.989240
1,class,RandomSplit,prop_0-1,min_5,14,700,309,0.989726
2,class,RandomSplit,prop_0-1,min_5,56,700,305,0.990454
3,class,RandomSplit,prop_0-1,min_5,84,700,309,0.992072
4,class,RandomSplit,prop_0-1,min_5,92,700,299,0.991506
...,...,...,...,...,...,...,...,...
395,species,StratifiedSplit2,prop_0-05,min_10,84,700,603,0.948630
396,species,StratifiedSplit2,prop_0-05,min_10,92,700,627,0.933219
397,species,StratifiedSplit2,prop_0-05,min_10,101,700,613,0.943493
398,species,StratifiedSplit2,prop_0-05,min_10,105,700,608,0.950342


In [20]:
# Write the trimmed results table to disk; the row index is not included in the file.
cnn.to_csv("./datas/CNN_results.csv", index=False)