# Preparing Labels for ML

## Import Required Modules

In [1]:
import os
from pathlib import Path
import re
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

BITOME_KB_PATH = Path('..', 'bitome-kb')
sys.path.append(BITOME_KB_PATH.absolute().as_posix())

from bitome.core import Bitome

### Define Global Variables

In [None]:
# all locations are relative to the directory the notebook is run from
DATA_PATH = Path('..') / 'data'                  # input data files read by this notebook
CLASS_DATA_PATH = DATA_PATH / 'classification'   # destination of the train/test label CSVs
LOCAL_CACHE_PATH = Path('..') / 'local_cache'    # holds the saved Bitome knowledgebase file

## Load Pre-Prepared Bitome Knowledgebase

In [2]:
# restore the pre-prepared Bitome knowledgebase from its file in the local cache directory
bitome_cache_file = LOCAL_CACHE_PATH / 'bitome.pkl'
bitome = Bitome.init_from_file(bitome_cache_file)

## Load ALE SNP Data

In [3]:
# --- SNP frequency by nt information frequency ---
ale_data_path = Path(DATA_PATH, 'aledb_muts_public')

# Collect the SNP rows of every ALEdb mutation file and concatenate them once at the end;
# calling pd.concat inside the loop would re-copy all previously loaded rows on every file.
snp_frames = []

# sorted() makes the row order of snp_df reproducible (os.listdir returns entries in arbitrary order)
for mutation_data_file in sorted(os.listdir(ale_data_path)):
    # hidden entries (e.g. OS metadata files) are not mutation tables
    if mutation_data_file.startswith('.'):
        continue

    # only 5 of the columns are needed; because explicit names are given, no line of the
    # file is treated as a header row
    mutation_df = pd.read_csv(
        Path(ale_data_path, mutation_data_file),
        index_col=None,
        usecols=[0, 1, 2, 3, 10],
        names=['Mut ID', 'Reference Seq', 'Position', 'Mutation Type', 'Details']
    )
    # verified that there are no variants of the NC_000913 ref seq tag in any of the files
    mutation_df_ecoli = mutation_df[mutation_df['Reference Seq'] == 'NC_000913']
    mutation_df_snps = mutation_df_ecoli[mutation_df_ecoli['Mutation Type'] == 'SNP'][[
        'Position',
        'Details'
    ]]
    snp_frames.append(mutation_df_snps)

# each file's original row index is kept (no ignore_index), so index values may repeat across files
if snp_frames:
    snp_df = pd.concat(snp_frames)
else:
    # no mutation files found: fall back to an empty table with the expected columns
    snp_df = pd.DataFrame(columns=['Position', 'Details'])

def snp_type_from_details(snp_row) -> str:
    """
    Classifies a SNP based on the details string of its ALEdb record

    Details strings mentioning 'intergenic', 'pseudogene' or 'noncoding' are all reported as 'intergenic'.
    Otherwise the first amino acid change code found (e.g. 'L375M') is compared end to end: identical
    first and last characters give 'synonymous', a '*' as last character gives 'nonsense', and anything
    else gives 'missense'.

    :param snp_row: a row-like object whose Details attribute holds the ALEdb details string
    :return str type_of_snp: one of 'intergenic', 'synonymous', 'nonsense' or 'missense'
    :raises IndexError: if the details string is not flagged as non-coding and contains no amino acid change code
    """
    annotation = snp_row.Details

    non_coding_markers = ('intergenic', 'pseudogene', 'noncoding')
    if any(marker in annotation for marker in non_coding_markers):
        return 'intergenic'

    # only the first amino acid change code in the string is considered
    aa_change = re.findall(r'[A-Z*][0-9]{1,5}[A-Z*]', annotation)[0]
    original_aa, mutated_aa = aa_change[0], aa_change[-1]

    if original_aa == mutated_aa:
        return 'synonymous'
    if mutated_aa == '*':
        return 'nonsense'
    return 'missense'

def snp_aa_from_details(snp_row, mutated=False) -> str:
    """
    Extracts one amino acid letter from the details string of an ALEdb SNP record

    The first amino acid change code found in the string (e.g. 'L375M') is used; its first character
    is the original amino acid and its last character is the amino acid it was mutated to.

    :param snp_row: a row-like object whose Details attribute holds the ALEdb details string
    :param bool mutated: if True, return the amino acid that was mutated TO instead of the original one
    :return str snp_aa: the selected amino acid letter; None if the details string mentions 'intergenic',
        'pseudogene' or 'noncoding', or if the selected character is the stop symbol '*'
    :raises IndexError: if the details string is not flagged as non-coding and contains no amino acid change code
    """
    annotation = snp_row.Details

    if any(marker in annotation for marker in ('intergenic', 'pseudogene', 'noncoding')):
        return None

    # only the first amino acid change code in the string is considered
    aa_change = re.findall(r'[A-Z*][0-9]{1,5}[A-Z*]', annotation)[0]
    selected_aa = aa_change[-1] if mutated else aa_change[0]

    # a stop codon is not an amino acid
    return None if selected_aa == '*' else selected_aa


# tidy the combined table: strip the ',' separators from the position strings and cast them to int,
# then derive the SNP type and the original/mutated amino acids from the Details column
snp_df['Position'] = snp_df.apply(lambda snp: int(snp.Position.replace(',', '')), axis=1)
snp_df['Type'] = snp_df.apply(snp_type_from_details, axis=1)
snp_df['From'] = snp_df.apply(snp_aa_from_details, axis=1)
# extra keyword arguments given to DataFrame.apply are forwarded to the applied function
snp_df['To'] = snp_df.apply(snp_aa_from_details, axis=1, mutated=True)

# A separate file holds all K12 mutants with midpoints and technical replicates already removed,
# so it is used here to filter down the public data. This is a little hacky compared with using
# Patrick's pipeline to do this filtering directly on public ALEdb.
filtered_mut_df_private = pd.read_pickle(Path(DATA_PATH, 'filtered_K12_mut_df.pkl'))
private_is_snp = filtered_mut_df_private['Mutation Type'] == 'SNP'
snp_df_private = filtered_mut_df_private.loc[private_is_snp]
private_is_coding = snp_df_private['coding'] == 1
filtered_coding_snp_df_private = snp_df_private.loc[private_is_coding]

# keep only the public SNPs whose position also appears among the filtered private coding SNPs
retained_positions = filtered_coding_snp_df_private['Position'].astype(int)
snp_df = snp_df.loc[snp_df['Position'].isin(retained_positions)]
snp_df

Unnamed: 0,Position,Details,Type,From,To
2,1196220,H366H (CAC→CAT),synonymous,H,H
3,1196232,T370T (ACC→ACT),synonymous,T,T
4,1196245,L375M (TTA→CTG),missense,L,M
5,1196247,L375M (TTA→CTG),missense,L,M
6,1677707,S66* (TCA→TAA),nonsense,S,
...,...,...,...,...,...
28,3060146,Q132K (CAG→AAG),missense,Q,K
29,3471274,G19V (GGC→GTC),missense,G,V
33,4430521,I220V (ATA→GTA),missense,I,V
34,1405,S357R (AGC→CGC),missense,S,R


## Determine Gene Labels

Only the ALE SNP labels need to be computed here; essentiality is already available as an attribute of the gene objects.

In [4]:
# gene_type values that are counted as RNA-coding genes
RNA_GENE_TYPES = ('tRNA', 'tmRNA', 'antisense_RNA', 'rRNA')

protein_coding_genes = bitome.coding_genes
rna_coding_genes = [
    candidate for candidate in bitome.genes
    if candidate.gene_type in RNA_GENE_TYPES
]

# protein-coding genes first, then RNA genes; every per-gene label list follows this order
genes_to_use = protein_coding_genes + rna_coding_genes

# essentiality label per gene, cast to an integer
essential_labels = [int(gene.essential) for gene in genes_to_use]

In [5]:
# ensure that the SNP indices are shifted from 1-indexing to 0-indexing
unique_snp_locs = snp_df['Position'].unique() - 1

# sort the SNP positions once so each gene can be checked with two binary searches, instead of
# rebuilding a set of SNP positions and a set of every position in the gene on each iteration
sorted_snp_locs = np.sort(unique_snp_locs)

# for each gene, determine if it has a SNP or not (1 = at least one SNP inside the gene, 0 = none)
snp_labels = []

for gene in genes_to_use:
    gene_start, gene_end = gene.location.start.position, gene.location.end.position
    # the gene covers the half-open interval [gene_start, gene_end), matching range(gene_start, gene_end);
    # it contains a SNP exactly when some sorted position is >= gene_start and < gene_end
    first_idx_in_gene = np.searchsorted(sorted_snp_locs, gene_start, side='left')
    first_idx_past_gene = np.searchsorted(sorted_snp_locs, gene_end, side='left')
    snp_labels.append(int(first_idx_in_gene < first_idx_past_gene))

## Define and Save Locus Tag to Label Lookups

We want to try out several featurization schemes, so those will be implemented in the main ML notebook.

However, the test (lockbox) set must be defined now, before any featurization or modeling, to avoid data leakage. We use a stratified 80/20 train/test split with a fixed random seed.

In [23]:
# genes are identified by locus tag in the saved lookups
locus_tags = [g.locus_tag for g in genes_to_use]

# The two label sets are split independently: each split is stratified on its own labels, so
# the SNP and essentiality train/test partitions are not guaranteed to contain the same genes.
# random_state is fixed so that re-running the notebook reproduces the same partitions.
genes_snp_train, genes_snp_test, y_train_snp, y_test_snp = train_test_split(
    locus_tags,
    snp_labels,
    test_size=0.2,
    random_state=42,
    stratify=snp_labels
)

genes_ess_train, genes_ess_test, y_train_ess, y_test_ess = train_test_split(
    locus_tags,
    essential_labels,
    test_size=0.2,
    random_state=42,
    stratify=essential_labels
)

# to_csv does not create missing directories, so make sure the output directory exists first
CLASS_DATA_PATH.mkdir(parents=True, exist_ok=True)

# NOTE(review): the essentiality series are also named 'SNP', which looks like a copy-paste slip.
# The name is left unchanged because it becomes the CSV column header and downstream readers may
# rely on it - confirm against the main ML notebook before renaming.
label_lookups = (
    ('snp_train.csv', y_train_snp, genes_snp_train),
    ('snp_test.csv', y_test_snp, genes_snp_test),
    ('essential_train.csv', y_train_ess, genes_ess_train),
    ('essential_test.csv', y_test_ess, genes_ess_test),
)

# one CSV per partition: locus tag as the index, label as the single value column
for lookup_file_name, lookup_labels, lookup_locus_tags in label_lookups:
    pd.Series(lookup_labels, name='SNP', index=lookup_locus_tags).to_csv(Path(CLASS_DATA_PATH, lookup_file_name))