# Label processing
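This notebook walks through how the MIC labels are processed: loading the per-gene antibiotic tables, filtering the antibiotics, cleaning the MIC values, encoding them as class indices, and saving the result.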

In [2]:
%pip install pandas numpy

Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the 'C:\Users\Cory\AppData\Local\Programs\Python\Python37\python.exe -m pip install --upgrade pip' command.


In [3]:
import pandas as pd
import numpy as np

In [5]:
# Directory (relative to this notebook) that holds the per-gene antibiotic MIC tables.
data_dir = "../data/main/"

# DEPRECATED: hard-coded ladder of MIC values. Kept for reference only; the
# encoding step later in this notebook builds its MIC set from the data itself.
possible_mics = [
    0.001, 0.003, 0.007, 0.01, 0.015, 0.02, 0.03, 0.06, 0.12, 0.25, 0.5,
    1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0,
]

# Update 2
We will be working with three genes (OMPK35, OMPK36 and OMPK37). Each gene may cover a different set of isolates: if an isolate has missing coverage (holes) for a gene, it is excluded for that gene only, so it can be present for one gene and absent for another. We still want a single table of antibiotic MIC values covering every isolate, so we load the antibiotic file of each gene and combine them first.

In [118]:
# Load the antibiotic MIC table of each gene; the first column of every file
# becomes the index.
ompk35, ompk36, ompk37 = (
    pd.read_csv(f'{data_dir}antibiotics_{gene}.tsv', sep='\t', index_col=0)
    for gene in ('OMPK35', 'OMPK36', 'OMPK37')
)

# Rejected attempt 1: ends up making each column have a list of 3 values for each gene.
#   labels_df = pd.concat([ompk35, ompk36, ompk37], axis=0)
#
# Rejected attempt 2: really only keeps OMPK35 or OMPK36, gets no MICs from OMPK37.
#   labels_df = pd.merge(ompk35, ompk36, how='outer', on='Name', suffixes=('', '_y'))
#   labels_df = pd.merge(labels_df, ompk37, how='outer', on='Name', suffixes=('', '_y'))
#   # Code retrieved from: https://stackoverflow.com/a/38034085
#   labels_df.drop(labels_df.filter(regex='_y$').columns.tolist(), axis=1, inplace=True)

# Attempt 3 (the one in use): combine_first merges the columns of all three
# frames while keeping the non-null values. An isolate is expected to have the
# same MIC for every gene, so it does not matter which gene's value is taken
# when more than one is non-null.
labels_df = ompk35.combine_first(ompk36).combine_first(ompk37)

# Remove the 'consensus' and 'reference' rows.
labels_df = labels_df[~labels_df.index.isin(['consensus', 'reference'])]



In [119]:
# NOTE(review): disabled alternative that reads a single antibiotics.tsv file;
# presumably superseded by the per-gene combine above -- confirm before deleting.
# labels_df = pd.read_csv(f'{data_dir}antibiotics.tsv', sep='\t', index_col=0)
# Display the index of the combined label table.
labels_df.index

Index(['Sentry-2016-933272', 'Sentry-2016-933582', 'Sentry-2016-934664',
       'Sentry-2016-934829', 'Sentry-2016-934925', 'Sentry-2016-934954',
       'Sentry-2016-935073', 'Sentry-2016-935482', 'Sentry-2016-935518',
       'Sentry-2016-935521',
       ...
       'Sentry-2018-1087745', 'Sentry-2018-1087757', 'Sentry-2018-1087767',
       'Sentry-2018-1087789', 'Sentry-2018-1087802', 'Sentry-2018-1087803',
       'Sentry-2018-1087818', 'Sentry-2018-1087824', 'Sentry-2018-1087829',
       'Sentry-2018-1087836'],
      dtype='object', name='Name', length=2864)

In [120]:
labels_df.head()

Unnamed: 0_level_0,Meropenem,Tetracycline,Cephalexin,TrimSulfa,Piperacillin-tazobactam,Doxycycline,Aztreonam,Gentamicin,Levofloxacin,Tobramycin,...,Cefoperazone-sulbactam,Amikacin,Delafloxacin,Cefuroxime,TigecyclineMR,Cefazolin,Cefepime-tazobactam,Doripenem,Ceftazidime,MeroRPX7009_fixed8
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sentry-2016-933272,0.06,4,,2,4,4,0.12,0.5,1,0.5,...,0.5,2,4,,0.5,,,0.5,0.5,0.12
Sentry-2016-933582,0.03,1,,<=0.50,2,1,0.06,0.25,0.06,0.25,...,<=0.25,1,0.06,,0.25,,,<=0.06,0.12,0.03
Sentry-2016-934664,0.03,2,,<=0.50,2,1,0.06,0.25,<=0.03,0.25,...,<=0.25,2,0.06,,0.5,,,<=0.06,0.12,0.03
Sentry-2016-934829,32.0,8,,>4.00,>64.00,8,>16.00,2,>4.00,>8.00,...,>32.00,>32.00,>8.00,,1.0,,,>8.00,>32.00,0.03
Sentry-2016-934925,32.0,>16.00,,>4.00,>64.00,>8.00,>16.00,>8.00,>4.00,>8.00,...,>32.00,4,>8.00,,1.0,,,>8.00,>32.00,0.06


# Update 3 (Only include Beta-Lactams and antibiotics that were tested on at least 90% of isolates)
For the research, we only want to test on beta-lactam antibiotics, and we only want to include antibiotics that have MIC values for at least 90% of the isolates. Concretely, a column is kept only if fewer than 10% of its values are missing (NaN).

In [121]:
# Only get Beta-lactam antibiotics
# Beta-lactam antibiotics (including inhibitor combinations) that may be kept
# as label columns. Each name is listed exactly once.
beta_lactams = [
    'Penicillin', 'Amoxicillin', 'Ampicillin', 'Piperacillin', 'Oxacillin', 'Mecillinam',
    'Amoxicillin-clavulanate', 'Ampicillin-sulbactam', 'Aztreonam-avibactam',
    'Cefepime-tazobactam', 'Cefepime-zidebactam', 'Cefoperazone-sulbactam',
    'Ceftaroline-avibactam', 'Ceftazidime-avibactam', 'Ceftibuten-clavulanate_fixed_2',
    'Ceftibuten-clavulanate_2_to_1', 'Ceftolozane-tazobactam', 'Meropenem-nacubactam',
    'Meropenem-vaborbactam', 'Piperacillin-tazobactam', 'Ticarcillin-clavulanate',
    'Cefazolin', 'Cefuroxime', 'Cefoperazone', 'Ceftazidime', 'Ceftriaxone', 'Cefepime',
    'Ceftaroline', 'Ceftobiprole', 'Cefoxitin', 'Cefiderocol', 'Cefpodoxime',
    'Cefpodoxime_ETX1317', 'Ceftibuten', 'Cephalexin', 'Aztreonam', 'Biapenem',
    'Doripenem', 'Ertapenem', 'Imipenem', 'Meropenem', 'MeroRPX7009_fixed8', 'Razupenem',
    'Tebipenem', 'Faropenem', 'Sulopenem',
]

# Keep only the beta-lactam columns (original column order is preserved).
print(len(labels_df.columns))
labels_df = labels_df.loc[:, labels_df.columns.isin(beta_lactams)]

# Filter out columns in which 10% or more of the values are missing (NaN/null).
# Code adapted from: https://stackoverflow.com/a/31618099
max_number_of_nans = len(labels_df.index) * 0.1
print(max_number_of_nans)
print(len(labels_df.columns))
nan_counts = labels_df.isnull().sum()  # number of missing values per column
labels_df = labels_df.loc[:, nan_counts < max_number_of_nans]
print(len(labels_df.columns))

47
286.40000000000003
27
14


In [122]:
labels_df.head()

Unnamed: 0_level_0,Meropenem,Piperacillin-tazobactam,Aztreonam,Ceftriaxone,Ceftaroline,Cefepime,Ampicillin-sulbactam,Ceftazidime-avibactam,Ceftobiprole,Imipenem,Ceftolozane-tazobactam,Doripenem,Ceftazidime,MeroRPX7009_fixed8
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Sentry-2016-933272,0.06,4,0.12,0.12,0.25,<=0.12,16,0.25,0.06,2,0.5,0.5,0.5,0.12
Sentry-2016-933582,0.03,2,0.06,<=0.06,0.06,<=0.12,8,0.12,0.06,<=0.12,0.25,<=0.06,0.12,0.03
Sentry-2016-934664,0.03,2,0.06,<=0.06,0.06,<=0.12,8,0.12,0.03,<=0.12,0.12,<=0.06,0.12,0.03
Sentry-2016-934829,32.0,>64.00,>16.00,>8.00,>32.00,>16.00,>32.00,0.5,>16.00,>8.00,>32.00,>8.00,>32.00,0.03
Sentry-2016-934925,32.0,>64.00,>16.00,>8.00,>32.00,>16.00,>32.00,2.0,>16.00,>8.00,>32.00,>8.00,>32.00,0.06


## Dropping characters
Now that we have the data in a dataframe, we need to drop the `>` and `<=` characters from all values. A value that contains these characters is a string, while a value without them may be an int or a float. We therefore check the type of each value, strip the characters when they are present, and convert every value to a float. We will write a function that does this and apply it to all columns.

Good help on efficient ways to accomplish this task can be found in [this StackOverflow answer](https://stackoverflow.com/a/54302517)

# Update 1 (for dropping characters and encoding)
The `encode_mics` function has been separated out into `get_mics` and `encode_mics`. This is because there could be any number of MIC values within the dataframes. We will have to first drop all characters and make NaN values -1. Then, once we union all genes, we can get the set of MICs found and encode on that.

In [123]:
def get_mics(col):
    """Convert raw MIC entries to floats, mapping missing values to -1.0.

    Parameters
    ----------
    col : iterable
        Raw MIC entries. Each entry may be a string with an optional
        comparison prefix (``'<=0.12'``, ``'>64.00'``, ``'>=8'``, ``'<0.5'``),
        a plain number, or a missing value (NaN, ``None``, ``pd.NA``).

    Returns
    -------
    pandas.Series
        Float series with one value per entry, in the same order, on a default
        integer index. Missing entries become ``-1.0``.

    Raises
    ------
    ValueError
        If a string entry is not a number once its prefix is removed.
    """
    def try_extract(x):
        if isinstance(x, str):
            # Drop surrounding whitespace, then the whole leading run of
            # comparison characters, so '<=', '>=', '<' and '>' are all handled.
            return float(x.strip().lstrip('<>='))
        if pd.isna(x):
            # pd.isna also recognises None and pd.NA, which np.isnan rejects.
            return -1.0
        return float(x)

    return pd.Series([try_extract(x) for x in col], dtype=float)

def encode_mics(col, set_mics=None):
    """Replace each MIC value by its position in ``set_mics``.

    Parameters
    ----------
    col : iterable
        MIC values to encode.
    set_mics : sequence, optional
        Ordered collection of the known MIC values. The position of a value in
        this sequence is its code; if a value occurs more than once, its first
        position is used. Defaults to an empty collection.

    Returns
    -------
    pandas.Series
        Integer series with one code per entry, in the same order, on a
        default integer index.

    Raises
    ------
    ValueError
        If an entry of ``col`` does not occur in ``set_mics``.
    """
    # Build the value -> position table once instead of scanning the sequence
    # for every element.
    codes = {}
    for position, mic in enumerate(() if set_mics is None else set_mics):
        codes.setdefault(mic, position)

    try:
        encoded = [codes[x] for x in col]
    except KeyError as err:
        raise ValueError(f"MIC value {err.args[0]!r} is not in set_mics") from None

    return pd.Series(encoded, dtype=int)

In [124]:
# get_mics converts values one by one, so the result is the same whichever axis
# is used. Applying it per column (axis=0) calls it once per antibiotic rather
# than once per isolate. result_type='broadcast' keeps the original index and
# column labels.
labels_df = labels_df.apply(get_mics, axis=0, result_type='broadcast')
labels_df.head()

Unnamed: 0_level_0,Meropenem,Piperacillin-tazobactam,Aztreonam,Ceftriaxone,Ceftaroline,Cefepime,Ampicillin-sulbactam,Ceftazidime-avibactam,Ceftobiprole,Imipenem,Ceftolozane-tazobactam,Doripenem,Ceftazidime,MeroRPX7009_fixed8
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Sentry-2016-933272,0.06,4.0,0.12,0.12,0.25,0.12,16.0,0.25,0.06,2.0,0.5,0.5,0.5,0.12
Sentry-2016-933582,0.03,2.0,0.06,0.06,0.06,0.12,8.0,0.12,0.06,0.12,0.25,0.06,0.12,0.03
Sentry-2016-934664,0.03,2.0,0.06,0.06,0.06,0.12,8.0,0.12,0.03,0.12,0.12,0.06,0.12,0.03
Sentry-2016-934829,32.0,64.0,16.0,8.0,32.0,16.0,32.0,0.5,16.0,8.0,32.0,8.0,32.0,0.03
Sentry-2016-934925,32.0,64.0,16.0,8.0,32.0,16.0,32.0,2.0,16.0,8.0,32.0,8.0,32.0,0.06


In [125]:
# Distinct values found anywhere in the table, in ascending order. The position
# of a value in this list is the class index it is encoded as.
set_mics = sorted(set(labels_df.values.ravel()))

# encode_mics works value by value, so it is applied per column (axis=0): one
# call per antibiotic instead of one per isolate. result_type='broadcast' keeps
# the original index and column labels.
labels_df = labels_df.apply(encode_mics, axis=0, result_type='broadcast', set_mics=set_mics)

In [126]:
labels_df.head()

Unnamed: 0_level_0,Meropenem,Piperacillin-tazobactam,Aztreonam,Ceftriaxone,Ceftaroline,Cefepime,Ampicillin-sulbactam,Ceftazidime-avibactam,Ceftobiprole,Imipenem,Ceftolozane-tazobactam,Doripenem,Ceftazidime,MeroRPX7009_fixed8
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Sentry-2016-933272,3,9,4,4,5,4,11,5,3,8,6,6,6,4
Sentry-2016-933582,2,8,3,3,3,4,10,4,3,4,5,3,4,2
Sentry-2016-934664,2,8,3,3,3,4,10,4,2,4,4,3,4,2
Sentry-2016-934829,12,13,11,10,12,11,12,6,11,10,12,10,12,2
Sentry-2016-934925,12,13,11,10,12,11,12,8,11,10,12,10,12,3


## Saving
Lastly, we just need to save the file.

In [127]:
# Write the encoded labels to 'labels.csv' (path relative to the current working
# directory); the index is written as the first column.
labels_df.to_csv('labels.csv')

In [128]:
# Read the file back and show its first rows to check what was written.
pd.read_csv('labels.csv').head()

Unnamed: 0,Name,Meropenem,Piperacillin-tazobactam,Aztreonam,Ceftriaxone,Ceftaroline,Cefepime,Ampicillin-sulbactam,Ceftazidime-avibactam,Ceftobiprole,Imipenem,Ceftolozane-tazobactam,Doripenem,Ceftazidime,MeroRPX7009_fixed8
0,Sentry-2016-933272,3,9,4,4,5,4,11,5,3,8,6,6,6,4
1,Sentry-2016-933582,2,8,3,3,3,4,10,4,3,4,5,3,4,2
2,Sentry-2016-934664,2,8,3,3,3,4,10,4,2,4,4,3,4,2
3,Sentry-2016-934829,12,13,11,10,12,11,12,6,11,10,12,10,12,2
4,Sentry-2016-934925,12,13,11,10,12,11,12,8,11,10,12,10,12,3
