In [1]:
import pandas as pd

amr_mics_1 = pd.read_csv("PRJNA292661_PATRIC_genome_amr.csv")
amr_mics_2 = pd.read_csv("PRJNA292666_PATRIC_genome_amr.csv")
frames = [amr_mics_1, amr_mics_2]

amr_mics = pd.concat(frames, axis=0, join='outer')

amr_mics

Unnamed: 0,Taxon ID,Genome ID,Genome Name,Antibiotic,Resistant Phenotype,Measurement,Measurement Sign,Measurement Value,Measurement Unit,Laboratory Typing Method,...,Laboratory Typing Platform,Vendor,Testing Standard,Testing Standard Year,Computational Method,Computational Method Version,Computational Method Performance,Evidence,Source,PubMed
0,590,590.16443,Salmonella enterica SRR4280633,ceftriaxone,,<=0.25,<=,0.250,mg/L,broth microdilution,...,Sensititre,Thermo Scientific,,,,,,Laboratory Method,,30333126
1,590,590.16009,Salmonella enterica SRR3295560,trimethoprim/sulfamethoxazole,,<=0.12,<=,0.120,mg/L,broth microdilution,...,Sensititre,Thermo Scientific,,,,,,Laboratory Method,,30333126
2,590,590.12910,Salmonella enterica SRR3664721,ciprofloxacin,,<=0.015,<=,0.015,mg/L,broth microdilution,...,Sensititre,Thermo Scientific,,,,,,Laboratory Method,,30333126
3,590,590.15053,Salmonella enterica SRR3933070,gentamicin,,0.5,,0.500,mg/L,broth microdilution,...,Sensititre,Thermo Scientific,,,,,,Laboratory Method,,30333126
4,590,590.14158,Salmonella enterica SRR3933004,chloramphenicol,,8,,8.000,mg/L,broth microdilution,...,Sensititre,Thermo Scientific,,,,,,Laboratory Method,,30333126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15200,590,590.12438,Salmonella enterica SRR3038498,nalidixic acid,,4,,4.000,mg/L,broth microdilution,...,Sensititre,Thermo Scientific,,,,,,Laboratory Method,,30333126
15201,590,590.14449,Salmonella enterica SRR3295778,streptomycin,,>64,>,64.000,mg/L,broth microdilution,...,Sensititre,Thermo Scientific,,,,,,Laboratory Method,,30333126
15202,590,590.12605,Salmonella enterica SRR3180052,sulfisoxazole,,32,,32.000,mg/L,broth microdilution,...,Sensititre,Thermo Scientific,,,,,,Laboratory Method,,30333126
15203,590,590.14903,Salmonella enterica SRR3295810,chloramphenicol,,=8,=,8.000,mg/L,broth microdilution,...,,Trek Diagnostics,CLSI,2015.0,,,,Laboratory Method,28784677.0,30333126


In [2]:
# 'Antibiotics' array
antibiotics_array = amr_mics['Antibiotic'].unique()
# 'Antibiotics' list
antibiotics_list = antibiotics_array.tolist()

# List of Antibiotic MIC DataDrames 
amr_frames=[]
for i in range(len(antibiotics_list)):
    single_amr = amr_mics.loc[amr_mics.Antibiotic==antibiotics_list[i]].loc[:,['Genome ID','Measurement Value']].rename(columns={'Measurement Value': antibiotics_list[i]})
    aux = single_amr.drop_duplicates(subset='Genome ID', keep='first').sort_values(by='Genome ID')
    amr_frames.append(aux)
    
# Merge de DataFrames
from functools import reduce

amr = reduce(lambda  left,right: pd.merge(left,right,on='Genome ID',how='outer'), amr_frames)

amr

Unnamed: 0,Genome ID,ceftriaxone,trimethoprim/sulfamethoxazole,ciprofloxacin,gentamicin,chloramphenicol,nalidixic acid,ampicillin,sulfisoxazole,cefoxitin,tetracycline,ceftiofur,amoxicillin/clavulanic acid,streptomycin,azithromycin,kanamycin
0,590.11958,0.25,0.125,0.015,0.50,8.0,4.0,1.0,16.0,4.0,32.0,0.5,1.0,8.0,8.0,
1,590.11959,0.25,0.500,0.015,0.50,8.0,4.0,1.0,256.0,4.0,32.0,1.0,1.0,64.0,8.0,
2,590.11960,0.25,0.125,0.030,0.50,8.0,4.0,1.0,32.0,4.0,32.0,0.5,1.0,4.0,8.0,
3,590.11961,0.25,0.125,0.030,0.25,8.0,4.0,1.0,32.0,4.0,32.0,1.0,1.0,8.0,8.0,
4,590.11962,0.25,0.125,0.500,0.50,8.0,32.0,1.0,64.0,2.0,4.0,0.5,1.0,4.0,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1932,590.17889,0.25,0.120,0.015,1.00,8.0,4.0,1.0,32.0,2.0,32.0,1.0,1.0,4.0,8.0,
1933,590.17891,4.00,0.250,0.030,8.00,8.0,4.0,32.0,256.0,16.0,32.0,8.0,32.0,64.0,4.0,
1934,590.17894,8.00,0.250,0.030,1.00,32.0,4.0,32.0,256.0,32.0,32.0,8.0,32.0,64.0,4.0,64.0
1935,590.17895,8.00,0.120,0.015,0.25,8.0,4.0,32.0,256.0,32.0,32.0,8.0,32.0,4.0,4.0,


In [3]:
# Oligonucleotides composition: 

# Oligonucleotides are DNA or RNA molecules made up of a sequence
# of nucleotides. The length of the oligonucleotide is represented as
# "k-mer" (kk being the number of nucleotides in one oligonulceotide).

# In this analysis, I'll be looking at the 3-mers (or trinucleotides)
# and 4-mer (or tetranucleotides) compositions against their
#normalized frquency of occurrences.

# tri-nucleotide compositions
trimers = ["AAA", "AAC", "AAG", "AAT", "ACA", "ACC", "ACG", "ACT", "AGA", "AGC", "AGG", "ATA", "ATC", "ATG", "CAA", 
           "CAC", "CAG", "CCA","CCC","CCG","CGA","CGC","CTA","CTC","GAA","GAC","GCA","GCC","GGA","GTA","TAA","TCA"]

# function that counts the times a trimer appears
def trimer_composition(genome):
    trimer_dict = dict()
    for trimer in trimers:
        trimer_dict[trimer] = genome.count(trimer)
    return trimer_dict

# tetra-nucleotide compositions
tetramers = ['AAAA','AAAC','AAAG','AAAT','AACA','AACC','AACG','AACT','AAGA','AAGC','AAGG','AAGT','AATA','AATC',
             'AATG','AATT','ACAA','ACAC','ACAG','ACAT','ACCA','ACCC','ACCG','ACCT','ACGA','ACGC','ACGG','ACGT',
             'ACTA','ACTC','ACTG','AGAA','AGAC','AGAG','AGAT','AGCA','AGCC','AGCG','AGCT','AGGA','AGGC','AGGG',
             'AGTA','AGTC','AGTG','ATAA','ATAC','ATAG','ATAT','ATCA','ATCC','ATCG','ATGA','ATGC','ATGG','ATTA',
             'ATTC','ATTG','CAAA','CAAC','CAAG','CACA','CACC','CACG','CAGA','CAGC','CAGG','CATA','CATC','CATG',
             'CCAA','CCAC','CCAG','CCCA','CCCC','CCCG','CCGA','CCGC','CCGG','CCTA','CCTC','CGAA','CGAC','CGAG',
             'CGCA','CGCC','CGCG','CGGA','CGGC','CGTA','CGTC','CTAA','CTAC','CTAG','CTCA','CTCC','CTGA','CTGC',
             'CTTA','CTTC','GAAA','GAAC','GACA','GACC','GAGA','GAGC','GATA','GATC','GCAA','GCAC','GCCA','GCCC',
             'GCGA','GCGC','GCTA','GGAA','GGAC','GGCA','GGCC','GGGA','GGTA','GTAA','GTAC','GTCA','GTGA','GTTA',
             'TAAA','TACA','TAGA','TATA','TCAA','TCCA','TCGA','TGAA','TGCA','TTAA']

# function that counts the times a 4-mer appears
def tetramer_composition(genome):
    tetramer_dict = dict()
    for tetramer in tetramers:
        if tetramer in genome:
            tetramer_dict[tetramer] = genome.count(tetramer)
        else:
            tetramer_dict[tetramer] = 0
    return tetramer_dict

In [5]:
# Genomic 3-mers composition

import numpy as np

# List of float 'Genome ID's
genome_list = amr_mics.loc[:,'Genome ID'].sort_values().unique().tolist()

# List of str 'Genome ID's
genome_ids=[]
for i in range(800):
    gen_aux = str(float('%.8g' %genome_list[i]))
    if len(gen_aux)==9:
        genome_ids.append(gen_aux)
    elif len(gen_aux)==8:
        genome_ids.append(gen_aux+'0')
    elif len(gen_aux)==7:
        genome_ids.append(gen_aux+'00')
    elif len(gen_aux)==6:
        genome_ids.append(gen_aux+'000')
        
        
def threemers_spectrum(i, genome_id):
    
    # reading and spliting the genome file
    aux = []
    with open ("/home/cristian/projects/salmonella_enterica/genomes/"+genome_id+"/"+genome_id+".fna", "r") as file:
        aux = file.read().split(genome_id+"]")
    aux_2 = []
    for j in range(len(aux)-1):
        aux_2.append(aux[j+1].split('>')[0].replace("\n", "").upper())
    genome = ''.join(aux_2)
    
    # 3-mer composition dictionary
    composition = trimer_composition(genome)
    # Total number of 3-mers
    total_composition = sum(composition.values())
    # List of 3-mers compostion frequencies
    norm_freq = [count/total_composition for count in composition.values()]

    # save the list in a file
    with open('three_mers.txt', 'a+') as f:
        #f.write("%s\n" % norm_freq)
        f.write('[{}, {}]\n'.format(i, norm_freq))

import time
import multiprocessing 
            
if __name__ == '__main__':
    starttime = time.time()
    processes = []
    with open('three_mers.txt', 'w+') as f:
        f.write('')
    for i, genome_id in enumerate(genome_ids):
        p = multiprocessing.Process(target=threemers_spectrum, args=(i, genome_id))
        processes.append(p)
        p.start()
    
    for process in processes:
        process.join()

with open("three_mers.txt", "r") as file:
    aux_3 = file.read().split('\n')[:800]
    

three_mers = []
for item in aux_3:
    three_mers.append(eval(item))

three_mers.sort()

spectrum_list = pd.DataFrame(three_mers).loc[:,1].tolist()
spectrum = pd.DataFrame(spectrum_list, columns=trimers)

print('Time: {} seconds'.format(time.time() - starttime))
spectrum

Time: 405.00709319114685 seconds


Unnamed: 0,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,AGA,AGC,...,CTA,CTC,GAA,GAC,GCA,GCC,GGA,GTA,TAA,TCA
0,0.032689,0.033219,0.026641,0.035482,0.022756,0.030556,0.034822,0.018978,0.022754,0.037276,...,0.012865,0.018088,0.035001,0.026226,0.037797,0.046905,0.027009,0.024180,0.030599,0.034887
1,0.033627,0.033219,0.027929,0.036025,0.022657,0.030048,0.034508,0.019134,0.023087,0.037241,...,0.013143,0.017872,0.035895,0.026236,0.037596,0.045997,0.027243,0.024661,0.031182,0.034428
2,0.032420,0.033699,0.025912,0.035068,0.023445,0.031450,0.034607,0.019167,0.022374,0.036822,...,0.012598,0.018895,0.033789,0.025656,0.037760,0.047537,0.025467,0.023747,0.030332,0.035766
3,0.033040,0.032997,0.027447,0.035513,0.022467,0.030157,0.034945,0.018968,0.022879,0.037379,...,0.012935,0.017715,0.035564,0.026612,0.037693,0.046644,0.027436,0.024597,0.031035,0.034157
4,0.033244,0.033400,0.027070,0.035686,0.022698,0.030388,0.034927,0.019277,0.022415,0.036906,...,0.013295,0.018196,0.035137,0.026318,0.037616,0.046606,0.026942,0.024417,0.031064,0.034516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.033350,0.033599,0.027025,0.035968,0.023339,0.030587,0.034256,0.019672,0.023068,0.036791,...,0.013179,0.018500,0.035082,0.026104,0.037460,0.046143,0.026510,0.024206,0.031317,0.035092
796,0.033152,0.033196,0.027967,0.035463,0.022731,0.030054,0.034541,0.019187,0.022976,0.037502,...,0.013281,0.017967,0.036135,0.026618,0.037821,0.046003,0.027666,0.024655,0.030803,0.034032
797,0.032857,0.033498,0.027290,0.035203,0.022979,0.030695,0.034501,0.019303,0.022759,0.037150,...,0.013146,0.018704,0.035042,0.026374,0.037594,0.046650,0.026579,0.024181,0.030560,0.034679
798,0.032945,0.033383,0.026786,0.035637,0.022806,0.030635,0.035024,0.019229,0.022564,0.036923,...,0.013116,0.018148,0.035140,0.026345,0.037679,0.046503,0.026957,0.024194,0.030711,0.034819


In [6]:
# Spectrums mics matrix
frames = [spectrum ,amr.drop(['Genome ID'], axis=1)]
matrix = pd.concat(frames, axis=1, join='outer')

matrix

Unnamed: 0,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,AGA,AGC,...,nalidixic acid,ampicillin,sulfisoxazole,cefoxitin,tetracycline,ceftiofur,amoxicillin/clavulanic acid,streptomycin,azithromycin,kanamycin
0,0.032689,0.033219,0.026641,0.035482,0.022756,0.030556,0.034822,0.018978,0.022754,0.037276,...,4.0,1.0,16.0,4.0,32.0,0.5,1.0,8.0,8.0,
1,0.033627,0.033219,0.027929,0.036025,0.022657,0.030048,0.034508,0.019134,0.023087,0.037241,...,4.0,1.0,256.0,4.0,32.0,1.0,1.0,64.0,8.0,
2,0.032420,0.033699,0.025912,0.035068,0.023445,0.031450,0.034607,0.019167,0.022374,0.036822,...,4.0,1.0,32.0,4.0,32.0,0.5,1.0,4.0,8.0,
3,0.033040,0.032997,0.027447,0.035513,0.022467,0.030157,0.034945,0.018968,0.022879,0.037379,...,4.0,1.0,32.0,4.0,32.0,1.0,1.0,8.0,8.0,
4,0.033244,0.033400,0.027070,0.035686,0.022698,0.030388,0.034927,0.019277,0.022415,0.036906,...,32.0,1.0,64.0,2.0,4.0,0.5,1.0,4.0,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1932,,,,,,,,,,,...,4.0,1.0,32.0,2.0,32.0,1.0,1.0,4.0,8.0,
1933,,,,,,,,,,,...,4.0,32.0,256.0,16.0,32.0,8.0,32.0,64.0,4.0,
1934,,,,,,,,,,,...,4.0,32.0,256.0,32.0,32.0,8.0,32.0,64.0,4.0,64.0
1935,,,,,,,,,,,...,4.0,32.0,256.0,32.0,32.0,8.0,32.0,4.0,4.0,


In [8]:
# Genomic 3-mers composition

import numpy as np
import time

starttime = time.time()

# List of float 'Genome ID's
genome_list = amr_mics.loc[:,'Genome ID'].sort_values().unique().tolist()

# List of str 'Genome ID's
genome_ids=[]
for i in range(len(genome_list)):
    gen_aux = str(float('%.8g' %genome_list[i]))
    if len(gen_aux)==9:
        genome_ids.append(gen_aux)
    elif len(gen_aux)==8:
        genome_ids.append(gen_aux+'0')
    elif len(gen_aux)==7:
        genome_ids.append(gen_aux+'00')
    elif len(gen_aux)==6:
        genome_ids.append(gen_aux+'000')
        

three_mers = []
for genome_id in genome_ids:
    with open ("/home/cristian/projects/salmonella_enterica/genomes/"+genome_id+"/"+genome_id+".fna", "r") as file:
        aux = file.read().split(genome_id+"]")
    aux_2 = []
    for i in range(len(aux)-1):
        aux_2.append(aux[i+1].split('>')[0].replace("\n", "").upper())
    genome = ''.join(aux_2)
    
    # 3-mer composition dictionary
    composition = trimer_composition(genome)
    # Total number of 3-mers
    total_composition = sum(composition.values())
    # List of 3-mers compostion frequencies
    norm_freq = [count/total_composition for count in composition.values()]
    # Matrix array of frequencies 
    three_mers.append(norm_freq)

spectrum = pd.DataFrame(three_mers, columns=trimers)

print('Time: {} seconds'.format(time.time() - starttime))
spectrum

Time: 1844.720778465271 seconds


Unnamed: 0,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,AGA,AGC,...,CTA,CTC,GAA,GAC,GCA,GCC,GGA,GTA,TAA,TCA
0,0.032689,0.033219,0.026641,0.035482,0.022756,0.030556,0.034822,0.018978,0.022754,0.037276,...,0.012865,0.018088,0.035001,0.026226,0.037797,0.046905,0.027009,0.024180,0.030599,0.034887
1,0.033627,0.033219,0.027929,0.036025,0.022657,0.030048,0.034508,0.019134,0.023087,0.037241,...,0.013143,0.017872,0.035895,0.026236,0.037596,0.045997,0.027243,0.024661,0.031182,0.034428
2,0.032420,0.033699,0.025912,0.035068,0.023445,0.031450,0.034607,0.019167,0.022374,0.036822,...,0.012598,0.018895,0.033789,0.025656,0.037760,0.047537,0.025467,0.023747,0.030332,0.035766
3,0.033040,0.032997,0.027447,0.035513,0.022467,0.030157,0.034945,0.018968,0.022879,0.037379,...,0.012935,0.017715,0.035564,0.026612,0.037693,0.046644,0.027436,0.024597,0.031035,0.034157
4,0.033244,0.033400,0.027070,0.035686,0.022698,0.030388,0.034927,0.019277,0.022415,0.036906,...,0.013295,0.018196,0.035137,0.026318,0.037616,0.046606,0.026942,0.024417,0.031064,0.034516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1932,0.032315,0.033581,0.026122,0.035137,0.023211,0.031134,0.035051,0.019219,0.022476,0.036801,...,0.012747,0.018379,0.034128,0.026141,0.037756,0.047199,0.026255,0.024023,0.030466,0.035453
1933,0.033433,0.033391,0.028279,0.035625,0.023124,0.030113,0.034067,0.019714,0.023152,0.037226,...,0.013436,0.018486,0.036291,0.026451,0.037870,0.045183,0.027698,0.024632,0.031082,0.034832
1934,0.032771,0.033541,0.027095,0.035151,0.022893,0.030615,0.034811,0.019497,0.022603,0.037078,...,0.013245,0.018615,0.035288,0.026564,0.037658,0.046499,0.027179,0.024211,0.030485,0.034579
1935,0.032870,0.033419,0.027375,0.035166,0.023030,0.030439,0.034605,0.019368,0.022836,0.037386,...,0.013199,0.018581,0.035437,0.026371,0.037734,0.046326,0.026708,0.024296,0.030435,0.034535


In [9]:
# Spectrums mics matrix
frames = [spectrum ,amr.drop(['Genome ID'], axis=1)]
matrix = pd.concat(frames, axis=1, join='outer')

matrix

Unnamed: 0,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,AGA,AGC,...,nalidixic acid,ampicillin,sulfisoxazole,cefoxitin,tetracycline,ceftiofur,amoxicillin/clavulanic acid,streptomycin,azithromycin,kanamycin
0,0.032689,0.033219,0.026641,0.035482,0.022756,0.030556,0.034822,0.018978,0.022754,0.037276,...,4.0,1.0,16.0,4.0,32.0,0.5,1.0,8.0,8.0,
1,0.033627,0.033219,0.027929,0.036025,0.022657,0.030048,0.034508,0.019134,0.023087,0.037241,...,4.0,1.0,256.0,4.0,32.0,1.0,1.0,64.0,8.0,
2,0.032420,0.033699,0.025912,0.035068,0.023445,0.031450,0.034607,0.019167,0.022374,0.036822,...,4.0,1.0,32.0,4.0,32.0,0.5,1.0,4.0,8.0,
3,0.033040,0.032997,0.027447,0.035513,0.022467,0.030157,0.034945,0.018968,0.022879,0.037379,...,4.0,1.0,32.0,4.0,32.0,1.0,1.0,8.0,8.0,
4,0.033244,0.033400,0.027070,0.035686,0.022698,0.030388,0.034927,0.019277,0.022415,0.036906,...,32.0,1.0,64.0,2.0,4.0,0.5,1.0,4.0,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1932,0.032315,0.033581,0.026122,0.035137,0.023211,0.031134,0.035051,0.019219,0.022476,0.036801,...,4.0,1.0,32.0,2.0,32.0,1.0,1.0,4.0,8.0,
1933,0.033433,0.033391,0.028279,0.035625,0.023124,0.030113,0.034067,0.019714,0.023152,0.037226,...,4.0,32.0,256.0,16.0,32.0,8.0,32.0,64.0,4.0,
1934,0.032771,0.033541,0.027095,0.035151,0.022893,0.030615,0.034811,0.019497,0.022603,0.037078,...,4.0,32.0,256.0,32.0,32.0,8.0,32.0,64.0,4.0,64.0
1935,0.032870,0.033419,0.027375,0.035166,0.023030,0.030439,0.034605,0.019368,0.022836,0.037386,...,4.0,32.0,256.0,32.0,32.0,8.0,32.0,4.0,4.0,


In [12]:
# Save to csv file

matrix.to_csv('threemers_mics_matrix.csv')

In [13]:
# Genomic 4-mers composition

import numpy as np
import time

starttime = time.time()

# List of float 'Genome ID's
genome_list = amr_mics.loc[:,'Genome ID'].sort_values().unique().tolist()

# List of str 'Genome ID's
genome_ids=[]
for i in range(len(genome_list)):
    gen_aux = str(float('%.8g' %genome_list[i]))
    if len(gen_aux)==9:
        genome_ids.append(gen_aux)
    elif len(gen_aux)==8:
        genome_ids.append(gen_aux+'0')
    elif len(gen_aux)==7:
        genome_ids.append(gen_aux+'00')
    elif len(gen_aux)==6:
        genome_ids.append(gen_aux+'000')
        

tetra_mers = []
for genome_id in genome_ids:
    with open ("/home/cristian/projects/salmonella_enterica/genomes/"+genome_id+"/"+genome_id+".fna", "r") as file:
        aux = file.read().split(genome_id+"]")
    aux_2 = []
    for i in range(len(aux)-1):
        aux_2.append(aux[i+1].split('>')[0].replace("\n", "").upper())
    genome = ''.join(aux_2)
    
    # 4-mer composition dictionary
    composition = tetramer_composition(genome)
    # Total number of 4-mers
    total_composition = sum(composition.values())
    # List of 4-mers compostion frequencies
    norm_freq = [count/total_composition for count in composition.values()]
    # Matrix array of frequencies 
    tetra_mers.append(norm_freq)

spectrum = pd.DataFrame(tetra_mers, columns=tetramers)

print('Time: {} seconds'.format(time.time() - starttime))
spectrum

Time: 4431.803725242615 seconds


Unnamed: 0,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,AAGA,AAGC,...,TAAA,TACA,TAGA,TATA,TCAA,TCCA,TCGA,TGAA,TGCA,TTAA
0,0.010195,0.009592,0.009397,0.010176,0.007437,0.007577,0.010206,0.005423,0.006263,0.008236,...,0.009499,0.004205,0.002480,0.004103,0.007466,0.006980,0.005664,0.009481,0.006261,0.008409
1,0.010449,0.009637,0.009843,0.010463,0.007509,0.007464,0.010087,0.005524,0.006698,0.008495,...,0.009783,0.004155,0.002550,0.004285,0.007519,0.006763,0.005651,0.009822,0.006287,0.008544
2,0.010097,0.009737,0.009135,0.010075,0.007710,0.007822,0.010148,0.005425,0.006208,0.008100,...,0.009514,0.004238,0.002465,0.004078,0.007625,0.007260,0.005509,0.008955,0.006283,0.008249
3,0.010247,0.009551,0.009677,0.010202,0.007336,0.007510,0.010134,0.005421,0.006495,0.008521,...,0.009746,0.004154,0.002500,0.004194,0.007356,0.006686,0.005628,0.009709,0.006223,0.008475
4,0.010307,0.009660,0.009587,0.010329,0.007443,0.007574,0.010206,0.005528,0.006474,0.008381,...,0.009767,0.004200,0.002443,0.004289,0.007484,0.006737,0.005667,0.009587,0.006223,0.008556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1932,0.009980,0.009666,0.009202,0.009970,0.007589,0.007733,0.010247,0.005414,0.006218,0.008093,...,0.009495,0.004233,0.002532,0.004115,0.007548,0.007102,0.005538,0.009120,0.006277,0.008322
1933,0.010330,0.009561,0.009833,0.010424,0.007569,0.007630,0.009912,0.005661,0.006756,0.008575,...,0.009718,0.004338,0.002485,0.004273,0.007443,0.006734,0.005729,0.010173,0.006501,0.008546
1934,0.010122,0.009620,0.009437,0.010167,0.007503,0.007677,0.010154,0.005604,0.006500,0.008380,...,0.009516,0.004264,0.002459,0.004195,0.007456,0.006835,0.005641,0.009669,0.006311,0.008381
1935,0.010025,0.009611,0.009521,0.010214,0.007612,0.007600,0.010122,0.005471,0.006532,0.008427,...,0.009489,0.004250,0.002483,0.004122,0.007420,0.006911,0.005811,0.009747,0.006263,0.008309


In [14]:
# Spectrums mics matrix
frames = [spectrum ,amr.drop(['Genome ID'], axis=1)]
matrix = pd.concat(frames, axis=1, join='outer')

matrix

Unnamed: 0,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,AAGA,AAGC,...,nalidixic acid,ampicillin,sulfisoxazole,cefoxitin,tetracycline,ceftiofur,amoxicillin/clavulanic acid,streptomycin,azithromycin,kanamycin
0,0.010195,0.009592,0.009397,0.010176,0.007437,0.007577,0.010206,0.005423,0.006263,0.008236,...,4.0,1.0,16.0,4.0,32.0,0.5,1.0,8.0,8.0,
1,0.010449,0.009637,0.009843,0.010463,0.007509,0.007464,0.010087,0.005524,0.006698,0.008495,...,4.0,1.0,256.0,4.0,32.0,1.0,1.0,64.0,8.0,
2,0.010097,0.009737,0.009135,0.010075,0.007710,0.007822,0.010148,0.005425,0.006208,0.008100,...,4.0,1.0,32.0,4.0,32.0,0.5,1.0,4.0,8.0,
3,0.010247,0.009551,0.009677,0.010202,0.007336,0.007510,0.010134,0.005421,0.006495,0.008521,...,4.0,1.0,32.0,4.0,32.0,1.0,1.0,8.0,8.0,
4,0.010307,0.009660,0.009587,0.010329,0.007443,0.007574,0.010206,0.005528,0.006474,0.008381,...,32.0,1.0,64.0,2.0,4.0,0.5,1.0,4.0,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1932,0.009980,0.009666,0.009202,0.009970,0.007589,0.007733,0.010247,0.005414,0.006218,0.008093,...,4.0,1.0,32.0,2.0,32.0,1.0,1.0,4.0,8.0,
1933,0.010330,0.009561,0.009833,0.010424,0.007569,0.007630,0.009912,0.005661,0.006756,0.008575,...,4.0,32.0,256.0,16.0,32.0,8.0,32.0,64.0,4.0,
1934,0.010122,0.009620,0.009437,0.010167,0.007503,0.007677,0.010154,0.005604,0.006500,0.008380,...,4.0,32.0,256.0,32.0,32.0,8.0,32.0,64.0,4.0,64.0
1935,0.010025,0.009611,0.009521,0.010214,0.007612,0.007600,0.010122,0.005471,0.006532,0.008427,...,4.0,32.0,256.0,32.0,32.0,8.0,32.0,4.0,4.0,


In [15]:
# Save to csv file

matrix.to_csv('tetramers_mics_matrix.csv')