In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis


## Defining paths for each and every omic

In [2]:
path_root_data = os.path.join ('..', 'Data', 'Extracted', 'Databases')
path_all_fasta = os.path.join (path_root_data, 'fasta_files', 'AllBins')
path_normalised_metabolomics = os.path.join (path_root_data, 'Metabolomics', 'Normalised_Tables')


---
## Importing MAGs
---

In [3]:
# There are 1364 MAGs enclosed in FASTA files
# I have to traverse every FASTA file, and in each file every sequence

list_of_series = []

fasta_files = [i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')]

# This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
for i, fasta_file_name in enumerate(fasta_files):
    
    if i == 100:
        break
    else:
        with open(os.path.join(path_all_fasta, fasta_file_name), 'r') as input_file:
            #sequence = []
            sequence = ''
            for fasta_string in SeqIO.parse(input_file, "fasta"):
                #sequence.append(str(fasta_string.seq))
                sequence = sequence + str(fasta_string.seq) + ' '
                
            #list_of_series.append(pd.Series(sequence))
            list_of_series.append(sequence)


In [4]:
# Memory usage for all FASTA files is ~ 4.5GB, and for only first 100 is ~1.5GB
list_of_series[0][0:100]

'AAAGCGGAACCCGTTGGCGGCAATGGTGGCACTTTCCATGTCCAGGGCGATGGCGCGGCTCTGGCTAAAACGGCGCTGGGGCTGGTTGTCGGGCAGCAGT'

---
## Importing Metabolomic data

---

In [5]:
metabolomics_file_name = os.path.join(path_normalised_metabolomics, os.listdir(path_normalised_metabolomics)[0])
metabolomics_df = pd.read_csv (metabolomics_file_name, delimiter = '\t')
metabolomics_df

Unnamed: 0,Metabolite,tp,date,type,known_type,means,medians,sds,N,se,ci,type2,measurement,KEGG.Compound.ID,Chebi.Name,Chebi.Name_combined
0,!Phosphoric_acid_3TMS,D1,2012-04-17,snp,known,0.757113,0.737943,0.098650,3,0.056956,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
1,!Phosphoric_acid_3TMS,D10,2012-01-25,snp,known,0.720935,0.873509,0.266593,3,0.153918,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
2,!Phosphoric_acid_3TMS,D11,2012-03-22,snp,known,0.625047,0.652268,0.057820,3,0.033382,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
3,!Phosphoric_acid_3TMS,D12,2012-01-19,snp,known,0.964956,0.941878,0.053754,3,0.031035,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
4,!Phosphoric_acid_3TMS,D13,2011-05-13,snp,known,0.963984,0.974479,0.071659,3,0.041373,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,Valine_2TMS,D51,2011-08-11,bp,known,0.704821,0.478251,0.428368,3,0.247319,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18866,Valine_2TMS,D6,2011-06-03,bp,known,1.085888,1.197471,0.487484,3,0.281449,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18867,Valine_2TMS,D7,2012-04-04,bp,known,2.958023,2.813145,0.341213,3,0.197000,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18868,Valine_2TMS,D8,2012-02-08,bp,known,2.515980,2.322248,0.618556,3,0.357123,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine


In [6]:
metabolomics_df['date'] = pd.to_datetime(metabolomics_df['date'])
metabolomics_df.insert (0, 'date', metabolomics_df.pop('date'))
metabolomics_df.sort_values ('date', inplace = True, ignore_index = True)
metabolomics_df

Unnamed: 0,date,Metabolite,tp,type,known_type,means,medians,sds,N,se,ci,type2,measurement,KEGG.Compound.ID,Chebi.Name,Chebi.Name_combined
0,2011-03-21,Leucine_2TMS,D37,sp,known,1.440122,1.489979,0.146440,3,0.084547,4.302653,extracellular,polar,"C16439,C00123,C01570",leucine;L-leucine;D-leucine,leucine;L-leucine;D-leucine
1,2011-03-21,No match: 1181.78_EM_SP_D1_1_223,D37,sp,unknown,3.778469,3.801417,1.924377,3,1.111040,4.302653,extracellular,polar,,,
2,2011-03-21,unknown#emu_WW_2144.25,D37,bnp,unknown,0.949545,0.947209,0.065422,3,0.037771,4.302653,intracellular,nonpolar,,,
3,2011-03-21,Unknown#bth_pae_001,D37,bp,unknown,2.095116,1.809891,1.916896,3,1.106721,4.302653,intracellular,polar,,,
4,2011-03-21,No match: 2220.34_EM_BP_D1_1_192,D37,bp,unknown,1.631643,1.571125,0.168501,3,0.097284,4.302653,intracellular,polar,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,D48,bnp,known,1.364105,1.457780,0.170311,3,0.098329,4.302653,intracellular,nonpolar,D01924,octadecan-1-ol,octadecan-1-ol
18866,2012-05-03,No match: 1447.93_EM_BP_D2_1_57,D48,bp,unknown,4.815774,1.160165,6.835474,3,3.946463,4.302653,intracellular,polar,,,
18867,2012-05-03,Lactulose_8TMS,D48,bp,known,0.697489,0.684339,0.049632,3,0.028655,4.302653,intracellular,polar,C07064,lactulose,lactulose
18868,2012-05-03,No match: 2004.78_EM_BNP_D10_1_205,D48,bnp,unknown,0.785996,0.744529,0.139901,3,0.080772,4.302653,intracellular,nonpolar,,,


## MAG examination

In [7]:
max_metagenome_size = max([len(i) for i in list_of_series])
max_metagenome_size

5332942

In [10]:
#list_of_series[0]

In [11]:
#max_genome_size = max([max([len(j) for j in i.values]) for i in list_of_series])

In [12]:
#max_genome_size

---
Important info:

> We can look at metagenome as a sentence which consists of words (that are represented with genomes). Therefore, we can create metagenomes as strings where words are seperated with space character (' '). This is one way of tackling this problem

To do:
* Encode every sentence with some encoding method (TF-IDF, Word2Vec...)
* Try some simple clustering method (K-means, Hierarchical...)
* Improve this with autoencoder encoding + clustering

> Important: this is not temporal data, since there are no timestamps!
