In [14]:
import pandas as pd
import numpy as np
import altair as alt
import os, random
from Bio import SeqIO
import gensim
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import altair as alt


## Defining paths for each and every omic

In [2]:
path_root_data = os.path.join ('..', 'Data', 'Extracted', 'Databases')
path_all_fasta = os.path.join (path_root_data, 'fasta_files', 'AllBins')
path_normalised_metabolomics = os.path.join (path_root_data, 'Metabolomics', 'Normalised_Tables')
path_model_save_root = 'Saved_models'


In [3]:
num_of_mags = len([i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')])
SEED = 42
END = 25
num_of_workers = 3
random.seed(SEED)
np.random.seed(SEED)

---
## MAG-related functions
---

In [4]:
# Function that splits each genome into k-mers thus creating even longer sentence (MAG)
# It returns tokenized genome i.e. [kmer, kmer,...]
def split_genome (genome, k = 5):
    new_genome = []
    n = len(genome)
    
    if n-k <=0:
        return genome
    else:
        for i in range(n-k):
            new_genome.append(genome[i:i+k])
        
        return new_genome


def vectorize_one_mag (one_mag, w2v_model):
    
    # We have to generate vectors for each word in one MAG and then create vector representation of that MAG
    # by averaging vectors of its words
    zero_vector = np.zeros(w2v_model.vector_size)
    word_vectors = []
    one_mag_vector = []
    
    for sentence in one_mag:
        for word in sentence:
            if word in w2v_model.wv:
                try:
                    word_vectors.append(w2v_model.wv[word])
                except KeyError:
                    print ('Key Error')
                    continue
    
    if word_vectors:
        word_vectors = np.asarray(word_vectors)
        one_mag_vector = word_vectors.mean (axis=0)
    
    else:
        one_mag_vector = zero_vector
    
    return one_mag_vector


# Function that vectorizes a MAG (document) with a pretrained word2vec model. It returns vector representation of a given MAG
# Vectorization is done by averaging word (k-mer) vectors for the whole document (MAG)
def vectorize_mags (w2v_model, end = 25):
    
    print ('Vectorizing MAGs')
    
    fasta_files = [i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')]
    list_of_mag_vectors = []
    
    # This was done so that I could work with first 'end' FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_all_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    
                # Vectorizing MAGs one by one
                list_of_mag_vectors.append (vectorize_one_mag (one_mag, w2v_model))
    
    print ('Finished vectorizing')
    
    return list_of_mag_vectors
    
    

# If one wants to import MAGs in order to vectorize them, one should use start argument in order to skip first 'start' MAGs
# If one wants to import MAGs to train word2vec model, one should use only end argument, so that first 'end' MAGs are used for training
# Todo: Implement randomisation in picking MAGs for training, and don't use first 'start' MAGs for training
# (create list of indexes from 0 to 1364, use sklearn split train test, then traverse directory and use only MAGs with indexes in train for training w2v)
def import_mags_and_build_model (end = 25, path_all_fasta = path_all_fasta):
    
    print ('Importing MAGs')
    
    # There are 1364 MAGs enclosed in FASTA files
    # I have to traverse every FASTA file, and in each file every sequence
    
    fasta_files = [i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')]
    
    # This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_all_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    
                # If we do not have a model, we build one
                if i == 0:
                    print ('Building w2v model')
                    # We build our model on the first MAG
                    w2v_model = Word2Vec (sentences = one_mag, size = 100, workers = num_of_workers, seed=SEED)
                
                # Else we just expand its vocabulary
                else:
                    # Now we expand our vocabulary
                    w2v_model.build_vocab (one_mag, update = True)
                    
    print ('Finished building')
    
    return w2v_model


def train_model (w2v_model, epochs, end = 25):
    
    print ('Starting model training')
    
    # There are 1364 MAGs enclosed in FASTA files
    # I have to traverse every FASTA file, and in each file every sequence
    
    fasta_files = [i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')]
    
    # This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_all_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    
                
                w2v_model.train (one_mag, total_examples = w2v_model.corpus_count, epochs = epochs)
                    
    print ('Model training finished')
    
    return w2v_model


---
## Importing Metabolomic data

---

In [5]:
metabolomics_file_name = os.path.join(path_normalised_metabolomics, os.listdir(path_normalised_metabolomics)[0])
metabolomics_df = pd.read_csv (metabolomics_file_name, delimiter = '\t')
metabolomics_df

Unnamed: 0,Metabolite,tp,date,type,known_type,means,medians,sds,N,se,ci,type2,measurement,KEGG.Compound.ID,Chebi.Name,Chebi.Name_combined
0,!Phosphoric_acid_3TMS,D1,2012-04-17,snp,known,0.757113,0.737943,0.098650,3,0.056956,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
1,!Phosphoric_acid_3TMS,D10,2012-01-25,snp,known,0.720935,0.873509,0.266593,3,0.153918,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
2,!Phosphoric_acid_3TMS,D11,2012-03-22,snp,known,0.625047,0.652268,0.057820,3,0.033382,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
3,!Phosphoric_acid_3TMS,D12,2012-01-19,snp,known,0.964956,0.941878,0.053754,3,0.031035,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
4,!Phosphoric_acid_3TMS,D13,2011-05-13,snp,known,0.963984,0.974479,0.071659,3,0.041373,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,Valine_2TMS,D51,2011-08-11,bp,known,0.704821,0.478251,0.428368,3,0.247319,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18866,Valine_2TMS,D6,2011-06-03,bp,known,1.085888,1.197471,0.487484,3,0.281449,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18867,Valine_2TMS,D7,2012-04-04,bp,known,2.958023,2.813145,0.341213,3,0.197000,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18868,Valine_2TMS,D8,2012-02-08,bp,known,2.515980,2.322248,0.618556,3,0.357123,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine


In [6]:
metabolomics_df['date'] = pd.to_datetime(metabolomics_df['date'])
metabolomics_df.insert (0, 'date', metabolomics_df.pop('date'))
metabolomics_df.sort_values ('date', inplace = True, ignore_index = True)
metabolomics_df

Unnamed: 0,date,Metabolite,tp,type,known_type,means,medians,sds,N,se,ci,type2,measurement,KEGG.Compound.ID,Chebi.Name,Chebi.Name_combined
0,2011-03-21,Leucine_2TMS,D37,sp,known,1.440122,1.489979,0.146440,3,0.084547,4.302653,extracellular,polar,"C16439,C00123,C01570",leucine;L-leucine;D-leucine,leucine;L-leucine;D-leucine
1,2011-03-21,No match: 1181.78_EM_SP_D1_1_223,D37,sp,unknown,3.778469,3.801417,1.924377,3,1.111040,4.302653,extracellular,polar,,,
2,2011-03-21,unknown#emu_WW_2144.25,D37,bnp,unknown,0.949545,0.947209,0.065422,3,0.037771,4.302653,intracellular,nonpolar,,,
3,2011-03-21,Unknown#bth_pae_001,D37,bp,unknown,2.095116,1.809891,1.916896,3,1.106721,4.302653,intracellular,polar,,,
4,2011-03-21,No match: 2220.34_EM_BP_D1_1_192,D37,bp,unknown,1.631643,1.571125,0.168501,3,0.097284,4.302653,intracellular,polar,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,D48,bnp,known,1.364105,1.457780,0.170311,3,0.098329,4.302653,intracellular,nonpolar,D01924,octadecan-1-ol,octadecan-1-ol
18866,2012-05-03,No match: 1447.93_EM_BP_D2_1_57,D48,bp,unknown,4.815774,1.160165,6.835474,3,3.946463,4.302653,intracellular,polar,,,
18867,2012-05-03,Lactulose_8TMS,D48,bp,known,0.697489,0.684339,0.049632,3,0.028655,4.302653,intracellular,polar,C07064,lactulose,lactulose
18868,2012-05-03,No match: 2004.78_EM_BNP_D10_1_205,D48,bnp,unknown,0.785996,0.744529,0.139901,3,0.080772,4.302653,intracellular,nonpolar,,,


## MAG examination

---
Important info:

> We can look at metagenome as a sentence which consists of words (that are represented with genomes). Therefore, we can create metagenomes as strings where words are seperated with space character (' '). This is one way of tackling this problem

To do:
1. Make kmers of some length, out of every word in every sentence
2. Encode every sentence with Word2Vec and CBOW (because it's faster than skip-gram)
3. Reduce dimensionality with PCA or with autoencoder
    > Maybe it is good to do the first 3 steps while importing, for each and every sentence. That way, I might use less memory
4. Cluster similar sentences (MAGs) to get rMAGs

> Important: this is not temporal data, since there are no timestamps!


In [7]:

# FOR CLUSTERING I SHOULD CREATE A DATAFRAME WITH MAGs INDEXES AND THEIR VECTOR REPRESENTATIONS

final_model = import_mags_and_build_model (end = END, path_all_fasta = path_all_fasta)


Importing MAGs
Building w2v model
Finished building


In [8]:
final_model.wv.most_similar('ACGGC')

[('CCGGC', 0.5894356966018677),
 ('TCGGC', 0.5622138977050781),
 ('ACGGA', 0.5432533025741577),
 ('ACGGG', 0.5209246873855591),
 ('GCGGC', 0.5119935274124146),
 ('ACGGT', 0.4680752158164978),
 ('ACGAT', 0.2959875166416168),
 ('ACGAC', 0.2835376560688019),
 ('TACGA', 0.28112727403640747),
 ('TACGC', 0.2611392140388489)]

In [9]:
# Train model. It tooks ~10 minutes for END = 25 amount of MAGs
epochs = 10

final_model = train_model (final_model, epochs = epochs, end = END)


Starting model training
Model training finished


In [10]:
final_model.wv.save_word2vec_format(os.path.join (path_model_save_root, 'model_25.bin'), binary=True) 

Now I should vectorize documents with this model. For further use, I could save this model's weights, and use it to vectorize all mags. That would take a lot, but every MAG will have its vector representation
> This could be done by importing one MAG at a time, then tokenizing it (like before), then getting vector representations of that MAG's sentences (genomes) and then finding the vector representation of the whole MAG (document). If I do that for one MAG at a time, There is no need to worry about memory


In [11]:
list_of_mag_vectors = vectorize_mags (final_model, end = END)

Vectorizing MAGs
Finished vectorizing


In [12]:
list_of_mag_vectors[0:2]

[array([-1.1965473e+00, -3.4466343e+00,  1.1225539e+00,  2.6312834e-01,
        -2.5361535e-01,  1.8168186e+00,  1.3943011e+00,  5.9793311e-01,
        -2.0670781e+00, -2.0657931e-01,  2.3056791e+00, -1.0977945e+00,
         1.2557960e+00, -9.9643773e-01,  1.1149993e+00,  1.8037397e+00,
        -2.9583761e-01, -4.4174930e-01, -8.8848591e-01,  1.2939855e+00,
        -4.4530161e-02, -1.0593354e+00, -1.1706048e+00, -2.1949111e-01,
        -8.0128688e-01, -6.9839859e-01, -1.0369163e-01,  2.2203580e-01,
         1.0739380e+00, -1.4342694e+00, -1.4528227e-01,  2.1896055e-01,
         4.8607826e-01,  1.1318611e+00,  1.5093282e+00,  2.1629584e+00,
         7.7518100e-01, -6.0288894e-01,  4.9686894e-01,  7.2373733e-02,
        -1.9086123e+00, -6.7348307e-01, -1.8191979e+00,  2.6915163e-01,
         7.9759979e-01,  1.7312871e+00, -2.3905370e-01, -3.9395061e-01,
         1.1420479e+00, -1.6216646e+00,  3.4147367e-01, -3.3497950e-01,
        -2.4591829e-03, -2.2679090e+00,  2.6078584e+00,  1.72881

In [13]:
mags_df = pd.DataFrame (list_of_mag_vectors)
mags_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-1.196547,-3.446634,1.122554,0.263128,-0.253615,1.816819,1.394301,0.597933,-2.067078,-0.206579,...,-0.158636,1.425906,1.025101,-0.028354,-0.048041,1.62941,-0.953644,-0.283592,-0.700354,-1.090133
1,0.049059,2.82388,0.362629,-1.508203,-0.325315,-1.464902,-0.792796,1.246105,1.036349,1.331887,...,-0.887114,-1.587529,-1.264382,0.973971,-0.845243,-0.687133,1.482277,0.226025,1.390829,1.959994
2,-1.076538,-4.783527,-0.096637,0.892442,0.856198,2.473337,0.901526,0.557027,-1.793551,-0.557365,...,0.062814,0.682193,0.660506,-0.592617,-0.380943,0.848278,-1.756477,-0.635433,-2.427694,-1.583026
3,0.164475,3.217961,-0.096349,-1.09677,-0.300357,-1.441947,-0.76352,1.692813,1.136963,1.400865,...,-0.791861,-1.958941,-1.440642,1.214189,-0.765646,-0.853262,1.519032,0.188515,1.302754,2.157268
4,0.246169,3.115447,-0.082878,-1.079046,-0.321819,-1.298257,-0.760366,1.661492,1.12116,1.410003,...,-0.782524,-1.796969,-1.363379,1.163899,-0.732312,-0.764388,1.450671,0.191647,1.245304,2.095642
5,-0.073989,1.920999,-0.485796,-0.798457,-0.299321,-0.91879,-0.671724,1.400493,0.766859,0.836819,...,-0.702894,-1.193122,-0.917276,0.615969,-0.763657,-0.400789,0.823787,0.114617,0.53605,1.900726
6,-1.539677,-6.411491,0.716679,0.576411,0.459347,3.469873,1.477935,-0.333925,-2.537221,-1.127924,...,-0.147849,1.183525,0.890112,-1.368696,-0.020944,1.508202,-2.503601,-0.887476,-3.253759,-2.831345
7,-1.28474,-6.045468,0.443491,0.987065,0.897312,3.389049,1.879721,0.610316,-2.42088,-0.271895,...,0.297817,1.334344,1.391637,-0.642675,-0.0487,1.54772,-2.34698,-0.859433,-2.75881,-2.664072
8,-0.579216,-1.839221,0.276878,0.045237,-0.073658,1.040149,0.755374,1.006982,-1.031035,0.293702,...,-0.229055,0.767412,0.814296,0.126623,-0.252945,1.046583,-0.636697,0.065987,-0.687458,-0.31026
9,-0.945494,-1.383347,0.255619,-0.045844,-0.424141,0.829961,0.247029,0.974351,-0.841089,0.088883,...,-0.719211,0.411402,0.037168,-0.058658,-0.365759,0.596159,-0.516978,-0.328708,-0.495169,0.442289


---
## Clustering
---

### 1. K-means

In [34]:
k_range_end = 15 # Usually it is sqrt(# of mags)

k_range = range(1, k_range_end)

k_means_models = [KMeans (n_clusters = i, random_state = SEED) for i in k_range]
k_scores = [k_mean_model.fit(mags_df).score(mags_df) for k_mean_model in k_means_model]
k_data = pd.DataFrame ({'k_range':k_range, 'k_scores':k_scores})
k_data


Unnamed: 0,k_range,k_scores
0,1,-2854.137989
1,2,-669.94282
2,3,-310.319632
3,4,-240.240229
4,5,-179.335104
5,6,-149.289063
6,7,-131.255782
7,8,-104.342716
8,9,-88.957743
9,10,-73.295911


In [35]:
k_num_chart = alt.Chart(data = k_data).mark_line().encode(
    alt.X ('k_range:Q'),
    alt.Y ('k_scores:Q')
)

k_num_chart

In [37]:
# We can see from the chart above that 6 or 7 clusters are optimal for this task (where END = 25 MAGs)
num_of_clusters = 7

k_means_model = KMeans (n_clusters = num_of_clusters, random_state = SEED)
k_means_model.fit(mags_df)

KMeans(n_clusters=7, random_state=42)

In [40]:
k_means_predicted = k_means_model.predict(mags_df)
k_means_predicted

array([4, 5, 6, 5, 5, 1, 2, 2, 0, 0, 0, 4, 0, 1, 1, 1, 5, 1, 1, 4, 0, 5,
       2, 3, 1], dtype=int32)

In [1]:


# Next: try kernel k-means clustering
# Next: find evaluation methods

