In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os, random
import gensim
import datetime as dt
from Bio import SeqIO
from gensim.models import Word2Vec
from sklearn.cluster import KMeans, OPTICS
from sklearn.decomposition import PCA
from sklearn import preprocessing, model_selection, metrics


## Defining paths for each and every omic

In [2]:
path_root_data = os.path.join ('..', 'Data', 'Extracted', 'Databases')
path_all_fasta = os.path.join (path_root_data, 'fasta_files', 'AllBins')
path_normalised_metabolomics = os.path.join (path_root_data, 'Metabolomics', 'Normalised_Tables')
path_model_save_root = 'Saved_models'


In [3]:
num_of_mags = len([i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')])
SEED = 42
END = 25
MAX_ROWS = 15000
num_of_workers = 3
random.seed(SEED)
np.random.seed(SEED)
alt.data_transformers.enable('default', max_rows = MAX_ROWS) # Important if you want to visualize datasets with >5000 samples


DataTransformerRegistry.enable('default')

---
# GENOMIC ANALYSIS
---
## MAG-related functions

**Important**: I should review the way I look at MAGs. The names of all fasta files beggining with 'D_##' represent the days those MAGs were obtained. Therefore, I should look at this also as timeseries data. Also, maybe I should only consider 78 MAGs, and not all ~1300.

In [4]:
# Function that splits each genome into k-mers thus creating even longer sentence (MAG)
# It returns tokenized genome i.e. [kmer, kmer,...]
def split_genome (genome, k = 5):
    new_genome = []
    n = len(genome)
    
    if n-k <=0:
        return genome
    else:
        for i in range(n-k):
            new_genome.append(genome[i:i+k])
        
        return new_genome


def vectorize_one_mag (one_mag, w2v_model):
    
    # We have to generate vectors for each word in one MAG and then create vector representation of that MAG
    # by averaging vectors of its words
    zero_vector = np.zeros(w2v_model.vector_size)
    word_vectors = []
    one_mag_vector = []
    
    for sentence in one_mag:
        for word in sentence:
            if word in w2v_model.wv:
                try:
                    word_vectors.append(w2v_model.wv[word])
                except KeyError:
                    print ('Key Error')
                    continue
    
    if word_vectors:
        word_vectors = np.asarray(word_vectors)
        one_mag_vector = word_vectors.mean (axis=0)
    
    else:
        one_mag_vector = zero_vector
    
    return one_mag_vector


# Function that vectorizes a MAG (document) with a pretrained word2vec model. It returns vector representation of a given MAG
# Vectorization is done by averaging word (k-mer) vectors for the whole document (MAG)
def vectorize_mags (w2v_model, end = 25):
    
    print ('Vectorizing MAGs')
    
    fasta_files = [i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')]
    list_of_mag_vectors = []
    
    # This was done so that I could work with first 'end' FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_all_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    
                # Vectorizing MAGs one by one
                list_of_mag_vectors.append (vectorize_one_mag (one_mag, w2v_model))
    
    print ('Finished vectorizing')
    
    return list_of_mag_vectors
    
    

# If one wants to import MAGs in order to vectorize them, one should use start argument in order to skip first 'start' MAGs
# If one wants to import MAGs to train word2vec model, one should use only end argument, so that first 'end' MAGs are used for training
# Todo: Implement randomisation in picking MAGs for training, and don't use first 'start' MAGs for training
# (create list of indexes from 0 to 1364, use sklearn split train test, then traverse directory and use only MAGs with indexes in train for training w2v)
def import_mags_and_build_model (end = 25, path_all_fasta = path_all_fasta):
    
    print ('Importing MAGs')
    
    # There are 1364 MAGs enclosed in FASTA files
    # I have to traverse every FASTA file, and in each file every sequence
    
    fasta_files = [i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')]
    
    # This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_all_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    
                # If we do not have a model, we build one
                if i == 0:
                    print ('Building w2v model')
                    # We build our model on the first MAG
                    w2v_model = Word2Vec (sentences = one_mag, size = 100, workers = num_of_workers, seed=SEED)
                
                # Else we just expand its vocabulary
                else:
                    # Now we expand our vocabulary
                    w2v_model.build_vocab (one_mag, update = True)
                    
    print ('Finished building')
    
    return w2v_model


def train_model (w2v_model, epochs, end = 25):
    
    print ('Starting model training')
    
    # There are 1364 MAGs enclosed in FASTA files
    # I have to traverse every FASTA file, and in each file every sequence
    
    fasta_files = [i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')]
    
    # This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_all_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    
                
                w2v_model.train (one_mag, total_examples = w2v_model.corpus_count, epochs = epochs)
                    
    print ('Model training finished')
    
    return w2v_model


def visualize_with_pca (data, labels, centers):
    
    pca_model = PCA (n_components = 2, random_state = SEED)
    data_transformed = pca_model.fit_transform (data)
    
    data_transformed = pd.DataFrame (data_transformed)
    data_transformed.columns = ['PC_1', 'PC_2']
    data_transformed['Labels'] = labels
    
    chart_data = alt.Chart(data_transformed).mark_circle(opacity = 1).encode(
        alt.X ('PC_1:Q'),
        alt.Y ('PC_2:Q'),
        alt.Color ('Labels:N', legend = alt.Legend())
    )
    
    # This means we are visualising centroids from k_means (there are less centroids that data points)
    if labels.shape[0] != centers.shape[0]:
        
        centers_transformed = pca_model.fit_transform (centers)
        centers_transformed = pd.DataFrame (centers_transformed)
        centers_transformed.columns = ['PC_1', 'PC_2']
        
        chart_centers = alt.Chart(centers_transformed).mark_point(shape = 'diamond', color = 'black', size = 50, opacity = 0.7).encode(
            alt.X ('PC_1:Q'),
            alt.Y ('PC_2:Q'),
        )
        
        return chart_data + chart_centers
    
    # For DBSCAN there are no centroids
    else:
        return chart_data



## MAG examination

In [5]:

# FOR CLUSTERING I SHOULD CREATE A DATAFRAME WITH MAGs INDEXES AND THEIR VECTOR REPRESENTATIONS

final_model = import_mags_and_build_model (end = END, path_all_fasta = path_all_fasta)


Importing MAGs
Building w2v model
Finished building


In [6]:
final_model.wv.most_similar('ACGGC')

[('CCGGC', 0.601494312286377),
 ('TCGGC', 0.5599758625030518),
 ('ACGGA', 0.557608425617218),
 ('GCGGC', 0.49576789140701294),
 ('ACGGT', 0.49560925364494324),
 ('ACGGG', 0.49219831824302673),
 ('ACGTG', 0.3080225884914398),
 ('AACGA', 0.28447020053863525),
 ('TACGA', 0.27282172441482544),
 ('CTACG', 0.2670055031776428)]

In [7]:
# Train model. It tooks ~10 minutes for END = 25 amount of MAGs
EPOCHS = 10

final_model = train_model (final_model, epochs = EPOCHS, end = END)


Starting model training
Model training finished


In [8]:
final_model.wv.save_word2vec_format(os.path.join (path_model_save_root, 'model_25.bin'), binary=True) 

Now I should vectorize documents with this model. For further use, I could save this model's weights, and use it to vectorize all mags. That would take a lot, but every MAG will have its vector representation
> This could be done by importing one MAG at a time, then tokenizing it (like before), then getting vector representations of that MAG's sentences (genomes) and then finding the vector representation of the whole MAG (document). If I do that for one MAG at a time, There is no need to worry about memory


In [9]:
list_of_mag_vectors = vectorize_mags (final_model, end = END)

Vectorizing MAGs
Finished vectorizing


In [10]:
list_of_mag_vectors[0:2]

[array([ 0.38310894,  0.7345962 , -2.0617385 , -1.377627  ,  1.9238758 ,
        -0.99686223,  0.12680843,  1.0402697 ,  0.20480345,  1.2802503 ,
         1.916849  , -0.02946088,  0.00329509, -0.7229627 , -1.8478901 ,
        -0.01394626,  0.69409156, -0.7640301 ,  0.74906594, -0.53745276,
        -1.9601172 , -0.7587462 ,  1.3742642 , -1.217851  , -0.47214326,
         0.20286171, -0.18055716,  0.7335174 , -1.1762983 ,  2.653662  ,
        -0.09273058,  0.28547376,  1.651497  , -1.9121877 , -0.18007877,
        -0.03325903, -0.2086385 ,  0.6730497 , -0.55636424,  0.47483596,
         0.65874934,  1.0897474 , -0.08962332,  1.2203519 , -0.7250376 ,
        -1.6788849 ,  0.6476069 , -2.2597039 , -0.8714646 ,  0.5900627 ,
         1.9941175 ,  2.5292747 ,  1.016403  ,  1.6334622 ,  0.91929203,
        -0.8229314 ,  1.0152584 ,  2.2813056 , -0.28456485, -0.32523456,
        -0.700966  , -0.25083938,  1.6358076 , -2.069389  , -0.16377105,
         0.17444853, -1.6677843 ,  0.91869205, -2.2

In [11]:
mags_df = pd.DataFrame (list_of_mag_vectors)
mags_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.383109,0.734596,-2.061738,-1.377627,1.923876,-0.996862,0.126808,1.04027,0.204803,1.28025,...,0.121722,-0.320823,0.425772,0.958635,0.430631,-0.828883,-0.05615,1.267761,-0.271887,0.190092
1,0.481343,-0.932432,-0.865253,1.20797,-1.250237,0.777246,-0.151042,0.642565,-0.588025,-0.33591,...,0.119711,-0.439365,0.231177,-0.232848,-1.494287,0.295241,0.854238,-1.432462,0.173112,0.072484
2,-0.842044,1.259094,-1.096381,-2.221678,2.279827,-0.136583,0.099482,0.355953,-0.353708,0.898319,...,-0.63044,-0.277421,0.437764,0.625571,0.847081,0.026124,-0.727259,1.451243,-1.014845,0.086551
3,-0.101561,-0.939417,-0.922355,1.341428,-1.730036,0.968651,-0.270715,0.7502,-0.406823,-0.750874,...,-0.0261,-0.601101,0.444838,-0.470695,-1.695565,0.666955,0.757462,-1.799523,-0.165285,0.027959
4,0.016708,-0.857043,-0.891568,1.295794,-1.666705,0.888164,-0.286901,0.789029,-0.402812,-0.720539,...,-0.044416,-0.53739,0.463558,-0.489587,-1.554568,0.546746,0.663034,-1.728324,-0.209322,-0.018858
5,-0.354054,-0.533775,-0.704592,0.811292,-1.546534,0.726773,-0.599777,0.906101,-0.334113,-1.178215,...,-0.465441,-0.325899,0.227029,-0.552756,-1.253512,0.416524,0.595106,-1.39212,-0.454608,-0.013096
6,-0.675129,2.190782,-1.305052,-3.228407,3.407607,-0.50574,0.541557,0.68901,-0.991841,2.128106,...,-0.799071,0.374001,0.685423,1.724551,1.525924,0.315825,-1.069971,2.192337,-0.981943,-0.141088
7,-0.779167,1.545789,-2.048666,-2.723992,3.201448,-0.834769,0.000742,0.582649,-0.227595,1.787308,...,-0.306409,-0.320642,0.982029,1.417129,0.910176,-0.423658,-0.930867,2.162766,-1.080486,-0.067732
8,-0.100146,0.215055,-1.726567,-0.677292,0.56965,-0.577484,-0.280464,1.036189,0.273006,0.13608,...,-0.152135,-0.175914,0.627427,0.36158,-0.253739,-0.741563,0.043703,0.528693,-0.471344,0.073297
9,-0.310057,0.328404,-1.113919,-0.548321,0.129512,-0.173507,-0.179598,1.060453,-0.252836,-0.197626,...,-0.511794,-0.275207,0.240458,0.259425,-0.076633,0.013977,0.123453,0.00703,-0.591048,0.120627


## Data preprocessing

In [12]:
mag_scaler = preprocessing.StandardScaler()
scaled_mags_df = mag_scaler.fit_transform(mags_df)

scaled_mags_df

array([[ 0.99955449,  0.59762294, -1.90141858, ...,  1.05705118,
         0.26734028,  0.80848051],
       [ 1.23006   , -1.21137456,  0.31954654, ..., -0.86047306,
         1.44076529,  0.14113652],
       [-1.87526418,  1.16678919, -0.10948257, ...,  1.18734795,
        -1.69177608,  0.22095493],
       ...,
       [ 0.07720136,  2.00135506, -0.13591284, ...,  1.61730304,
        -1.10894927, -1.92132002],
       [-0.75788145, -0.17377555,  1.52587595, ..., -0.35084359,
        -0.13352087,  0.70495625],
       [-0.36582211, -0.65442402,  1.59692396, ..., -1.17553431,
        -0.04371424, -1.76482471]])

## Clustering

### 1. K-means

In [13]:
k_range_end = 15 # Usually it is sqrt(# of mags)

k_range = range(1, k_range_end)

k_mean_models = [KMeans (n_clusters = i, random_state = SEED) for i in k_range]
k_scores = [k_mean_model.fit(scaled_mags_df).score(scaled_mags_df) for k_mean_model in k_mean_models]
k_data = pd.DataFrame ({'k_range':k_range, 'k_scores':k_scores})
k_data


Unnamed: 0,k_range,k_scores
0,1,-2500.0
1,2,-981.401218
2,3,-680.3207
3,4,-549.982976
4,5,-436.679494
5,6,-352.698664
6,7,-302.457493
7,8,-255.561708
8,9,-219.078474
9,10,-186.357715


In [14]:
k_num_chart = alt.Chart(data = k_data).mark_line().encode(
    alt.X ('k_range:Q'),
    alt.Y ('k_scores:Q')
)

k_num_chart

In [15]:
# We can see from the chart above that 6 or 7 clusters are optimal for this task (where END = 25 MAGs)
num_of_clusters = 7

k_means_model = KMeans (n_clusters = num_of_clusters, random_state = SEED)
k_means_predicted = k_means_model.fit_predict(scaled_mags_df)
k_means_predicted

array([4, 1, 0, 1, 1, 3, 0, 0, 2, 2, 2, 4, 2, 3, 3, 6, 1, 6, 3, 4, 2, 1,
       0, 5, 3], dtype=int32)

In [16]:
k_means_chart = visualize_with_pca (scaled_mags_df, k_means_predicted, k_means_model.cluster_centers_)
k_means_chart


### 2. OPTICS

In [17]:
MIN_SAMPLES = 3

optics_model = OPTICS (min_samples = MIN_SAMPLES, n_jobs = num_of_workers)
optics_predicted = optics_model.fit_predict (scaled_mags_df)
optics_predicted

array([ 0,  2, -1,  2,  2,  2,  1,  1,  0,  0,  0,  0,  0,  2,  2,  2,  2,
        2,  2,  0,  0,  2,  1,  2,  2])

In [18]:
# Visualize clusters, since there are no centroids, we are sending bogus array
optics_chart = visualize_with_pca (scaled_mags_df, optics_predicted, np.empty([optics_predicted.shape[0], 1], dtype=int))
optics_chart


In [19]:
# Side by side comparison
alt.hconcat (k_means_chart, optics_chart).resolve_scale(color='independent')

## Evaluation

In [20]:
eval_k_means = metrics.silhouette_score (scaled_mags_df, k_means_predicted)
eval_optics = metrics.silhouette_score (scaled_mags_df, optics_predicted)

print ('Silhouette scores: [best = 1, worst = -1]')
print ('\t1. K-means:', eval_k_means)
print ('\t2. OPTICS:', eval_optics)

Silhouette scores: [best = 1, worst = -1]
	1. K-means: 0.3094919183804488
	2. OPTICS: 0.361855460227712


---
# METABOLOMIC ANALYSIS
---
## Importing Metabolomic data

In [21]:
metabolomics_file_name = os.path.join(path_normalised_metabolomics, os.listdir(path_normalised_metabolomics)[0])
metabolomics_df = pd.read_csv (metabolomics_file_name, delimiter = '\t')
metabolomics_df

Unnamed: 0,Metabolite,tp,date,type,known_type,means,medians,sds,N,se,ci,type2,measurement,KEGG.Compound.ID,Chebi.Name,Chebi.Name_combined
0,!Phosphoric_acid_3TMS,D1,2012-04-17,snp,known,0.757113,0.737943,0.098650,3,0.056956,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
1,!Phosphoric_acid_3TMS,D10,2012-01-25,snp,known,0.720935,0.873509,0.266593,3,0.153918,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
2,!Phosphoric_acid_3TMS,D11,2012-03-22,snp,known,0.625047,0.652268,0.057820,3,0.033382,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
3,!Phosphoric_acid_3TMS,D12,2012-01-19,snp,known,0.964956,0.941878,0.053754,3,0.031035,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
4,!Phosphoric_acid_3TMS,D13,2011-05-13,snp,known,0.963984,0.974479,0.071659,3,0.041373,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,Valine_2TMS,D51,2011-08-11,bp,known,0.704821,0.478251,0.428368,3,0.247319,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18866,Valine_2TMS,D6,2011-06-03,bp,known,1.085888,1.197471,0.487484,3,0.281449,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18867,Valine_2TMS,D7,2012-04-04,bp,known,2.958023,2.813145,0.341213,3,0.197000,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18868,Valine_2TMS,D8,2012-02-08,bp,known,2.515980,2.322248,0.618556,3,0.357123,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine


## Data preprocessing

In [22]:
metabolomics_df['date'] = pd.to_datetime(metabolomics_df['date'])
metabolomics_df.insert (0, 'date', metabolomics_df.pop('date'))
metabolomics_df.sort_values ('date', inplace = True, ignore_index = True)
metabolomics_df

Unnamed: 0,date,Metabolite,tp,type,known_type,means,medians,sds,N,se,ci,type2,measurement,KEGG.Compound.ID,Chebi.Name,Chebi.Name_combined
0,2011-03-21,Leucine_2TMS,D37,sp,known,1.440122,1.489979,0.146440,3,0.084547,4.302653,extracellular,polar,"C16439,C00123,C01570",leucine;L-leucine;D-leucine,leucine;L-leucine;D-leucine
1,2011-03-21,No match: 1181.78_EM_SP_D1_1_223,D37,sp,unknown,3.778469,3.801417,1.924377,3,1.111040,4.302653,extracellular,polar,,,
2,2011-03-21,unknown#emu_WW_2144.25,D37,bnp,unknown,0.949545,0.947209,0.065422,3,0.037771,4.302653,intracellular,nonpolar,,,
3,2011-03-21,Unknown#bth_pae_001,D37,bp,unknown,2.095116,1.809891,1.916896,3,1.106721,4.302653,intracellular,polar,,,
4,2011-03-21,No match: 2220.34_EM_BP_D1_1_192,D37,bp,unknown,1.631643,1.571125,0.168501,3,0.097284,4.302653,intracellular,polar,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,D48,bnp,known,1.364105,1.457780,0.170311,3,0.098329,4.302653,intracellular,nonpolar,D01924,octadecan-1-ol,octadecan-1-ol
18866,2012-05-03,No match: 1447.93_EM_BP_D2_1_57,D48,bp,unknown,4.815774,1.160165,6.835474,3,3.946463,4.302653,intracellular,polar,,,
18867,2012-05-03,Lactulose_8TMS,D48,bp,known,0.697489,0.684339,0.049632,3,0.028655,4.302653,intracellular,polar,C07064,lactulose,lactulose
18868,2012-05-03,No match: 2004.78_EM_BNP_D10_1_205,D48,bnp,unknown,0.785996,0.744529,0.139901,3,0.080772,4.302653,intracellular,nonpolar,,,


In [23]:
# Changing metabolite name if it is unknown
metabolomics_df.loc[metabolomics_df['known_type'].eq('unknown'), 'Metabolite'] = np.nan
metabolomics_df

Unnamed: 0,date,Metabolite,tp,type,known_type,means,medians,sds,N,se,ci,type2,measurement,KEGG.Compound.ID,Chebi.Name,Chebi.Name_combined
0,2011-03-21,Leucine_2TMS,D37,sp,known,1.440122,1.489979,0.146440,3,0.084547,4.302653,extracellular,polar,"C16439,C00123,C01570",leucine;L-leucine;D-leucine,leucine;L-leucine;D-leucine
1,2011-03-21,,D37,sp,unknown,3.778469,3.801417,1.924377,3,1.111040,4.302653,extracellular,polar,,,
2,2011-03-21,,D37,bnp,unknown,0.949545,0.947209,0.065422,3,0.037771,4.302653,intracellular,nonpolar,,,
3,2011-03-21,,D37,bp,unknown,2.095116,1.809891,1.916896,3,1.106721,4.302653,intracellular,polar,,,
4,2011-03-21,,D37,bp,unknown,1.631643,1.571125,0.168501,3,0.097284,4.302653,intracellular,polar,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,D48,bnp,known,1.364105,1.457780,0.170311,3,0.098329,4.302653,intracellular,nonpolar,D01924,octadecan-1-ol,octadecan-1-ol
18866,2012-05-03,,D48,bp,unknown,4.815774,1.160165,6.835474,3,3.946463,4.302653,intracellular,polar,,,
18867,2012-05-03,Lactulose_8TMS,D48,bp,known,0.697489,0.684339,0.049632,3,0.028655,4.302653,intracellular,polar,C07064,lactulose,lactulose
18868,2012-05-03,,D48,bnp,unknown,0.785996,0.744529,0.139901,3,0.080772,4.302653,intracellular,nonpolar,,,


In [24]:
print ('Dataset uniqueness:')
print ('\t1. Timestamps:', len(metabolomics_df['date'].unique()))
print ('\t2. Metabolites:', len(metabolomics_df['Metabolite'].unique()))
print ('\t3. Types:', len(metabolomics_df['type'].unique()))
print ('\t4. Known types:', len(metabolomics_df['known_type'].unique()))
print ('\t5. Ns:', len(metabolomics_df['N'].unique()))
print ('\t6. Type 2s:', len(metabolomics_df['type2'].unique()))
print ('\t7. Measurements:', len(metabolomics_df['measurement'].unique()))

Dataset uniqueness:
	1. Timestamps: 51
	2. Metabolites: 86
	3. Types: 4
	4. Known types: 2
	5. Ns: 2
	6. Type 2s: 2
	7. Measurements: 2


In [25]:
# Saving the name column and removing unnecessairy columns
#metabolite_names = metabolomics_df['Metabolite']
#metabolomics_df.drop(labels = ['Metabolite', 'tp', 'KEGG.Compound.ID', 'Chebi.Name', 'Chebi.Name_combined'], axis = 1, inplace = True)
metabolomics_df.drop(labels = ['tp', 'KEGG.Compound.ID', 'Chebi.Name', 'Chebi.Name_combined'], axis = 1, inplace = True)
metabolomics_df

Unnamed: 0,date,Metabolite,type,known_type,means,medians,sds,N,se,ci,type2,measurement
0,2011-03-21,Leucine_2TMS,sp,known,1.440122,1.489979,0.146440,3,0.084547,4.302653,extracellular,polar
1,2011-03-21,,sp,unknown,3.778469,3.801417,1.924377,3,1.111040,4.302653,extracellular,polar
2,2011-03-21,,bnp,unknown,0.949545,0.947209,0.065422,3,0.037771,4.302653,intracellular,nonpolar
3,2011-03-21,,bp,unknown,2.095116,1.809891,1.916896,3,1.106721,4.302653,intracellular,polar
4,2011-03-21,,bp,unknown,1.631643,1.571125,0.168501,3,0.097284,4.302653,intracellular,polar
...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,bnp,known,1.364105,1.457780,0.170311,3,0.098329,4.302653,intracellular,nonpolar
18866,2012-05-03,,bp,unknown,4.815774,1.160165,6.835474,3,3.946463,4.302653,intracellular,polar
18867,2012-05-03,Lactulose_8TMS,bp,known,0.697489,0.684339,0.049632,3,0.028655,4.302653,intracellular,polar
18868,2012-05-03,,bnp,unknown,0.785996,0.744529,0.139901,3,0.080772,4.302653,intracellular,nonpolar


In [26]:
# Dummy eencoding categorical data
scaled_metabolomics_df = pd.get_dummies(metabolomics_df, columns = ['type', 'known_type', 'N', 'type2', 'measurement'])
scaled_metabolomics_df

Unnamed: 0,date,Metabolite,means,medians,sds,se,ci,type_bnp,type_bp,type_snp,type_sp,known_type_known,known_type_unknown,N_2,N_3,type2_extracellular,type2_intracellular,measurement_nonpolar,measurement_polar
0,2011-03-21,Leucine_2TMS,1.440122,1.489979,0.146440,0.084547,4.302653,0,0,0,1,1,0,0,1,1,0,0,1
1,2011-03-21,,3.778469,3.801417,1.924377,1.111040,4.302653,0,0,0,1,0,1,0,1,1,0,0,1
2,2011-03-21,,0.949545,0.947209,0.065422,0.037771,4.302653,1,0,0,0,0,1,0,1,0,1,1,0
3,2011-03-21,,2.095116,1.809891,1.916896,1.106721,4.302653,0,1,0,0,0,1,0,1,0,1,0,1
4,2011-03-21,,1.631643,1.571125,0.168501,0.097284,4.302653,0,1,0,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,1.364105,1.457780,0.170311,0.098329,4.302653,1,0,0,0,1,0,0,1,0,1,1,0
18866,2012-05-03,,4.815774,1.160165,6.835474,3.946463,4.302653,0,1,0,0,0,1,0,1,0,1,0,1
18867,2012-05-03,Lactulose_8TMS,0.697489,0.684339,0.049632,0.028655,4.302653,0,1,0,0,1,0,0,1,0,1,0,1
18868,2012-05-03,,0.785996,0.744529,0.139901,0.080772,4.302653,1,0,0,0,0,1,0,1,0,1,1,0


In [27]:
# Standardizing data
metabolomics_scaler = preprocessing.StandardScaler()
scaled_metabolomics_df[['means', 'medians', 'sds', 'se', 'ci']] = metabolomics_scaler.fit_transform(metabolomics_df[['means', 'medians', 'sds', 'se', 'ci']])

scaled_metabolomics_df

Unnamed: 0,date,Metabolite,means,medians,sds,se,ci,type_bnp,type_bp,type_snp,type_sp,known_type_known,known_type_unknown,N_2,N_3,type2_extracellular,type2_intracellular,measurement_nonpolar,measurement_polar
0,2011-03-21,Leucine_2TMS,-0.145146,-0.094812,-0.189837,-0.190047,-0.128175,0,0,0,1,1,0,0,1,1,0,0,1
1,2011-03-21,,0.148026,0.316083,-0.000023,-0.000270,-0.128175,0,0,0,1,0,1,0,1,1,0,0,1
2,2011-03-21,,-0.206653,-0.191297,-0.198486,-0.198695,-0.128175,1,0,0,0,0,1,0,1,0,1,1,0
3,2011-03-21,,-0.063026,-0.037942,-0.000822,-0.001068,-0.128175,0,1,0,0,0,1,0,1,0,1,0,1
4,2011-03-21,,-0.121134,-0.080386,-0.187482,-0.187692,-0.128175,0,1,0,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,-0.154677,-0.100535,-0.187288,-0.187499,-0.128175,1,0,0,0,1,0,0,1,0,1,1,0
18866,2012-05-03,,0.278079,-0.153441,0.524288,0.523941,-0.128175,0,1,0,0,0,1,0,1,0,1,0,1
18867,2012-05-03,Lactulose_8TMS,-0.238254,-0.238027,-0.200172,-0.200380,-0.128175,0,1,0,0,1,0,0,1,0,1,0,1
18868,2012-05-03,,-0.227158,-0.227327,-0.190535,-0.190745,-0.128175,1,0,0,0,0,1,0,1,0,1,1,0


In [28]:
metabolomics_df.dropna(inplace = True)
metabolomics_df.reset_index(drop=True, inplace=True)
metabolomics_df

Unnamed: 0,date,Metabolite,type,known_type,means,medians,sds,N,se,ci,type2,measurement
0,2011-03-21,Leucine_2TMS,sp,known,1.440122,1.489979,0.146440,3,0.084547,4.302653,extracellular,polar
1,2011-03-21,Glycerol-3-phosphoric_acid_4TMS,bp,known,1.462276,1.524821,0.204146,3,0.117864,4.302653,intracellular,polar
2,2011-03-21,Putrescine_4TMS,bp,known,0.667033,0.667946,0.145202,3,0.083832,4.302653,intracellular,polar
3,2011-03-21,Glyceric_acid_3TMS,sp,known,1.195441,1.312780,0.294056,3,0.169774,4.302653,extracellular,polar
4,2011-03-21,Proline_2TMS,sp,known,1.874113,2.038239,0.541314,3,0.312528,4.302653,extracellular,polar
...,...,...,...,...,...,...,...,...,...,...,...,...
6880,2012-05-03,Glycerol-3-phosphoric_acid_4TMS,bnp,known,0.950065,0.984881,0.098055,3,0.056612,4.302653,intracellular,nonpolar
6881,2012-05-03,1-Monooleoylglycerol_2TMS,bnp,known,1.300665,1.342977,0.078951,3,0.045582,4.302653,intracellular,nonpolar
6882,2012-05-03,Octadecanol_1TMS,bnp,known,1.364105,1.457780,0.170311,3,0.098329,4.302653,intracellular,nonpolar
6883,2012-05-03,Lactulose_8TMS,bp,known,0.697489,0.684339,0.049632,3,0.028655,4.302653,intracellular,polar


## Time series examination

In [29]:
# This function creates new dataframe with column that represent season according to date
# It also concatenate important types with metabolite names
def season_data (data, temporal_column):
    new_df = data
    new_df['season'] = new_df[temporal_column].dt.month%12 // 3 + 1
    
    #important_types = [metabolite_column] + important_types
    #new_df['new_name'] = df[important_types].agg('\n'.join, axis=1)
    
    return new_df

def visualize_metabolites (data, temporal_column, metabolite_column, type_columns):
    
    data_seasoned = season_data (data, temporal_column)
    
    # Extract columns with float values
    float_columns = []
    
    for i in data_seasoned.columns:
        if data_seasoned[i].dtypes == 'float64' or data_seasoned[i].dtypes == 'float32':
            float_columns.append(i)
    
    # Create repeated chart with varying size encodings
    chart = alt.Chart(data_seasoned).mark_point(opacity = 1).encode(
        alt.X (temporal_column, type = 'temporal', scale = alt.Scale (nice = True)),
        alt.Y (metabolite_column, type = 'nominal'),
        alt.Size (alt.repeat("row"), type = 'quantitative'),
        alt.Color ('season:N', scale = alt.Scale (range = ['blue', 'green', 'orange', 'brown'])),
        alt.Tooltip (type_columns, type = 'nominal')
    ).properties(
        width = 1200
    ).repeat(
        row = float_columns
    ).resolve_scale(color = 'independent', size = 'independent')#.interactive()
    
    return chart


In [30]:
metabolites_chart = visualize_metabolites(metabolomics_df, 'date', 'Metabolite', ['type', 'type2', 'measurement', 'N'])
metabolites_chart

## Clustering

In [31]:
# Deep learning temporal clustering

# Should I even do this? Previous visualizations are descriptive enough. It would be a lot of work for not much benefit


---
# TRANSCRIPTOMIC ANALYSIS
---
## Importing Transcriptomic data