In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os, random, math
import gensim
import datetime as dt
import altair_saver
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from gensim.models import Word2Vec
from sklearn.cluster import KMeans, OPTICS
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn import preprocessing, model_selection, metrics
from scipy.spatial.distance import jaccard, pdist, squareform


## Defining paths for each and every omic

In [2]:
path_root_data = os.path.join ('..', 'Data', 'Extracted', 'First source', 'Databases')

path_all_fasta = os.path.join (path_root_data, 'fasta_files', 'AllBins')
path_genomics_78 = os.path.join (path_root_data, 'fasta_files', 'rmags_filtered')
path_genomics_kegg = os.path.join (path_root_data, 'Annotations', 'KEGG')
path_normalised_metabolomics = os.path.join (path_root_data, 'Metabolomics', 'Normalised_Tables')
path_proteomics_78 = os.path.join (path_root_data, 'Proteomics', 'set_of_78')
path_physico_chemical = os.path.join (path_root_data, 'PhysicoChemical')
path_second_source = os.path.join ('..', 'Data', 'Extracted', 'Second source', 'Multi-omics_data')

path_model_save_root = 'Saved_models'
path_figures_save_root = 'Output_figures'


In [3]:
num_of_mags = len([i for i in os.listdir(path_genomics_78) if i.endswith('fa')])
num_of_proteomics = len([i for i in os.listdir(path_proteomics_78) if i.endswith('faa')])
SEED = 42
END = num_of_mags
ALL_DAYS = 51
MAX_ROWS = 15000
EPOCHS = 10
NUM_OF_WORKERS = 8
START_DATE = dt.datetime.strptime ('2011-03-21', '%Y-%m-%d')
random.seed(SEED)
np.random.seed(SEED)
alt.data_transformers.enable('default', max_rows = MAX_ROWS) # Important if you want to visualize datasets with >5000 samples


DataTransformerRegistry.enable('default')

In [4]:
metabolomics_df = pd.read_csv(os.path.join(path_second_source, 'Metabolomics_df_Data.csv'))
metabolomics_df


Unnamed: 0.1,Unnamed: 0,pHILIC_142.1225_1.3,pHILIC_199.0825_1.5,pHILIC_154.1225_1.6,pHILIC_302.3046_1.8,pHILIC_790.5707_2,pHILIC_257.1127_2,pHILIC_114.0661_2.5,pHILIC_776.5554_3,pHILIC_137.0457_3.1,...,nRPLC_187.0976_5.4,nRPLC_209.0814_5.3,nRPLC_183.1025_5.1,nRPLC_155.0713_3.6,nRPLC_411.127_3.6,nRPLC_206.0821_3.4,nRPLC_188.0352_3.1,nRPLC_151.04_2.3,nRPLC_203.002_1.9,nRPLC_117.0557_0.6
0,1,48933.50,12930.38,8806.556,161984.40,7.346514e+00,116880.70,36174800.0,527681.50,2602094.0,...,59332.21,4509.464,9391.550,10801.98,2490.610,935.6534,3259.925,1333.717,1804.745,85970.57
1,2,38733.88,59952.15,10619.350,216496.40,2.236876e+03,312380.90,78940410.0,1179171.00,4149377.0,...,295626.60,28947.810,55894.140,18800.82,16961.230,4870.6410,16948.620,96603.690,234939.000,456850.90
2,3,42435.48,38987.30,7809.121,108672.90,2.856187e+04,104521.10,39027150.0,336941.30,3691144.0,...,65809.69,6576.804,6967.923,10666.80,4019.752,1984.9850,4993.880,5273.242,63674.750,163617.20
3,4,46083.25,15084.49,8909.069,97748.90,1.508881e+01,139771.50,50265620.0,159105.70,3473635.0,...,78788.11,8979.845,7936.962,11954.09,4362.652,4684.4880,5450.713,5575.404,52257.440,115033.30
4,5,52522.11,31970.83,9438.909,56578.23,8.179492e-01,95810.41,33657510.0,88254.79,2381462.0,...,81166.43,13435.120,7876.849,16490.47,2604.270,3527.4080,2736.235,3822.664,39924.610,100389.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,977,527291.60,188541.60,3983986.000,349126.80,3.826502e+05,138845.20,75042160.0,941019.90,4307979.0,...,80989.16,11007.840,14229.030,32860.79,13801.630,5393.8590,15654.260,169405.400,116282.500,496630.10
977,978,524953.70,75464.94,3415717.000,237835.60,1.592969e+05,182862.90,71068130.0,776016.90,1788566.0,...,67519.21,10069.110,16392.230,41626.03,14129.090,5432.8250,17294.790,778465.400,122492.200,544065.00
978,979,69219.12,83106.39,9701.342,243664.40,3.399754e+04,362691.20,76319210.0,2812122.00,4261663.0,...,77102.76,14315.610,11541.050,32037.51,13784.310,5592.1750,14918.980,38619.810,171618.000,363381.40
979,980,39563.84,38518.92,8411.480,319299.40,2.569009e+05,163863.80,64280430.0,1390076.00,1955622.0,...,52485.35,9048.104,8522.427,29624.09,12322.570,7302.8270,19094.250,33187.110,224365.200,442555.90


---
# GENOMIC ANALYSIS
---
## MAG-related functions

**Important**: I should review the way I look at MAGs. The names of all fasta files beggining with 'D_##' represent the days those MAGs were obtained. Therefore, I should look at this also as timeseries data. Also, maybe I should only consider 78 MAGs, and not all ~1300.
After some consideration, I conclude that I should definetly use only 78 MAGs, because that way I wouldn't be tied to meta-omics data only. I also thinked about what should I visualize in that case. One idea is that I should also encode those MAGs with word2wec, and then make a 3D chart where one dimension is time, and other two dimensions would be PCA dimensions of those MAGs. I could also use this function to visualize proteomics data if I want.

Another important thing is that I should actually include FASTA headers and possibly use only them. That way, I could make figures like in a relevant paper where MAGs are groupped according to their taxonomy etc. I should look more into this.

In [4]:
# Function that saves charts from list_of_charts with names from list_of_names
def save_charts (list_of_chart, list_of_names):
    
    for chart, name in zip(list_of_chart, list_of_names):
        altair_saver.save(chart, os.path.join (path_figures_save_root, name))
    

In [45]:
# Function that splits each genome into k-mers thus creating even longer sentence (MAG)
# It returns tokenized genome i.e. [kmer, kmer,...]
def split_genome (genome, k = 5):
    new_genome = []
    n = len(genome)
    
    if n-k <=0:
        return genome
    else:
        for i in range(n-k):
            new_genome.append(genome[i:i+k])
        
        return new_genome


def vectorize_one_mag (one_mag, w2v_model):
    
    # We have to generate vectors for each word in one MAG and then create vector representation of that MAG
    # by averaging vectors of its words
    zero_vector = np.zeros(w2v_model.vector_size)
    word_vectors = []
    one_mag_vector = []
    
    for sentence in one_mag:
        for word in sentence:
            if word in w2v_model.wv:
                try:
                    word_vectors.append(w2v_model.wv[word])
                except KeyError:
                    print ('Key Error')
                    continue
    
    if word_vectors:
        word_vectors = np.asarray(word_vectors)
        one_mag_vector = word_vectors.mean (axis=0)
    
    else:
        one_mag_vector = zero_vector
    
    return one_mag_vector


# Function that vectorizes a MAG (document) with a pretrained word2vec model. It returns vector representation of a given MAG
# Vectorization is done by averaging word (k-mer) vectors for the whole document (MAG)
def vectorize_mags (w2v_model, path_fasta = path_genomics_78, end = 25):
    
    print ('Vectorizing MAGs')
    
    fasta_files = [i for i in os.listdir(path_fasta) if (i.endswith('fa') and i.startswith('D'))]
    list_of_mag_vectors = []
    
    # This was done so that I could work with first 'end' FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    
                # Vectorizing MAGs one by one
                list_of_mag_vectors.append (vectorize_one_mag (one_mag, w2v_model))
    
    print ('Finished vectorizing')
    
    return list_of_mag_vectors
    
    

# If one wants to import MAGs in order to vectorize them, one should use start argument in order to skip first 'start' MAGs
# If one wants to import MAGs to train word2vec model, one should use only end argument, so that first 'end' MAGs are used for training
# Todo: Implement randomisation in picking MAGs for training, and don't use first 'start' MAGs for training
# (create list of indexes from 0 to 1364, use sklearn split train test, then traverse directory and use only MAGs with indexes in train for training w2v)
def import_mags_and_build_model (end = 25, path_fasta = path_genomics_78):
    
    print ('Importing MAGs and building model')
    
    # There are 1364 MAGs enclosed in FASTA files
    # I have to traverse every FASTA file, and in each file every sequence
    
    fasta_files = [i for i in os.listdir(path_fasta) if (i.endswith('fa') and i.startswith('D'))]
    fasta_ids = []
    
    # This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                one_mag_ids = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    # Save FASTA ids for every MAG
                    one_mag_ids.append(str(fasta_string.id))
                    
                # Save list of ids for one MAG in global list
                fasta_ids.append(one_mag_ids)
                
                # If we do not have a model, we build one
                if i == 0:
                    print ('Building w2v model')
                    # We build our model on the first MAG
                    w2v_model = Word2Vec (sentences = one_mag, size = 100, workers = NUM_OF_WORKERS, seed=SEED)
                
                # Else we just expand its vocabulary
                else:
                    # Now we expand our vocabulary
                    w2v_model.build_vocab (one_mag, update = True)
                    
    print ('Finished building')
    
    return w2v_model, fasta_files, fasta_ids


def train_model (w2v_model, epochs, path_fasta = path_genomics_78, end = 25):
    
    print ('Starting model training')
    
    # There are 1364 MAGs enclosed in FASTA files
    # I have to traverse every FASTA file, and in each file every sequence
    
    fasta_files = [i for i in os.listdir(path_fasta) if (i.endswith('fa') and i.startswith('D'))]
    
    # This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    
                
                w2v_model.train (one_mag, total_examples = w2v_model.corpus_count, epochs = epochs)
                    
    print ('Model training finished')
    
    return w2v_model


def visualize_with_pca (data, labels, centers):
    
    pca_model = PCA (n_components = 2, random_state = SEED)
    data_transformed = pca_model.fit_transform (data)
    
    data_transformed = pd.DataFrame (data_transformed)
    data_transformed.columns = ['PC_1', 'PC_2']
    data_transformed['Labels'] = labels
    
    chart_data = alt.Chart(data_transformed).mark_circle(opacity = 1).encode(
        alt.X ('PC_1:Q'),
        alt.Y ('PC_2:Q'),
        alt.Color ('Labels:N', legend = alt.Legend())
    )
    
    # This means we are visualising centroids from k_means (there are less centroids that data points)
    if labels.shape[0] != centers.shape[0]:
        
        centers_transformed = pca_model.fit_transform (centers)
        centers_transformed = pd.DataFrame (centers_transformed)
        centers_transformed.columns = ['PC_1', 'PC_2']
        
        chart_centers = alt.Chart(centers_transformed).mark_point(shape = 'diamond', color = 'black', size = 50, opacity = 0.7).encode(
            alt.X ('PC_1:Q'),
            alt.Y ('PC_2:Q'),
        )
        
        return chart_data + chart_centers
    
    # For DBSCAN there are no centroids
    else:
        return chart_data


# This function creates new dataframe with column that represent season according to date
# It also concatenate important types with metabolite names
def season_data (data, temporal_column):
    new_df = data
    new_df['season'] = new_df[temporal_column].dt.month%12 // 3 + 1
    
    #important_types = [metabolite_column] + important_types
    #new_df['new_name'] = df[important_types].agg('\n'.join, axis=1)
    
    return new_df


def create_temporal_column (list_of_days, start_date, end):
    
    list_of_dates = []
    
    # This is specific to the metaomics data set I am using
    # Creating list of dates for every rMAG
    for i in list_of_days[:end]:
        
        tmp_datetime = start_date + dt.timedelta (weeks = int(i[1:3]))
        
        if tmp_datetime not in list_of_dates:
            list_of_dates.append (tmp_datetime)
        
        else:
            tmp_datetime = tmp_datetime.replace (day = tmp_datetime.day + 1)
            list_of_dates.append (tmp_datetime)
    
    return list_of_dates


def visualize_temporal_mags (data, list_of_days, start_date, end):
    
    list_of_dates = create_temporal_column (list_of_days, start_date, end)
    
    pca_model = PCA (n_components = 2, random_state = SEED)
    data_transformed = pca_model.fit_transform (data)
    
    data_transformed = np.hstack(((np.asarray(list_of_dates))[:, np.newaxis], data_transformed))
    data_transformed = pd.DataFrame (data_transformed, columns = ['DateTime', 'PCA_1', 'PCA_2'])
    
    data_transformed = season_data (data_transformed, 'DateTime')
    
    chart_data = alt.Chart(data_transformed).mark_circle(opacity = 1).encode(
        alt.X ('PCA_1:Q'),
        alt.Y ('PCA_2:Q'),
        alt.Color ('season:N', scale = alt.Scale (range = ['blue', 'green', 'orange', 'brown'])),
    ).properties(
        width = 1200
    )
    
    return chart_data


def import_kegg_and_create_df (end = 51, path_fasta = path_genomics_78, path_all_keggs = path_genomics_kegg):
    
    print ('Importing KEGG data')
    
    # There are 51 files for each day, in which there are KEGG IDs for each genome collected that day
    # I have to traverse every KEGG file, and create DataFrame for each and every one 
    
    kegg_files = [i for i in os.listdir(path_all_keggs) if (i.endswith('besthits') and i.startswith('D'))]
    rmags_78_names = [os.path.splitext(i)[0] for i in os.listdir(path_fasta) if (i.endswith('fa') and i.startswith('D'))]
    kegg_data_list = []
    
    # This was done so that I could work with first 100 files only. Otherwise, I should just remove: i, and enumerate
    for i, kegg_file_name in enumerate(kegg_files):
        
        if i == end:
            break
        
        else:
            # Now I create a DataFrame out of it and save it in the list of DataFrames
            tmp_df = pd.read_csv (os.path.join(path_genomics_kegg, kegg_file_name), delimiter = '\t')
            tmp_filter = tmp_df['Gene'].apply(lambda x: str(x).split('_')[0] + '_' + str(x).split('_')[1]).isin(rmags_78_names)
            tmp_df = tmp_df[tmp_filter]
                    
            tmp_df['Gene'] = tmp_df['Gene'].apply(lambda x: str(x).split('_')[0] + '_' + str(x).split('_')[1])
            tmp_df['ID'] = tmp_df['ID'].apply(lambda x: str(x).split(':')[1])
            tmp_df.drop (['maxScore', 'hitNumber'], axis = 1, inplace = True)
            tmp_df.reset_index(drop=True, inplace=True)
            
            kegg_data_list.append (tmp_df)
            
    
    print ('Finished importing')
    return create_kegg_matrix (kegg_data_list, path_fasta)


def create_kegg_matrix (list_data, path_fasta = path_genomics_78):
    
    print ('Creating KEGG matrix')
    
    rmags_78_names = [os.path.splitext(i)[0] for i in os.listdir(path_fasta) if (i.endswith('fa') and i.startswith('D'))]
    result_matrix_df = pd.DataFrame (columns = rmags_78_names)
    
    for i in list_data:
        tmp_df = i.value_counts().reset_index()
        
        for i, row in tmp_df.iterrows():
            result_matrix_df.at[row['ID'], row['Gene']] = row[0]
    
    result_matrix_df.fillna(0, inplace = True)
    
    print ('Finished creating')
    return result_matrix_df.T


def create_pairwise_jaccard (data):
    
    tmp_data = data.clip(0, 1)
    result = squareform(pdist(tmp_data.astype(bool), jaccard))
    
    return pd.DataFrame (result, index = data.index, columns = data.index)


def visualize_with_mds (data, start_date, end, path_fasta = path_genomics_78):
    
    mds_model = MDS(n_components = 2, random_state = SEED, dissimilarity = "precomputed", n_jobs = NUM_OF_WORKERS)
    mds_pos = mds_model.fit_transform(data)
    
    list_of_days = [i for i in os.listdir(path_fasta) if (i.endswith('fa') and i.startswith('D'))]
    temporal_column = create_temporal_column (list_of_days, start_date, end)
    
    data_transformed = pd.DataFrame (mds_pos)
    data_transformed.columns = ['MDS_1', 'MDS_2']
    data_transformed = np.hstack(((np.asarray(temporal_column))[:, np.newaxis], data_transformed))
    data_transformed = pd.DataFrame (data_transformed, columns = ['DateTime', 'MDS_1', 'MDS_2'])
    
    data_transformed = season_data (data_transformed, 'DateTime')
    
    chart_data = alt.Chart(data_transformed).mark_circle(opacity = 1).encode(
        alt.X ('MDS_1:Q'),
        alt.Y ('MDS_2:Q'),
        alt.Color ('season:N', scale = alt.Scale (range = ['blue', 'green', 'orange', 'brown'])),
    )
    
    return chart_data



## MAG examination

### KEGG examination

In [6]:
kegg_matrix = import_kegg_and_create_df (end = ALL_DAYS, path_fasta = path_genomics_78, path_all_keggs = path_genomics_kegg)
kegg_matrix

Importing KEGG data
Finished importing
Creating KEGG matrix
Finished creating


Unnamed: 0,K03088,K03286,K06882,K09667,K02477,K02014,K12340,K07114,K03790,K16092,...,K03332,K02080,K07062;KEGG,K05294,K06823,K16506,K08743,K01280,K00596,K02475
D42_G14,17,0,0,1,0,0,0,3,3,0,...,0,0,0,0,0,0,0,0,0,0
D05_L3.17,1,2,0,7,0,2,1,4,1,2,...,0,0,0,0,0,0,0,0,0,0
D44_G14,1,7,0,2,0,4,1,3,2,1,...,0,0,0,0,0,0,0,0,0,0
D36_L2.1.6.2,3,7,2,20,1,1,3,11,1,13,...,0,0,0,0,0,0,0,0,0,0
D36_L1.15,2,3,0,4,0,0,1,0,2,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D35_G2.16,15,8,2,3,3,12,2,1,6,3,...,0,0,0,0,0,0,0,0,0,0
D35_G2.27,16,9,3,5,4,3,2,1,5,2,...,0,0,0,0,0,0,0,0,0,0
D47_G1.71.2,6,0,0,0,0,0,0,1,4,0,...,0,0,0,0,0,0,0,0,0,0
D24_L14,4,3,0,11,0,3,2,1,0,3,...,0,0,0,0,0,0,0,0,0,0


In [7]:
mag_scaler = preprocessing.StandardScaler()
scaled_keggs_df = mag_scaler.fit_transform(kegg_matrix)
#scaled_keggs_df = kegg_matrix.clip(0, 1)
scaled_keggs_df

array([[ 1.48130157, -0.94941858, -0.53455038, ..., -0.11470787,
        -0.11470787, -0.11470787],
       [-1.3049023 , -0.563639  , -0.53455038, ..., -0.11470787,
        -0.11470787, -0.11470787],
       [-1.3049023 ,  0.40080995, -0.53455038, ..., -0.11470787,
        -0.11470787, -0.11470787],
       ...,
       [-0.43421359, -0.94941858, -0.53455038, ..., -0.11470787,
        -0.11470787, -0.11470787],
       [-0.78248908, -0.37074921, -0.53455038, ..., -0.11470787,
        -0.11470787, -0.11470787],
       [-1.3049023 , -0.563639  , -0.53455038, ..., -0.11470787,
        -0.11470787, -0.11470787]])

In [8]:
k_range_end = int(math.sqrt(num_of_mags)) # Usually it is sqrt(# of mags)

k_range = range(1, k_range_end)

k_mean_models = [KMeans (n_clusters = i, random_state = SEED) for i in k_range]
k_scores = [k_mean_model.fit(scaled_keggs_df).score(scaled_keggs_df) for k_mean_model in k_mean_models]
k_data = pd.DataFrame ({'k_range':k_range, 'k_scores':k_scores})

In [9]:
k_num_chart = alt.Chart(data = k_data).mark_line().encode(
    alt.X ('k_range:Q'),
    alt.Y ('k_scores:Q')
)

k_num_chart

In [10]:
# We can see from the chart above that 6 or 7 clusters are optimal for this task (where END = 25 MAGs)
num_of_clusters = 4

k_means_model = KMeans (n_clusters = num_of_clusters, random_state = SEED)
k_means_predicted = k_means_model.fit_predict(scaled_keggs_df)
k_means_predicted

array([0, 3, 3, 2, 3, 1, 0, 0, 3, 1, 3, 3, 0, 3, 3, 0, 3, 0, 0, 1, 1, 0,
       1, 1, 3, 3, 3, 1, 3, 3, 3, 1, 1, 2, 0, 3, 1, 1, 3, 1, 3, 1, 0, 3,
       1, 1, 1, 1, 3, 3, 3, 0, 1, 2, 0, 3, 1, 0, 0, 1, 0, 0, 1, 2, 1, 0,
       3, 1, 0, 0, 1, 3, 1, 1, 3, 3, 3], dtype=int32)

In [11]:
k_means_chart = visualize_with_pca (scaled_keggs_df, k_means_predicted, k_means_model.cluster_centers_)
k_means_chart


### KEGG examination but with pairwise Jaccard distance matrix (as seen in paper)

In [43]:
kegg_pairwise = create_pairwise_jaccard (kegg_matrix)
kegg_pairwise

Unnamed: 0,D42_G14,D05_L3.17,D44_G14,D36_L2.1.6.2,D36_L1.15,D20_O1.13,D12_L1.3,D47_P30,D15_G4,D33_L1.8.2,...,D38_G2.3,D43_G6,D09_G1.28,D16_O2.2.1.1.14,D03_O1.31.2,D35_G2.16,D35_G2.27,D47_G1.71.2,D24_L14,D15_G1.18.2
D42_G14,0.000000,0.754106,0.699642,0.749155,0.741826,0.720139,0.545663,0.541852,0.734463,0.780684,...,0.770992,0.562290,0.383476,0.794412,0.771676,0.739367,0.728509,0.619703,0.741910,0.759250
D05_L3.17,0.754106,0.000000,0.658834,0.706349,0.638507,0.733252,0.800103,0.768233,0.673977,0.807158,...,0.810951,0.741906,0.761236,0.833612,0.670986,0.786413,0.764257,0.778598,0.580645,0.683370
D44_G14,0.699642,0.658834,0.000000,0.705718,0.661430,0.705415,0.774236,0.687729,0.473659,0.795627,...,0.775895,0.709881,0.715111,0.816628,0.697556,0.765734,0.742721,0.741294,0.627743,0.590528
D36_L2.1.6.2,0.749155,0.706349,0.705718,0.000000,0.742698,0.682093,0.797688,0.753536,0.720071,0.762694,...,0.785323,0.720537,0.757871,0.800107,0.739850,0.760237,0.722415,0.752775,0.698613,0.740228
D36_L1.15,0.741826,0.638507,0.661430,0.742698,0.000000,0.739030,0.799519,0.755774,0.688696,0.805500,...,0.820308,0.741908,0.751337,0.843943,0.690104,0.796553,0.772922,0.781173,0.600000,0.707763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D35_G2.16,0.739367,0.786413,0.765734,0.760237,0.796553,0.740317,0.798731,0.750626,0.775845,0.686835,...,0.707020,0.749385,0.757544,0.691815,0.779907,0.000000,0.584204,0.739130,0.782113,0.769896
D35_G2.27,0.728509,0.764257,0.742721,0.722415,0.772922,0.700798,0.791011,0.741904,0.731463,0.648148,...,0.667141,0.738395,0.744507,0.679363,0.760708,0.584204,0.000000,0.728033,0.755138,0.748152
D47_G1.71.2,0.619703,0.778598,0.741294,0.752775,0.781173,0.754525,0.695208,0.644199,0.729942,0.769720,...,0.740899,0.608575,0.623460,0.779212,0.743136,0.739130,0.728033,0.000000,0.772646,0.750299
D24_L14,0.741910,0.580645,0.627743,0.698613,0.600000,0.696613,0.789566,0.734848,0.644953,0.786923,...,0.811962,0.722509,0.755177,0.824848,0.663942,0.782113,0.755138,0.772646,0.000000,0.690731


In [46]:
kegg_mds_chart = visualize_with_mds(kegg_pairwise, START_DATE, END, path_genomics_78)
kegg_mds_chart

---
# VAZNO:
Sledece sto treba da se uradi je da se nadje transcriptomic data set i da se obradi i on u potpunosti. Nakon toga, treba da se sve podeli po skriptama i da se odluci o dizajnu. Posle ostaje jos da se napravi front end.

---

In [12]:
# FOR CLUSTERING I SHOULD CREATE A DATAFRAME WITH MAGs INDEXES AND THEIR VECTOR REPRESENTATIONS
final_model, fasta_names, fasta_ids = import_mags_and_build_model (end = END, path_fasta = path_genomics_78)


Importing MAGs
Building w2v model
Finished building


In [13]:
# Train model. It tooks ~10 minutes for END = 25 amount of MAGs
final_model = train_model (final_model, epochs = EPOCHS, end = END)


Starting model training
Model training finished


In [14]:
final_model.wv.save_word2vec_format(os.path.join (path_model_save_root, 'model_78.bin'), binary=True) 

Now I should vectorize documents with this model. For further use, I could save this model's weights, and use it to vectorize all mags. That would take a lot, but every MAG will have its vector representation
> This could be done by importing one MAG at a time, then tokenizing it (like before), then getting vector representations of that MAG's sentences (genomes) and then finding the vector representation of the whole MAG (document). If I do that for one MAG at a time, There is no need to worry about memory


In [15]:
list_of_mag_vectors = vectorize_mags (final_model, path_fasta = path_genomics_78, end = END)
list_of_mag_vectors[0:2]

Vectorizing MAGs
Finished vectorizing


[array([-0.40356913, -4.0978575 ,  0.44172108, -1.1477782 ,  1.3755908 ,
        -3.9352734 ,  5.3010893 ,  0.2568175 ,  2.396293  ,  2.465444  ,
         1.8259434 ,  3.6453576 ,  3.233116  ,  1.7985865 , -1.6551371 ,
        -3.1117454 , -1.9132982 ,  1.0402396 , -2.0969384 ,  6.3776207 ,
        -0.7166837 ,  0.73049974,  4.1996527 , -0.3848365 ,  2.322695  ,
         1.3548968 , -0.3900317 , -2.1874418 ,  1.4151862 , -2.5942433 ,
        -3.6459203 ,  8.592173  ,  2.3462574 , -2.190395  ,  0.99342966,
         4.6254807 ,  3.665181  , -1.6076263 , -0.50503594, -1.0242015 ,
         2.0642269 , -1.5744766 , -2.3988478 , -0.9301293 , -2.0386758 ,
        -1.917383  ,  2.2761846 ,  1.5927812 ,  1.4246876 , -3.4618948 ,
        -3.4321864 ,  0.86566037,  6.31276   ,  4.107195  ,  1.4319881 ,
         5.2046328 ,  3.1581497 , -3.4473693 ,  7.0224338 ,  2.496395  ,
        -3.2614818 , -1.5519826 ,  0.27111453,  2.755383  ,  0.94470805,
         0.4065004 ,  0.76320934, -1.4922376 , -0.7

In [16]:
mags_df = pd.DataFrame (list_of_mag_vectors)
mags_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.403569,-4.097857,0.441721,-1.147778,1.375591,-3.935273,5.301089,0.256817,2.396293,2.465444,...,-1.964326,-0.821487,1.552016,-3.859475,-3.917797,0.386338,1.526248,1.708964,2.890493,-2.453544
1,-0.354514,-2.665838,1.619215,-0.146361,1.664685,-2.873812,2.862468,0.812865,1.111019,1.888380,...,-0.351842,-1.037667,1.708731,-2.450855,-2.640466,0.408503,1.479363,1.830384,2.086777,-1.233071
2,-0.542583,7.642497,1.121906,-0.300285,-3.902902,3.650409,-3.858173,-3.338632,-3.248994,-4.137262,...,-0.028813,2.883260,-2.425049,4.723054,2.731140,-1.646981,-1.216646,-2.569994,-1.560324,1.042046
3,2.045064,4.941150,0.106128,0.316174,-2.586119,2.590436,-0.933800,-2.439459,-2.231836,-2.023832,...,-2.398793,1.094581,-2.139761,3.910742,0.227961,-1.526857,-1.212860,-3.209470,-1.518929,0.462747
4,-1.126303,8.579502,1.146511,0.225343,-5.010315,3.642467,-4.392977,-4.275597,-3.021706,-5.000117,...,-0.073279,2.957838,-2.535977,5.057989,3.472293,-1.504143,-1.534439,-3.081459,-1.232088,1.504446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,1.112774,5.573669,0.674946,-0.041394,-3.550853,2.789464,-1.539359,-3.141921,-3.044721,-2.874783,...,-1.061333,1.086871,-2.890047,3.773283,1.444222,-1.597849,-1.189063,-3.369953,-1.158864,0.907829
73,0.982042,5.371077,0.459525,0.480391,-3.345948,1.804958,-1.738433,-2.943676,-2.404052,-2.410188,...,-1.656888,1.292474,-2.364439,3.598442,1.270896,-1.336313,-0.778616,-3.109831,-1.272962,0.912159
74,-0.779419,-2.516735,1.258990,0.392266,1.136436,-3.080913,3.004478,-0.316845,1.156177,1.063708,...,-0.986541,-0.412610,0.786822,-2.239142,-1.988834,0.514551,1.612411,1.338068,2.296900,-1.190793
75,-1.662307,-2.940595,0.787765,-0.370578,1.820011,-4.091670,3.496829,0.251294,1.745731,1.908235,...,-0.846204,-0.518816,1.695340,-3.392995,-2.626649,0.780962,1.845701,3.118144,2.799217,-1.388280


## Data preprocessing

In [17]:
mag_scaler = preprocessing.StandardScaler()
scaled_mags_df = mag_scaler.fit_transform(mags_df)
scaled_mags_df

array([[-0.27620496, -1.45901848, -1.60196046, ...,  1.17487643,
         1.55521398, -1.84712009],
       [-0.22357218, -1.15746059,  1.64683705, ...,  1.22691103,
         1.09732133, -0.95624363],
       [-0.42535784,  1.01329229,  0.27472285, ..., -0.65887482,
        -0.98050405,  0.70446411],
       ...,
       [-0.67946781, -1.126062  ,  0.65294831, ...,  1.01592833,
         1.2170323 , -0.92538303],
       [-1.62674979, -1.21531958, -0.64719891, ...,  1.77878169,
         1.50321229, -1.06953743],
       [ 0.44247278, -0.59236055,  0.41309269, ...,  0.41410786,
         1.14638471, -1.23528094]])

## Clustering

### 1. K-means

In [18]:
k_range_end = int(math.sqrt(num_of_mags)) # Usually it is sqrt(# of mags)

k_range = range(1, k_range_end)

k_mean_models = [KMeans (n_clusters = i, random_state = SEED) for i in k_range]
k_scores = [k_mean_model.fit(scaled_mags_df).score(scaled_mags_df) for k_mean_model in k_mean_models]
k_data = pd.DataFrame ({'k_range':k_range, 'k_scores':k_scores})

In [19]:
k_num_chart = alt.Chart(data = k_data).mark_line().encode(
    alt.X ('k_range:Q'),
    alt.Y ('k_scores:Q')
)

k_num_chart

In [20]:
# We can see from the chart above that 6 or 7 clusters are optimal for this task (where END = 25 MAGs)
num_of_clusters = 4

k_means_model = KMeans (n_clusters = num_of_clusters, random_state = SEED)
k_means_predicted = k_means_model.fit_predict(scaled_mags_df)
k_means_predicted

array([3, 3, 2, 1, 2, 2, 1, 3, 1, 2, 3, 2, 1, 3, 1, 1, 0, 2, 2, 2, 0, 2,
       3, 0, 2, 2, 2, 2, 0, 3, 2, 2, 0, 0, 3, 3, 1, 0, 2, 1, 3, 2, 1, 2,
       2, 3, 1, 0, 2, 0, 2, 2, 2, 1, 1, 0, 3, 3, 3, 2, 2, 0, 1, 3, 2, 1,
       2, 3, 1, 3, 3, 3, 1, 1, 3, 3, 0], dtype=int32)

In [21]:
k_means_chart = visualize_with_pca (scaled_mags_df, k_means_predicted, k_means_model.cluster_centers_)
k_means_chart


### 2. OPTICS

In [22]:
MIN_SAMPLES = 4

optics_model = OPTICS (min_samples = MIN_SAMPLES, n_jobs = NUM_OF_WORKERS)
optics_predicted = optics_model.fit_predict (scaled_mags_df)
optics_predicted

array([-1,  0,  4, -1,  4,  4, -1,  0,  3,  4,  0, -1, -1,  0, -1,  3,  1,
       -1,  4,  3, -1,  4, -1,  1,  4, -1,  4,  4, -1, -1, -1, -1, -1,  1,
       -1, -1, -1, -1,  4, -1, -1,  4, -1, -1,  3,  0,  3,  1, -1, -1, -1,
       -1,  4,  3,  3,  1,  2,  0,  0,  3,  4, -1, -1,  1, -1, -1, -1,  2,
       -1,  2, -1,  1,  3,  3,  0,  2, -1])

In [23]:
# Visualize clusters, since there are no centroids, we are sending bogus array
optics_chart = visualize_with_pca (scaled_mags_df, optics_predicted, np.empty([optics_predicted.shape[0], 1], dtype=int))
optics_chart


In [24]:
# Side by side comparison
cluster_comparison_chart = alt.hconcat (k_means_chart, optics_chart).resolve_scale(color='independent')
cluster_comparison_chart

## Evaluation

In [25]:
eval_k_means = metrics.silhouette_score (scaled_mags_df, k_means_predicted)
eval_optics = metrics.silhouette_score (scaled_mags_df, optics_predicted)

print ('Silhouette scores: [best = 1, worst = -1]')
print ('\t1. K-means:', eval_k_means)
print ('\t2. OPTICS:', eval_optics)

Silhouette scores: [best = 1, worst = -1]
	1. K-means: 0.2984907840493
	2. OPTICS: 0.014349883460489373


## Visualizing rMAGs with time axis

In [26]:
time_chart = visualize_temporal_mags (scaled_mags_df, fasta_names, START_DATE, END)
time_chart

In [27]:
save_charts ([k_means_chart, optics_chart, cluster_comparison_chart, time_chart], ['genomics_k_means_chart.png', 'genomics_optics_chart.png', 'genomics_cluster_comparison_chart.png', 'genomics_time_chart.png'])

/home/alexein-work/miniconda3/share/vega-lite-cli/node_modules/vega-lite/build/vega-lite.js:736
    const copy = { ...obj
                   ^^^

SyntaxError: Unexpected token ...
    at createScript (vm.js:56:10)
    at Object.runInThisContext (vm.js:97:10)
    at Module._compile (module.js:542:28)
    at Object.Module._extensions..js (module.js:579:10)
    at Module.load (module.js:487:32)
    at tryModuleLoad (module.js:446:12)
    at Function.Module._load (module.js:438:3)
    at Module.require (module.js:497:17)
    at require (internal/module.js:20:19)
    at Object.<anonymous> (/home/alexein-work/miniconda3/share/vega-lite-cli/node_modules/vega-lite/bin/vl2vg:6:18)


CalledProcessError: Command '['/home/alexein-work/miniconda3/bin/vl2vg']' returned non-zero exit status 1.

---
# METABOLOMIC ANALYSIS
---
## Importing Metabolomic data

In [None]:
metabolomics_file_name = os.path.join(path_normalised_metabolomics, os.listdir(path_normalised_metabolomics)[0])
metabolomics_df = pd.read_csv (metabolomics_file_name, delimiter = '\t')
metabolomics_df

## Data preprocessing

In [None]:
metabolomics_df['date'] = pd.to_datetime(metabolomics_df['date'])
metabolomics_df.insert (0, 'date', metabolomics_df.pop('date'))
metabolomics_df.sort_values ('date', inplace = True, ignore_index = True)
metabolomics_df

In [None]:
# Changing metabolite name if it is unknown
metabolomics_df.loc[metabolomics_df['known_type'].eq('unknown'), 'Metabolite'] = np.nan
metabolomics_df

In [None]:
print ('Dataset uniqueness:')
print ('\t1. Timestamps:', len(metabolomics_df['date'].unique()))
print ('\t2. Metabolites:', len(metabolomics_df['Metabolite'].unique()))
print ('\t3. Types:', len(metabolomics_df['type'].unique()))
print ('\t4. Known types:', len(metabolomics_df['known_type'].unique()))
print ('\t5. Ns:', len(metabolomics_df['N'].unique()))
print ('\t6. Type 2s:', len(metabolomics_df['type2'].unique()))
print ('\t7. Measurements:', len(metabolomics_df['measurement'].unique()))

In [None]:
# Saving the name column and removing unnecessairy columns
#metabolite_names = metabolomics_df['Metabolite']
#metabolomics_df.drop(labels = ['Metabolite', 'tp', 'KEGG.Compound.ID', 'Chebi.Name', 'Chebi.Name_combined'], axis = 1, inplace = True)
metabolomics_df.drop(labels = ['tp', 'KEGG.Compound.ID', 'Chebi.Name', 'Chebi.Name_combined'], axis = 1, inplace = True)
metabolomics_df

In [None]:
# Dummy eencoding categorical data
scaled_metabolomics_df = pd.get_dummies(metabolomics_df, columns = ['type', 'known_type', 'N', 'type2', 'measurement'])
scaled_metabolomics_df

In [None]:
# Standardizing data
metabolomics_scaler = preprocessing.StandardScaler()
scaled_metabolomics_df[['means', 'medians', 'sds', 'se', 'ci']] = metabolomics_scaler.fit_transform(metabolomics_df[['means', 'medians', 'sds', 'se', 'ci']])

scaled_metabolomics_df

In [None]:
metabolomics_df.dropna(inplace = True)
metabolomics_df.reset_index(drop=True, inplace=True)
metabolomics_df

## Time series examination

In [None]:
def visualize_metabolites (data, temporal_column, metabolite_column, type_columns):
    
    data_seasoned = season_data (data, temporal_column)
    
    # Extract columns with float values
    float_columns = []
    
    for i in data_seasoned.columns:
        if data_seasoned[i].dtypes == 'float64' or data_seasoned[i].dtypes == 'float32':
            float_columns.append(i)
    
    # Create repeated chart with varying size encodings
    chart = alt.Chart(data_seasoned).mark_point(opacity = 1).encode(
        alt.X (temporal_column, type = 'temporal', scale = alt.Scale (nice = True)),
        alt.Y (metabolite_column, type = 'nominal'),
        alt.Size (alt.repeat("row"), type = 'quantitative'),
        alt.Color ('season:N', scale = alt.Scale (range = ['blue', 'green', 'orange', 'brown'])),
        alt.Tooltip (type_columns, type = 'nominal')
    ).properties(
        width = 1200
    ).repeat(
        row = float_columns
    ).resolve_scale(color = 'independent', size = 'independent')#.interactive()
    
    return chart


In [None]:
metabolites_chart = visualize_metabolites(metabolomics_df, 'date', 'Metabolite', ['type', 'type2', 'measurement', 'N'])
metabolites_chart

In [None]:
save_charts ([metabolites_chart], ['metabolomics_metabolites_chart.png'])

## Clustering

In [None]:
# Deep learning temporal clustering

# Should I even do this? Previous visualizations are descriptive enough. It would be a lot of work for not much benefit


---
# PROTEOMIC ANALYSIS
---
## Importing Proteomic data

In [None]:
# I could create something similar to Fig. 5 of the original paper, where I would calculate mean of different proteomic feature values for each rMAG calculated by days
# So I would have a table: date | feature 1 | feature 2 | ...
# Where each feature is mean of all values for one day of each MAG in that rMAG

In [None]:
def import_proteomics (end = 25, path_proteomics = path_proteomics_78):
    
    print ('Importing proteomics data')
    
    # There are 78 FASTA files
    # I have to traverse every FASTA file, and in each file every protein sequence
    
    fasta_files = [i for i in os.listdir(path_proteomics) if (i[-3:] == 'faa')]
    tmp_all = []
    
    # This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_proteomics, fasta_file_name), 'r') as input_file:
                
                one_mag_list = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Analyzing protein (peptide) and creating list of values for one MAG
                    sequence = str(fasta_string.seq)
                    
                    if '*' in sequence:
                        continue
                    
                    else:
                    
                        sequence_analysis = ProteinAnalysis (sequence)
                        
                        tmp_list = [sequence_analysis.molecular_weight(), sequence_analysis.gravy(), sequence_analysis.aromaticity(), sequence_analysis.instability_index(), sequence_analysis.isoelectric_point()]
                        
                        tmp_sec_str = sequence_analysis.secondary_structure_fraction()
                        tmp_list += [tmp_sec_str[0], tmp_sec_str[1], tmp_sec_str[2]]
                        tmp_list.append (sequence.count('K') + sequence.count('R') - sequence.count('D') - sequence.count('E')) # Electricity
                        
                        amino_acid_perc = sequence_analysis.get_amino_acids_percent()
                        
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'AGILPV']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'STNQ']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'QNHSTYCMW']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'AGILPVF']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'HKR']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'CM']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'DE']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'NQ']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'ST']))
                        
                        # Now I put all these values in one_mag_list as a numpy arrays
                        one_mag_list.append(np.asarray(tmp_list))
                        
                # Now I put one mag values, aggregated by mean, into the all mag list
                tmp_all.append (np.asarray(one_mag_list).mean (axis = 0))
    
    
    COLUMN_LIST = ['Molecular weight', 'Gravy', 'Aromaticity', 'Instability index', 'Isoelectric point', 'Secondary structure fraction 0', 'Secondary structure fraction 1', 'Secondary structure fraction 2', 'Electricity', 'Fraction aliphatic', 'Fraction uncharged polar', 'Fraction polar', 'Fraction hydrophobic', 'Fraction positive', 'Fraction sulfur', 'Fraction negative', 'Fraction amide', 'Fraction alcohol']
    all_mag_df = pd.DataFrame (tmp_all, columns = COLUMN_LIST)
    
    print ('Finished importing')
    
    return all_mag_df

def visualize_proteomics (data):
    
    # Adding another column that replaces temporal data for now
    if 'Index_tmp' not in data.columns:
        data.insert (0, 'Index_tmp', data.index.values)
    
    # Create repeated chart
    chart = alt.Chart(data).mark_area().encode(
        alt.X ('Index_tmp', type = 'quantitative'),
        alt.Y (alt.repeat('row'), type = 'quantitative'),
    ).properties(
        width = 1200
    ).repeat(
        row = data.columns.values
    )#.resolve_scale(color = 'independent', size = 'independent')#.interactive()
    
    return chart


In [None]:
proteomics_data = import_proteomics (end = num_of_proteomics)
proteomics_data

In [None]:
chart_proteomics = visualize_proteomics(proteomics_data)
chart_proteomics

In [None]:
save_charts ([chart_proteomics], ['proteomics_chart_proteomics.png'])

---
# PHYSICO-CHEMICAL ANALYSIS
---
## Importing Physico-chemical data

In [None]:
phy_che_file_name = os.path.join(path_physico_chemical, [i for i in os.listdir(path_physico_chemical) if (i.endswith(('.tsv', '.csv')))][1])
phy_che_df = pd.read_csv (phy_che_file_name, decimal = ',')
phy_che_df

## Data preprocessing

In [None]:
phy_che_df.drop(index = 0, axis = 1, inplace = True)
phy_che_df['Date'] = pd.to_datetime(phy_che_df['Date'])
phy_che_df['Time'] = pd.to_timedelta(phy_che_df["Time"], unit = 'h')
phy_che_df

In [None]:
filtered_phy_che_df = phy_che_df[(phy_che_df['Date'] >= '2011-03-21') & (phy_che_df['Date'] <= '2012-05-03')]
tmp_column = pd.Series(filtered_phy_che_df['Date'] + filtered_phy_che_df['Time'])

filtered_phy_che_df.drop (['Date', 'Time'], axis = 1, inplace = True)
filtered_phy_che_df.reset_index(inplace = True, drop = True)
filtered_phy_che_df = filtered_phy_che_df.apply(lambda x: pd.to_numeric(x.astype(str).str.replace(',','.')))#, errors='coerce'))
filtered_phy_che_df.insert (0, 'DateTime', tmp_column.values)
filtered_phy_che_df

In [None]:
# Visualize temperature, air_temperature, conductivity, inflow_pH, nitrate, oxygen, pH

def visualize_phy_che (data, temporal_column, list_of_columns):
    
    # Create repeated chart
    chart = alt.Chart(data).mark_line().encode(
        alt.X (temporal_column, type = 'temporal'),#, timeUnit = 'month'),
        alt.Y (alt.repeat('row'), type = 'quantitative'),
    ).properties(
        width = 1200
    ).repeat(
        row = list_of_columns
    )#.resolve_scale(color = 'independent', size = 'independent')#.interactive()
    
    return chart

def visualize_phy_che_heatmap (data):
    
    new_data = data.drop('DateTime', axis = 1)
    corr = new_data.corr().reset_index().melt('index')
    corr.columns = ['var_1', 'var_2', 'correlation']
    
    # Create correlation chart
    chart = alt.Chart(corr).mark_rect().encode(
        alt.X ('var_1', title = None, axis = alt.Axis(labelAngle = -45)),
        alt.Y ('var_2', title = None),
        alt.Color('correlation', legend=None, scale = alt.Scale(scheme='redblue', reverse = True)),
    ).properties(
        width = alt.Step(40),
        height = alt.Step(40)
    )
    
    chart += chart.mark_text(size = 12).encode(
        alt.Text ('correlation', format=".2f"),
        color = alt.condition("abs(datum.correlation) > 0.5", alt.value('white'), alt.value('black'))
    )
    
    return chart.transform_filter("datum.var_1 < datum.var_2") # This returns only lower triangle


In [None]:
chart_phy_che = visualize_phy_che (filtered_phy_che_df, 'DateTime', filtered_phy_che_df.columns.values[4:])
chart_phy_che_corr = visualize_phy_che_heatmap (filtered_phy_che_df)
chart_phy_che_corr

In [None]:
chart_phy_che

In [None]:
save_charts ([chart_phy_che_corr, chart_phy_che], ['physico_chemical_chart_psy_che_corr.png', 'physico_chemical_chart_psy_che.png'])