In [4]:
import pandas as pd
import numpy as np
import altair as alt
import os, random
import gensim
import datetime as dt
import altair_saver
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from gensim.models import Word2Vec
from sklearn.cluster import KMeans, OPTICS
from sklearn.decomposition import PCA
from sklearn import preprocessing, model_selection, metrics


In [23]:
# Checking if I am using GPU
import tensorflow as tf
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

tf.config.list_physical_devices('GPU') 

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13088356531004289744
]


[]

## Defining paths for each and every omic

In [6]:
path_root_data = os.path.join ('..', 'Data', 'Extracted', 'Databases')

path_all_fasta = os.path.join (path_root_data, 'fasta_files', 'AllBins')
path_genomics_78 = os.path.join (path_root_data, 'fasta_files', 'rmags_filtered')
path_normalised_metabolomics = os.path.join (path_root_data, 'Metabolomics', 'Normalised_Tables')
path_proteomics_78 = os.path.join (path_root_data, 'Proteomics', 'set_of_78')
path_physico_chemical = os.path.join (path_root_data, 'PhysicoChemical')

path_model_save_root = 'Saved_models'
path_figures_save_root = 'Output_figures'


In [7]:
num_of_mags = len([i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')])
num_of_proteomics = len([i for i in os.listdir(path_proteomics_78) if (i[-3:] == 'faa')])
SEED = 42
END = 25
MAX_ROWS = 15000
EPOCHS = 10
NUM_OF_WORKERS = 3
random.seed(SEED)
np.random.seed(SEED)
alt.data_transformers.enable('default', max_rows = MAX_ROWS) # Important if you want to visualize datasets with >5000 samples


DataTransformerRegistry.enable('default')

---
# GENOMIC ANALYSIS
---
## MAG-related functions

**Important**: I should review the way I look at MAGs. The names of all fasta files beggining with 'D_##' represent the days those MAGs were obtained. Therefore, I should look at this also as timeseries data. Also, maybe I should only consider 78 MAGs, and not all ~1300.
After some consideration, I conclude that I should definetly use only 78 MAGs, because that way I wouldn't be tied to meta-omics data only. I also thinked about what should I visualize in that case. One idea is that I should also encode those MAGs with word2wec, and then make a 3D chart where one dimension is time, and other two dimensions would be PCA dimensions of those MAGs. I could also use this function to visualize proteomics data if I want.

Another important thing is that I should actually include FASTA headers and possibly use only them. That way, I could make figures like in a relevant paper where MAGs are groupped according to their taxonomy etc. I should look more into this.

In [8]:
# Function that saves charts from list_of_charts with names from list_of_names
def save_charts (list_of_chart, list_of_names):
    
    for chart, name in zip(list_of_chart, list_of_names):
        altair_saver.save(chart, os.path.join (path_figures_save_root, name))
    

In [9]:
# Function that splits each genome into k-mers thus creating even longer sentence (MAG)
# It returns tokenized genome i.e. [kmer, kmer,...]
def split_genome (genome, k = 5):
    new_genome = []
    n = len(genome)
    
    if n-k <=0:
        return genome
    else:
        for i in range(n-k):
            new_genome.append(genome[i:i+k])
        
        return new_genome


def vectorize_one_mag (one_mag, w2v_model):
    
    # We have to generate vectors for each word in one MAG and then create vector representation of that MAG
    # by averaging vectors of its words
    zero_vector = np.zeros(w2v_model.vector_size)
    word_vectors = []
    one_mag_vector = []
    
    for sentence in one_mag:
        for word in sentence:
            if word in w2v_model.wv:
                try:
                    word_vectors.append(w2v_model.wv[word])
                except KeyError:
                    print ('Key Error')
                    continue
    
    if word_vectors:
        word_vectors = np.asarray(word_vectors)
        one_mag_vector = word_vectors.mean (axis=0)
    
    else:
        one_mag_vector = zero_vector
    
    return one_mag_vector


# Function that vectorizes a MAG (document) with a pretrained word2vec model. It returns vector representation of a given MAG
# Vectorization is done by averaging word (k-mer) vectors for the whole document (MAG)
def vectorize_mags (w2v_model, end = 25):
    
    print ('Vectorizing MAGs')
    
    fasta_files = [i for i in os.listdir(path_all_fasta) if (i[-2:] == 'fa')]
    list_of_mag_vectors = []
    
    # This was done so that I could work with first 'end' FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_all_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    
                # Vectorizing MAGs one by one
                list_of_mag_vectors.append (vectorize_one_mag (one_mag, w2v_model))
    
    print ('Finished vectorizing')
    
    return list_of_mag_vectors
    
    

# If one wants to import MAGs in order to vectorize them, one should use start argument in order to skip first 'start' MAGs
# If one wants to import MAGs to train word2vec model, one should use only end argument, so that first 'end' MAGs are used for training
# Todo: Implement randomisation in picking MAGs for training, and don't use first 'start' MAGs for training
# (create list of indexes from 0 to 1364, use sklearn split train test, then traverse directory and use only MAGs with indexes in train for training w2v)
def import_mags_and_build_model (end = 25, path_all_fasta = path_all_fasta):
    
    print ('Importing MAGs')
    
    # There are 1364 MAGs enclosed in FASTA files
    # I have to traverse every FASTA file, and in each file every sequence
    
    fasta_files = [i for i in os.listdir(path_all_fasta) if (i.endswith('fa') and i.startswith('D'))]
    fasta_ids = []
    
    # This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_all_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                one_mag_ids = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    # Save FASTA ids for every MAG
                    one_mag_ids.append(str(fasta_string.id))
                    
                # Save list of ids for one MAG in global list
                fasta_ids.append(one_mag_ids)
                
                # If we do not have a model, we build one
                if i == 0:
                    print ('Building w2v model')
                    # We build our model on the first MAG
                    w2v_model = Word2Vec (sentences = one_mag, size = 100, workers = NUM_OF_WORKERS, seed=SEED)
                
                # Else we just expand its vocabulary
                else:
                    # Now we expand our vocabulary
                    w2v_model.build_vocab (one_mag, update = True)
                    
    print ('Finished building')
    
    return w2v_model, fasta_files, fasta_ids


def train_model (w2v_model, epochs, end = 25):
    
    print ('Starting model training')
    
    # There are 1364 MAGs enclosed in FASTA files
    # I have to traverse every FASTA file, and in each file every sequence
    
    fasta_files = [i for i in os.listdir(path_all_fasta) if (i.endswith('fa') and i.startswith('D'))]
    
    # This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_all_fasta, fasta_file_name), 'r') as input_file:
                
                one_mag = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Get kmers of a genome and create a sentence (list of words)
                    temp_kmers = split_genome (str(fasta_string.seq))
                    
                    # Create a document (list of sentences)
                    one_mag.append(temp_kmers)
                    
                
                w2v_model.train (one_mag, total_examples = w2v_model.corpus_count, epochs = epochs)
                    
    print ('Model training finished')
    
    return w2v_model


def visualize_with_pca (data, labels, centers):
    
    pca_model = PCA (n_components = 2, random_state = SEED)
    data_transformed = pca_model.fit_transform (data)
    
    data_transformed = pd.DataFrame (data_transformed)
    data_transformed.columns = ['PC_1', 'PC_2']
    data_transformed['Labels'] = labels
    
    chart_data = alt.Chart(data_transformed).mark_circle(opacity = 1).encode(
        alt.X ('PC_1:Q'),
        alt.Y ('PC_2:Q'),
        alt.Color ('Labels:N', legend = alt.Legend())
    )
    
    # This means we are visualising centroids from k_means (there are less centroids that data points)
    if labels.shape[0] != centers.shape[0]:
        
        centers_transformed = pca_model.fit_transform (centers)
        centers_transformed = pd.DataFrame (centers_transformed)
        centers_transformed.columns = ['PC_1', 'PC_2']
        
        chart_centers = alt.Chart(centers_transformed).mark_point(shape = 'diamond', color = 'black', size = 50, opacity = 0.7).encode(
            alt.X ('PC_1:Q'),
            alt.Y ('PC_2:Q'),
        )
        
        return chart_data + chart_centers
    
    # For DBSCAN there are no centroids
    else:
        return chart_data



## MAG examination

In [10]:
# FOR CLUSTERING I SHOULD CREATE A DATAFRAME WITH MAGs INDEXES AND THEIR VECTOR REPRESENTATIONS
final_model, fasta_names, fasta_ids = import_mags_and_build_model (end = END, path_all_fasta = path_genomics_78)


Importing MAGs
Building w2v model
Finished building


In [11]:


# IT APPEARS THAT I CANNOT USE THIS NAMES, SINCE THEY ARE NOT TIED TO EXACT ORGANISMS. THEREFORE, I WILL ONLY TRY TO CLUSTER THEM


fasta_names

['D42_G14.fa',
 'D05_L3.17.fa',
 'D44_G14.fa',
 'D36_L2.1.6.2.fa',
 'D36_L1.15.fa',
 'D20_O1.13.fa',
 'D12_L1.3.fa',
 'D47_P30.fa',
 'D15_G4.fa',
 'D33_L1.8.2.fa',
 'D37_L8.fa',
 'D32_O1.57.fa',
 'D04_O2.19.fa',
 'D30_O2.4.2.2.2.fa',
 'D20_P23.fa',
 'D51_G1.1.2.fa',
 'D08_P1.44.fa',
 'D37_G1.25.1.fa',
 'D39_O1.57.fa',
 'D23_P13.fa',
 'D04_G2.13.fa',
 'D05_G3.4.fa',
 'D49_G15.fa',
 'D47_O14.fa',
 'D22_G1.42.2.fa',
 'D08_O6.fa',
 'D11_O1.27.3.fa',
 'D35_G2.61.fa',
 'D20_G1.24.fa',
 'D20_O1.17.fa',
 'D46_L2.48.fa',
 'D49_O1.fa',
 'D16_L3.1.1.fa',
 'D31_O1.3.2.fa',
 'D28_L2.9.fa',
 'D15_L1.3.fa',
 'D44_O1.1.2.2.1.2.7.fa',
 'D47_O5.fa',
 'D13_L7.fa',
 'D35_G2.23.fa',
 'D04_L6.fa',
 'D42_G1.1.2.3.fa',
 'D29_L5.fa',
 'D28_O5.fa',
 'D30_P23.fa',
 'D10_O1.45.fa',
 'D26_L1.23.fa',
 'D36_G7.fa',
 'D04_G2.5.fa',
 'D51_G9.fa',
 'D51_L1.24.fa',
 'D22_L1.5.4.fa',
 'D37_G5.fa',
 'D15_O1.7.1.2.1.fa',
 'D05_G3.14.1.fa',
 'D15_G1.8.fa',
 'D19_G1.34.1.1.fa',
 'D36_L1.41.fa',
 'D11_O1.7.fa',
 'D41_O4.fa',


In [12]:
# Train model. It tooks ~10 minutes for END = 25 amount of MAGs
final_model = train_model (final_model, epochs = EPOCHS, end = END)


Starting model training


KeyboardInterrupt: 

In [None]:
final_model.wv.save_word2vec_format(os.path.join (path_model_save_root, 'model_78.bin'), binary=True) 

Now I should vectorize documents with this model. For further use, I could save this model's weights, and use it to vectorize all mags. That would take a lot, but every MAG will have its vector representation
> This could be done by importing one MAG at a time, then tokenizing it (like before), then getting vector representations of that MAG's sentences (genomes) and then finding the vector representation of the whole MAG (document). If I do that for one MAG at a time, There is no need to worry about memory


In [None]:
list_of_mag_vectors = vectorize_mags (final_model, end = END)
list_of_mag_vectors[0:2]

In [None]:
mags_df = pd.DataFrame (list_of_mag_vectors)
mags_df

## Data preprocessing

In [None]:
mag_scaler = preprocessing.StandardScaler()
scaled_mags_df = mag_scaler.fit_transform(mags_df)
scaled_mags_df

## Clustering

### 1. K-means

In [None]:
k_range_end = 15 # Usually it is sqrt(# of mags)

k_range = range(1, k_range_end)

k_mean_models = [KMeans (n_clusters = i, random_state = SEED) for i in k_range]
k_scores = [k_mean_model.fit(scaled_mags_df).score(scaled_mags_df) for k_mean_model in k_mean_models]
k_data = pd.DataFrame ({'k_range':k_range, 'k_scores':k_scores})

In [None]:
k_num_chart = alt.Chart(data = k_data).mark_line().encode(
    alt.X ('k_range:Q'),
    alt.Y ('k_scores:Q')
)

k_num_chart

In [None]:
# We can see from the chart above that 6 or 7 clusters are optimal for this task (where END = 25 MAGs)
num_of_clusters = 7

k_means_model = KMeans (n_clusters = num_of_clusters, random_state = SEED)
k_means_predicted = k_means_model.fit_predict(scaled_mags_df)
k_means_predicted

In [None]:
k_means_chart = visualize_with_pca (scaled_mags_df, k_means_predicted, k_means_model.cluster_centers_)
k_means_chart


### 2. OPTICS

In [None]:
MIN_SAMPLES = 3

optics_model = OPTICS (min_samples = MIN_SAMPLES, n_jobs = NUM_OF_WORKERS)
optics_predicted = optics_model.fit_predict (scaled_mags_df)
optics_predicted

In [None]:
# Visualize clusters, since there are no centroids, we are sending bogus array
optics_chart = visualize_with_pca (scaled_mags_df, optics_predicted, np.empty([optics_predicted.shape[0], 1], dtype=int))
optics_chart


In [None]:
# Side by side comparison
alt.hconcat (k_means_chart, optics_chart).resolve_scale(color='independent')

## Evaluation

In [19]:
eval_k_means = metrics.silhouette_score (scaled_mags_df, k_means_predicted)
eval_optics = metrics.silhouette_score (scaled_mags_df, optics_predicted)

print ('Silhouette scores: [best = 1, worst = -1]')
print ('\t1. K-means:', eval_k_means)
print ('\t2. OPTICS:', eval_optics)

Silhouette scores: [best = 1, worst = -1]
	1. K-means: 0.3545105482425982
	2. OPTICS: 0.2898402877092666


---
# METABOLOMIC ANALYSIS
---
## Importing Metabolomic data

In [20]:
metabolomics_file_name = os.path.join(path_normalised_metabolomics, os.listdir(path_normalised_metabolomics)[0])
metabolomics_df = pd.read_csv (metabolomics_file_name, delimiter = '\t')
metabolomics_df

Unnamed: 0,Metabolite,tp,date,type,known_type,means,medians,sds,N,se,ci,type2,measurement,KEGG.Compound.ID,Chebi.Name,Chebi.Name_combined
0,!Phosphoric_acid_3TMS,D1,2012-04-17,snp,known,0.757113,0.737943,0.098650,3,0.056956,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
1,!Phosphoric_acid_3TMS,D10,2012-01-25,snp,known,0.720935,0.873509,0.266593,3,0.153918,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
2,!Phosphoric_acid_3TMS,D11,2012-03-22,snp,known,0.625047,0.652268,0.057820,3,0.033382,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
3,!Phosphoric_acid_3TMS,D12,2012-01-19,snp,known,0.964956,0.941878,0.053754,3,0.031035,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
4,!Phosphoric_acid_3TMS,D13,2011-05-13,snp,known,0.963984,0.974479,0.071659,3,0.041373,4.302653,extracellular,nonpolar,C00009,phosphoric acid,phosphoric acid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,Valine_2TMS,D51,2011-08-11,bp,known,0.704821,0.478251,0.428368,3,0.247319,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18866,Valine_2TMS,D6,2011-06-03,bp,known,1.085888,1.197471,0.487484,3,0.281449,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18867,Valine_2TMS,D7,2012-04-04,bp,known,2.958023,2.813145,0.341213,3,0.197000,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine
18868,Valine_2TMS,D8,2012-02-08,bp,known,2.515980,2.322248,0.618556,3,0.357123,4.302653,intracellular,polar,"C16436,C06417,C00183",valine;D-valine;L-valine,valine;D-valine;L-valine


## Data preprocessing

In [21]:
metabolomics_df['date'] = pd.to_datetime(metabolomics_df['date'])
metabolomics_df.insert (0, 'date', metabolomics_df.pop('date'))
metabolomics_df.sort_values ('date', inplace = True, ignore_index = True)
metabolomics_df

Unnamed: 0,date,Metabolite,tp,type,known_type,means,medians,sds,N,se,ci,type2,measurement,KEGG.Compound.ID,Chebi.Name,Chebi.Name_combined
0,2011-03-21,Leucine_2TMS,D37,sp,known,1.440122,1.489979,0.146440,3,0.084547,4.302653,extracellular,polar,"C16439,C00123,C01570",leucine;L-leucine;D-leucine,leucine;L-leucine;D-leucine
1,2011-03-21,No match: 1181.78_EM_SP_D1_1_223,D37,sp,unknown,3.778469,3.801417,1.924377,3,1.111040,4.302653,extracellular,polar,,,
2,2011-03-21,unknown#emu_WW_2144.25,D37,bnp,unknown,0.949545,0.947209,0.065422,3,0.037771,4.302653,intracellular,nonpolar,,,
3,2011-03-21,Unknown#bth_pae_001,D37,bp,unknown,2.095116,1.809891,1.916896,3,1.106721,4.302653,intracellular,polar,,,
4,2011-03-21,No match: 2220.34_EM_BP_D1_1_192,D37,bp,unknown,1.631643,1.571125,0.168501,3,0.097284,4.302653,intracellular,polar,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,D48,bnp,known,1.364105,1.457780,0.170311,3,0.098329,4.302653,intracellular,nonpolar,D01924,octadecan-1-ol,octadecan-1-ol
18866,2012-05-03,No match: 1447.93_EM_BP_D2_1_57,D48,bp,unknown,4.815774,1.160165,6.835474,3,3.946463,4.302653,intracellular,polar,,,
18867,2012-05-03,Lactulose_8TMS,D48,bp,known,0.697489,0.684339,0.049632,3,0.028655,4.302653,intracellular,polar,C07064,lactulose,lactulose
18868,2012-05-03,No match: 2004.78_EM_BNP_D10_1_205,D48,bnp,unknown,0.785996,0.744529,0.139901,3,0.080772,4.302653,intracellular,nonpolar,,,


In [22]:
# Changing metabolite name if it is unknown
metabolomics_df.loc[metabolomics_df['known_type'].eq('unknown'), 'Metabolite'] = np.nan
metabolomics_df

Unnamed: 0,date,Metabolite,tp,type,known_type,means,medians,sds,N,se,ci,type2,measurement,KEGG.Compound.ID,Chebi.Name,Chebi.Name_combined
0,2011-03-21,Leucine_2TMS,D37,sp,known,1.440122,1.489979,0.146440,3,0.084547,4.302653,extracellular,polar,"C16439,C00123,C01570",leucine;L-leucine;D-leucine,leucine;L-leucine;D-leucine
1,2011-03-21,,D37,sp,unknown,3.778469,3.801417,1.924377,3,1.111040,4.302653,extracellular,polar,,,
2,2011-03-21,,D37,bnp,unknown,0.949545,0.947209,0.065422,3,0.037771,4.302653,intracellular,nonpolar,,,
3,2011-03-21,,D37,bp,unknown,2.095116,1.809891,1.916896,3,1.106721,4.302653,intracellular,polar,,,
4,2011-03-21,,D37,bp,unknown,1.631643,1.571125,0.168501,3,0.097284,4.302653,intracellular,polar,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,D48,bnp,known,1.364105,1.457780,0.170311,3,0.098329,4.302653,intracellular,nonpolar,D01924,octadecan-1-ol,octadecan-1-ol
18866,2012-05-03,,D48,bp,unknown,4.815774,1.160165,6.835474,3,3.946463,4.302653,intracellular,polar,,,
18867,2012-05-03,Lactulose_8TMS,D48,bp,known,0.697489,0.684339,0.049632,3,0.028655,4.302653,intracellular,polar,C07064,lactulose,lactulose
18868,2012-05-03,,D48,bnp,unknown,0.785996,0.744529,0.139901,3,0.080772,4.302653,intracellular,nonpolar,,,


In [23]:
print ('Dataset uniqueness:')
print ('\t1. Timestamps:', len(metabolomics_df['date'].unique()))
print ('\t2. Metabolites:', len(metabolomics_df['Metabolite'].unique()))
print ('\t3. Types:', len(metabolomics_df['type'].unique()))
print ('\t4. Known types:', len(metabolomics_df['known_type'].unique()))
print ('\t5. Ns:', len(metabolomics_df['N'].unique()))
print ('\t6. Type 2s:', len(metabolomics_df['type2'].unique()))
print ('\t7. Measurements:', len(metabolomics_df['measurement'].unique()))

Dataset uniqueness:
	1. Timestamps: 51
	2. Metabolites: 86
	3. Types: 4
	4. Known types: 2
	5. Ns: 2
	6. Type 2s: 2
	7. Measurements: 2


In [24]:
# Saving the name column and removing unnecessairy columns
#metabolite_names = metabolomics_df['Metabolite']
#metabolomics_df.drop(labels = ['Metabolite', 'tp', 'KEGG.Compound.ID', 'Chebi.Name', 'Chebi.Name_combined'], axis = 1, inplace = True)
metabolomics_df.drop(labels = ['tp', 'KEGG.Compound.ID', 'Chebi.Name', 'Chebi.Name_combined'], axis = 1, inplace = True)
metabolomics_df

Unnamed: 0,date,Metabolite,type,known_type,means,medians,sds,N,se,ci,type2,measurement
0,2011-03-21,Leucine_2TMS,sp,known,1.440122,1.489979,0.146440,3,0.084547,4.302653,extracellular,polar
1,2011-03-21,,sp,unknown,3.778469,3.801417,1.924377,3,1.111040,4.302653,extracellular,polar
2,2011-03-21,,bnp,unknown,0.949545,0.947209,0.065422,3,0.037771,4.302653,intracellular,nonpolar
3,2011-03-21,,bp,unknown,2.095116,1.809891,1.916896,3,1.106721,4.302653,intracellular,polar
4,2011-03-21,,bp,unknown,1.631643,1.571125,0.168501,3,0.097284,4.302653,intracellular,polar
...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,bnp,known,1.364105,1.457780,0.170311,3,0.098329,4.302653,intracellular,nonpolar
18866,2012-05-03,,bp,unknown,4.815774,1.160165,6.835474,3,3.946463,4.302653,intracellular,polar
18867,2012-05-03,Lactulose_8TMS,bp,known,0.697489,0.684339,0.049632,3,0.028655,4.302653,intracellular,polar
18868,2012-05-03,,bnp,unknown,0.785996,0.744529,0.139901,3,0.080772,4.302653,intracellular,nonpolar


In [25]:
# Dummy eencoding categorical data
scaled_metabolomics_df = pd.get_dummies(metabolomics_df, columns = ['type', 'known_type', 'N', 'type2', 'measurement'])
scaled_metabolomics_df

Unnamed: 0,date,Metabolite,means,medians,sds,se,ci,type_bnp,type_bp,type_snp,type_sp,known_type_known,known_type_unknown,N_2,N_3,type2_extracellular,type2_intracellular,measurement_nonpolar,measurement_polar
0,2011-03-21,Leucine_2TMS,1.440122,1.489979,0.146440,0.084547,4.302653,0,0,0,1,1,0,0,1,1,0,0,1
1,2011-03-21,,3.778469,3.801417,1.924377,1.111040,4.302653,0,0,0,1,0,1,0,1,1,0,0,1
2,2011-03-21,,0.949545,0.947209,0.065422,0.037771,4.302653,1,0,0,0,0,1,0,1,0,1,1,0
3,2011-03-21,,2.095116,1.809891,1.916896,1.106721,4.302653,0,1,0,0,0,1,0,1,0,1,0,1
4,2011-03-21,,1.631643,1.571125,0.168501,0.097284,4.302653,0,1,0,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,1.364105,1.457780,0.170311,0.098329,4.302653,1,0,0,0,1,0,0,1,0,1,1,0
18866,2012-05-03,,4.815774,1.160165,6.835474,3.946463,4.302653,0,1,0,0,0,1,0,1,0,1,0,1
18867,2012-05-03,Lactulose_8TMS,0.697489,0.684339,0.049632,0.028655,4.302653,0,1,0,0,1,0,0,1,0,1,0,1
18868,2012-05-03,,0.785996,0.744529,0.139901,0.080772,4.302653,1,0,0,0,0,1,0,1,0,1,1,0


In [26]:
# Standardizing data
metabolomics_scaler = preprocessing.StandardScaler()
scaled_metabolomics_df[['means', 'medians', 'sds', 'se', 'ci']] = metabolomics_scaler.fit_transform(metabolomics_df[['means', 'medians', 'sds', 'se', 'ci']])

scaled_metabolomics_df

Unnamed: 0,date,Metabolite,means,medians,sds,se,ci,type_bnp,type_bp,type_snp,type_sp,known_type_known,known_type_unknown,N_2,N_3,type2_extracellular,type2_intracellular,measurement_nonpolar,measurement_polar
0,2011-03-21,Leucine_2TMS,-0.145146,-0.094812,-0.189837,-0.190047,-0.128175,0,0,0,1,1,0,0,1,1,0,0,1
1,2011-03-21,,0.148026,0.316083,-0.000023,-0.000270,-0.128175,0,0,0,1,0,1,0,1,1,0,0,1
2,2011-03-21,,-0.206653,-0.191297,-0.198486,-0.198695,-0.128175,1,0,0,0,0,1,0,1,0,1,1,0
3,2011-03-21,,-0.063026,-0.037942,-0.000822,-0.001068,-0.128175,0,1,0,0,0,1,0,1,0,1,0,1
4,2011-03-21,,-0.121134,-0.080386,-0.187482,-0.187692,-0.128175,0,1,0,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18865,2012-05-03,Octadecanol_1TMS,-0.154677,-0.100535,-0.187288,-0.187499,-0.128175,1,0,0,0,1,0,0,1,0,1,1,0
18866,2012-05-03,,0.278079,-0.153441,0.524288,0.523941,-0.128175,0,1,0,0,0,1,0,1,0,1,0,1
18867,2012-05-03,Lactulose_8TMS,-0.238254,-0.238027,-0.200172,-0.200380,-0.128175,0,1,0,0,1,0,0,1,0,1,0,1
18868,2012-05-03,,-0.227158,-0.227327,-0.190535,-0.190745,-0.128175,1,0,0,0,0,1,0,1,0,1,1,0


In [27]:
metabolomics_df.dropna(inplace = True)
metabolomics_df.reset_index(drop=True, inplace=True)
metabolomics_df

Unnamed: 0,date,Metabolite,type,known_type,means,medians,sds,N,se,ci,type2,measurement
0,2011-03-21,Leucine_2TMS,sp,known,1.440122,1.489979,0.146440,3,0.084547,4.302653,extracellular,polar
1,2011-03-21,Glycerol-3-phosphoric_acid_4TMS,bp,known,1.462276,1.524821,0.204146,3,0.117864,4.302653,intracellular,polar
2,2011-03-21,Putrescine_4TMS,bp,known,0.667033,0.667946,0.145202,3,0.083832,4.302653,intracellular,polar
3,2011-03-21,Glyceric_acid_3TMS,sp,known,1.195441,1.312780,0.294056,3,0.169774,4.302653,extracellular,polar
4,2011-03-21,Proline_2TMS,sp,known,1.874113,2.038239,0.541314,3,0.312528,4.302653,extracellular,polar
...,...,...,...,...,...,...,...,...,...,...,...,...
6880,2012-05-03,Glycerol-3-phosphoric_acid_4TMS,bnp,known,0.950065,0.984881,0.098055,3,0.056612,4.302653,intracellular,nonpolar
6881,2012-05-03,1-Monooleoylglycerol_2TMS,bnp,known,1.300665,1.342977,0.078951,3,0.045582,4.302653,intracellular,nonpolar
6882,2012-05-03,Octadecanol_1TMS,bnp,known,1.364105,1.457780,0.170311,3,0.098329,4.302653,intracellular,nonpolar
6883,2012-05-03,Lactulose_8TMS,bp,known,0.697489,0.684339,0.049632,3,0.028655,4.302653,intracellular,polar


## Time series examination

In [28]:
# This function creates new dataframe with column that represent season according to date
# It also concatenate important types with metabolite names
def season_data (data, temporal_column):
    new_df = data
    new_df['season'] = new_df[temporal_column].dt.month%12 // 3 + 1
    
    #important_types = [metabolite_column] + important_types
    #new_df['new_name'] = df[important_types].agg('\n'.join, axis=1)
    
    return new_df

def visualize_metabolites (data, temporal_column, metabolite_column, type_columns):
    
    data_seasoned = season_data (data, temporal_column)
    
    # Extract columns with float values
    float_columns = []
    
    for i in data_seasoned.columns:
        if data_seasoned[i].dtypes == 'float64' or data_seasoned[i].dtypes == 'float32':
            float_columns.append(i)
    
    # Create repeated chart with varying size encodings
    chart = alt.Chart(data_seasoned).mark_point(opacity = 1).encode(
        alt.X (temporal_column, type = 'temporal', scale = alt.Scale (nice = True)),
        alt.Y (metabolite_column, type = 'nominal'),
        alt.Size (alt.repeat("row"), type = 'quantitative'),
        alt.Color ('season:N', scale = alt.Scale (range = ['blue', 'green', 'orange', 'brown'])),
        alt.Tooltip (type_columns, type = 'nominal')
    ).properties(
        width = 1200
    ).repeat(
        row = float_columns
    ).resolve_scale(color = 'independent', size = 'independent')#.interactive()
    
    return chart


In [29]:
metabolites_chart = visualize_metabolites(metabolomics_df, 'date', 'Metabolite', ['type', 'type2', 'measurement', 'N'])
metabolites_chart

## Clustering

In [30]:
# Deep learning temporal clustering

# Should I even do this? Previous visualizations are descriptive enough. It would be a lot of work for not much benefit


---
# PROTEOMIC ANALYSIS
---
## Importing Proteomic data

In [31]:
# I could create something similar to Fig. 5 of the original paper, where I would calculate mean of different proteomic feature values for each rMAG calculated by days
# So I would have a table: date | feature 1 | feature 2 | ...
# Where each feature is mean of all values for one day of each MAG in that rMAG

In [32]:
def import_proteomics (end = 25, path_proteomics = path_proteomics_78):
    
    print ('Importing proteomics data')
    
    # There are 78 FASTA files
    # I have to traverse every FASTA file, and in each file every protein sequence
    
    fasta_files = [i for i in os.listdir(path_proteomics) if (i[-3:] == 'faa')]
    tmp_all = []
    
    # This was done so that I could work with first 100 FASTA files only. Otherwise, I should just remove: i, and enumerate
    for i, fasta_file_name in enumerate(fasta_files):
        
        if i == end:
            break
        
        else:
            with open(os.path.join(path_proteomics, fasta_file_name), 'r') as input_file:
                
                one_mag_list = []
                for fasta_string in SeqIO.parse(input_file, "fasta"):
                    
                    # Analyzing protein (peptide) and creating list of values for one MAG
                    sequence = str(fasta_string.seq)
                    
                    if '*' in sequence:
                        continue
                    
                    else:
                    
                        sequence_analysis = ProteinAnalysis (sequence)
                        
                        tmp_list = [sequence_analysis.molecular_weight(), sequence_analysis.gravy(), sequence_analysis.aromaticity(), sequence_analysis.instability_index(), sequence_analysis.isoelectric_point()]
                        
                        tmp_sec_str = sequence_analysis.secondary_structure_fraction()
                        tmp_list += [tmp_sec_str[0], tmp_sec_str[1], tmp_sec_str[2]]
                        tmp_list.append (sequence.count('K') + sequence.count('R') - sequence.count('D') - sequence.count('E')) # Electricity
                        
                        amino_acid_perc = sequence_analysis.get_amino_acids_percent()
                        
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'AGILPV']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'STNQ']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'QNHSTYCMW']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'AGILPVF']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'HKR']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'CM']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'DE']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'NQ']))
                        tmp_list.append (sum ([amino_acid_perc[aa] for aa in 'ST']))
                        
                        # Now I put all these values in one_mag_list as a numpy arrays
                        one_mag_list.append(np.asarray(tmp_list))
                        
                # Now I put one mag values, aggregated by mean, into the all mag list
                tmp_all.append (np.asarray(one_mag_list).mean (axis = 0))
    
    
    COLUMN_LIST = ['Molecular weight', 'Gravy', 'Aromaticity', 'Instability index', 'Isoelectric point', 'Secondary structure fraction 0', 'Secondary structure fraction 1', 'Secondary structure fraction 2', 'Electricity', 'Fraction aliphatic', 'Fraction uncharged polar', 'Fraction polar', 'Fraction hydrophobic', 'Fraction positive', 'Fraction sulfur', 'Fraction negative', 'Fraction amide', 'Fraction alcohol']
    all_mag_df = pd.DataFrame (tmp_all, columns = COLUMN_LIST)
    
    print ('Finished importing')
    
    return all_mag_df

def visualize_proteomics (data):
    
    # Adding another column that replaces temporal data for now
    if 'Index_tmp' not in data.columns:
        data.insert (0, 'Index_tmp', data.index.values)
    
    # Create repeated chart
    chart = alt.Chart(data).mark_area().encode(
        alt.X ('Index_tmp', type = 'quantitative'),
        alt.Y (alt.repeat('row'), type = 'quantitative'),
    ).properties(
        width = 1200
    ).repeat(
        row = data.columns.values
    )#.resolve_scale(color = 'independent', size = 'independent')#.interactive()
    
    return chart


In [33]:
proteomics_data = import_proteomics (end = num_of_proteomics)
proteomics_data

Importing proteomics data
Finished importing


Unnamed: 0,Molecular weight,Gravy,Aromaticity,Instability index,Isoelectric point,Secondary structure fraction 0,Secondary structure fraction 1,Secondary structure fraction 2,Electricity,Fraction aliphatic,Fraction uncharged polar,Fraction polar,Fraction hydrophobic,Fraction positive,Fraction sulfur,Fraction negative,Fraction amide,Fraction alcohol
0,30107.370303,-0.029115,0.069055,39.655099,7.354626,0.305178,0.210988,0.303285,-1.273648,0.497255,0.173165,0.269161,0.529845,0.126935,0.032892,0.100698,0.074195,0.098970
1,31945.929807,-0.157285,0.104198,34.785878,7.207581,0.344477,0.222770,0.243080,-0.647059,0.399968,0.210687,0.318818,0.453415,0.134969,0.037200,0.112978,0.092672,0.118014
2,33336.676155,-0.045767,0.059752,37.570359,6.049116,0.285099,0.230651,0.306529,-10.212190,0.506151,0.164821,0.247837,0.533880,0.114913,0.029888,0.124475,0.047787,0.117033
3,26058.479897,-0.104694,0.072250,39.041305,7.346492,0.303804,0.205971,0.294666,-0.469168,0.461895,0.187834,0.289298,0.500347,0.134007,0.040014,0.103999,0.090910,0.096924
4,27971.040966,-0.232816,0.101992,33.917894,7.294352,0.347434,0.220156,0.240268,0.034398,0.379836,0.223348,0.317693,0.435946,0.143279,0.031380,0.120165,0.103245,0.120103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,24586.140194,-0.113981,0.060095,39.992533,6.646171,0.279006,0.234656,0.280482,-4.912468,0.477694,0.180007,0.270632,0.508087,0.127144,0.036477,0.118583,0.054836,0.125171
74,33364.056584,-0.161289,0.100820,33.432398,7.590524,0.332487,0.218691,0.250484,1.369316,0.407491,0.207281,0.314093,0.458395,0.136934,0.038155,0.109319,0.088083,0.119198
75,26741.991077,-0.108067,0.073667,37.581279,7.504971,0.296754,0.221329,0.286249,-1.240703,0.469162,0.169856,0.278743,0.504968,0.140227,0.041462,0.105626,0.060045,0.109811
76,34303.306631,0.032637,0.059614,35.771707,6.769888,0.290662,0.225830,0.310594,-4.963504,0.521625,0.160698,0.242854,0.549332,0.119270,0.028500,0.110294,0.045722,0.114976


In [34]:
chart_proteomics = visualize_proteomics(proteomics_data)
chart_proteomics

---
# PHYSICO-CHEMICAL ANALYSIS
---
## Importing Physico-chemical data

In [35]:
phy_che_file_name = os.path.join(path_physico_chemical, [i for i in os.listdir(path_physico_chemical) if (i.endswith(('.tsv', '.csv')))][1])
phy_che_df = pd.read_csv (phy_che_file_name, decimal = ',')
phy_che_df

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Date,Time,Inflow_Volume,Inflow_pH,Inflow_conductivity,T,outf_preclarifier_volume,outf_preclarifier_COD,outf_preclarifier_NH4-N,Abschlagmenge_Durchfluß,...,Rücklaufschlamm,Überfallmenge1_Durchfluß,Überfallmenge2_Durchfluß,Rücklaufschlamm_TS,Menge_Ablauf_Durchfluß,Outflow_pH,Outflow_Conductivity,Menge_Fäkalien_Durchfluß,Menge_Fäkalien_pH,Menge_Fäkalien_Conductivity
0,,,m3/h,pH,µS/cm,C,m3/h,mg/l,mg/l,m3/h,...,m3/h,m3/h,m3/h,g/l,m3/h,pH,mS/cm,m3/h,pH,mS/cm
1,07/03/02,1.0,437.13,8.55,5.10,43.80,0.0956,334.13,0.00,7.11,...,1365.87,778,754,10.94,0.00,2.00,0.00,0.0000,8.3266,0.0529
2,07/03/02,3.0,308.08,8.52,5.15,46.92,0.0166,331.81,0.00,4.13,...,1362.75,773,718,10.40,0.00,2.00,0.00,0.0000,8.3185,0.0528
3,07/03/02,5.0,255.27,8.49,5.30,49.24,0.0258,319.13,0.00,5.74,...,1359.20,772,707,9.84,0.00,2.00,0.00,0.0000,8.3260,0.0532
4,07/03/02,7.0,261.22,8.42,5.36,39.20,0.0916,303.37,0.00,5.33,...,1355.74,770,709,9.63,0.00,2.00,0.00,0.0000,8.2952,0.0521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61138,11/19/18,15.0,599.03,8.09,1615.81,16.29,144.8233,195.59,80.00,6.14,...,534.42,287,286,8.81,440.26,7.19,1069.06,0.0000,2.0000,0.0000
61139,11/19/18,17.0,579.19,7.97,1574.42,16.34,147.6642,300.74,80.00,5.57,...,488.12,268,270,9.17,410.50,7.18,1072.12,0.0000,2.0000,0.0000
61140,11/19/18,19.0,696.77,7.88,1476.28,15.96,148.2223,469.70,80.00,4.99,...,575.44,309,311,8.58,482.34,7.17,1073.38,0.0000,2.0000,0.0000
61141,11/19/18,21.0,942.94,7.83,1366.18,15.45,150.4223,384.14,70.51,3.77,...,900.10,478,477,5.75,739.48,7.14,1091.47,0.0000,2.0000,0.0000


## Data preprocessing

In [36]:
phy_che_df.drop(index = 0, axis = 1, inplace = True)
phy_che_df['Date'] = pd.to_datetime(phy_che_df['Date'])
phy_che_df['Time'] = pd.to_timedelta(phy_che_df["Time"], unit = 'h')
phy_che_df

Unnamed: 0,Date,Time,Inflow_Volume,Inflow_pH,Inflow_conductivity,T,outf_preclarifier_volume,outf_preclarifier_COD,outf_preclarifier_NH4-N,Abschlagmenge_Durchfluß,...,Rücklaufschlamm,Überfallmenge1_Durchfluß,Überfallmenge2_Durchfluß,Rücklaufschlamm_TS,Menge_Ablauf_Durchfluß,Outflow_pH,Outflow_Conductivity,Menge_Fäkalien_Durchfluß,Menge_Fäkalien_pH,Menge_Fäkalien_Conductivity
1,2002-07-03,0 days 01:00:00,437.13,8.55,5.10,43.80,0.0956,334.13,0.00,7.11,...,1365.87,778,754,10.94,0.00,2.00,0.00,0.0000,8.3266,0.0529
2,2002-07-03,0 days 03:00:00,308.08,8.52,5.15,46.92,0.0166,331.81,0.00,4.13,...,1362.75,773,718,10.40,0.00,2.00,0.00,0.0000,8.3185,0.0528
3,2002-07-03,0 days 05:00:00,255.27,8.49,5.30,49.24,0.0258,319.13,0.00,5.74,...,1359.20,772,707,9.84,0.00,2.00,0.00,0.0000,8.3260,0.0532
4,2002-07-03,0 days 07:00:00,261.22,8.42,5.36,39.20,0.0916,303.37,0.00,5.33,...,1355.74,770,709,9.63,0.00,2.00,0.00,0.0000,8.2952,0.0521
5,2002-07-03,0 days 09:00:00,439.54,8.53,6.24,40.48,0.0895,288.80,0.00,4.93,...,1349.17,774,709,9.81,0.00,2.00,0.00,0.0000,8.3044,0.0523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61138,2018-11-19,0 days 15:00:00,599.03,8.09,1615.81,16.29,144.8233,195.59,80.00,6.14,...,534.42,287,286,8.81,440.26,7.19,1069.06,0.0000,2.0000,0.0000
61139,2018-11-19,0 days 17:00:00,579.19,7.97,1574.42,16.34,147.6642,300.74,80.00,5.57,...,488.12,268,270,9.17,410.50,7.18,1072.12,0.0000,2.0000,0.0000
61140,2018-11-19,0 days 19:00:00,696.77,7.88,1476.28,15.96,148.2223,469.70,80.00,4.99,...,575.44,309,311,8.58,482.34,7.17,1073.38,0.0000,2.0000,0.0000
61141,2018-11-19,0 days 21:00:00,942.94,7.83,1366.18,15.45,150.4223,384.14,70.51,3.77,...,900.10,478,477,5.75,739.48,7.14,1091.47,0.0000,2.0000,0.0000


In [37]:
filtered_phy_che_df = phy_che_df[(phy_che_df['Date'] >= '2011-03-21') & (phy_che_df['Date'] <= '2012-05-03')]
tmp_column = pd.Series(filtered_phy_che_df['Date'] + filtered_phy_che_df['Time'])

filtered_phy_che_df.drop (['Date', 'Time'], axis = 1, inplace = True)
filtered_phy_che_df.reset_index(inplace = True, drop = True)
filtered_phy_che_df = filtered_phy_che_df.apply(lambda x: pd.to_numeric(x.astype(str).str.replace(',','.')))#, errors='coerce'))
filtered_phy_che_df.insert (0, 'DateTime', tmp_column.values)
filtered_phy_che_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,DateTime,Inflow_Volume,Inflow_pH,Inflow_conductivity,T,outf_preclarifier_volume,outf_preclarifier_COD,outf_preclarifier_NH4-N,Abschlagmenge_Durchfluß,Stapelbehälter_Durchfluß,...,Rücklaufschlamm,Überfallmenge1_Durchfluß,Überfallmenge2_Durchfluß,Rücklaufschlamm_TS,Menge_Ablauf_Durchfluß,Outflow_pH,Outflow_Conductivity,Menge_Fäkalien_Durchfluß,Menge_Fäkalien_pH,Menge_Fäkalien_Conductivity
0,2011-03-21 01:00:00,587.82,6.38,2150.46,15.09,0.0046,951.78,39.74,3.64,156.79,...,916.98,432.0,431.0,4.93,574.32,7.13,905.88,0.0203,5.1371,66.9787
1,2011-03-21 03:00:00,492.41,6.33,2215.49,14.76,0.0042,937.98,42.32,3.92,158.46,...,840.24,374.0,373.0,5.06,495.70,7.16,907.40,0.0153,5.2199,66.9394
2,2011-03-21 05:00:00,454.61,6.29,2212.68,14.51,0.0083,930.48,41.47,3.79,159.22,...,799.92,348.0,348.0,5.04,462.31,7.17,897.46,0.0154,5.2995,66.3648
3,2011-03-21 07:00:00,505.10,6.34,2212.21,14.39,0.0042,924.90,39.83,3.75,158.51,...,768.65,342.0,342.0,5.08,460.75,7.18,905.26,0.0155,5.3578,66.5581
4,2011-03-21 09:00:00,610.22,6.20,2205.67,14.56,12.2000,916.73,28.85,2.54,0.22,...,863.17,415.0,417.0,4.98,558.43,7.15,905.27,0.0156,5.2081,66.3478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4819,2012-05-03 15:00:00,1236.79,8.57,1094.55,17.60,4.9685,225.49,39.17,1.22,0.00,...,1651.32,894.0,894.0,8.51,1200.06,7.26,791.09,0.0062,3.9065,69.2477
4820,2012-05-03 17:00:00,1286.00,8.40,1001.93,17.50,0.0020,70.78,23.17,1.05,0.00,...,1694.97,915.0,915.0,8.42,1223.47,7.27,806.07,0.0065,3.8800,69.6208
4821,2012-05-03 19:00:00,1297.70,8.39,997.82,17.39,0.0037,58.38,21.00,1.03,0.00,...,1749.27,950.0,950.0,8.37,1265.73,7.31,812.59,0.0067,3.8692,69.6072
4822,2012-05-03 21:00:00,1296.84,8.45,1044.01,17.69,0.0029,61.13,22.61,1.16,0.00,...,1750.20,952.0,950.0,8.32,1268.07,7.36,804.93,0.0061,3.8935,69.1437


In [38]:
# Visualize temperature, air_temperature, conductivity, inflow_pH, nitrate, oxygen, pH

def visualize_psy_che (data, temporal_column, list_of_columns):
    
    # Create repeated chart
    chart = alt.Chart(data).mark_line().encode(
        alt.X (temporal_column, type = 'temporal'),#, timeUnit = 'month'),
        alt.Y (alt.repeat('row'), type = 'quantitative'),
    ).properties(
        width = 1200
    ).repeat(
        row = list_of_columns
    )#.resolve_scale(color = 'independent', size = 'independent')#.interactive()
    
    return chart

def visualize_psy_che_heatmap (data):
    
    new_data = data.drop('DateTime', axis = 1)
    corr = new_data.corr().reset_index().melt('index')
    corr.columns = ['var_1', 'var_2', 'correlation']
    
    # Create correlation chart
    chart = alt.Chart(corr).mark_rect().encode(
        alt.X ('var_1', title = None, axis = alt.Axis(labelAngle = -45)),
        alt.Y ('var_2', title = None),
        alt.Color('correlation', legend=None, scale = alt.Scale(scheme='redblue', reverse = True)),
    ).properties(
        width = alt.Step(40),
        height = alt.Step(40)
    )
    
    chart += chart.mark_text(size = 12).encode(
        alt.Text ('correlation', format=".2f"),
        color = alt.condition("abs(datum.correlation) > 0.5", alt.value('white'), alt.value('black'))
    )
    
    return chart.transform_filter("datum.var_1 < datum.var_2") # This returns only lower triangle


In [39]:
chart_psy_che = visualize_psy_che (filtered_phy_che_df, 'DateTime', filtered_phy_che_df.columns.values[4:])
chart_psy_che_corr = visualize_psy_che_heatmap (filtered_phy_che_df)
chart_psy_che_corr

In [40]:
chart_psy_che