In [None]:
"""
Explore how the immuno-landscape of individual COVID-19 patients evolves over time.

FCS files found under ``base_path`` are indexed by parsing their file names; UMAP
embeddings are then computed and plotted for one patient's samples, sorted by the
date field taken from the file name.
"""
# Root folder that is searched recursively for .fcs files; the parsed file-name
# table is also saved in this folder.
base_path = "../DataSets/2020_COVID19/"

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline
import umap
import umap.plot
import os
from pathlib import Path
import fcsparser
from matplotlib.pyplot import figure

In [None]:
# Recursively collect every FCS file below base_path.  The two lists are parallel:
# entry k of fcs_file_names is the bare file name of entry k of fcs_file_paths.
file_name_pattern = "*.fcs"

fcs_file_paths = list(Path(base_path).rglob(file_name_pattern))
fcs_file_names = [fcs_path.name for fcs_path in fcs_file_paths]

def split_file_name_into_parts(filename):
    """Split an FCS file name into [accession, patient, sample, date, tube].

    The expected layout is ``<accession> <patient> PB <date>_<tube>.<ext>``, where
    the tube part starts with ``WB_`` or ``PBMC_`` (``_WB_`` is looked for first).
    Some names lack the `` PB `` token; in that case the patient is the text
    between the first and second space and the date follows the second space.

    Args:
        filename: Bare file name (no directory part).

    Returns:
        A list ``[accession, patient, sample, date, tube]`` of strings.  ``sample``
        is always ``"PB"`` once an accession could be separated, because every case
        in this study is peripheral blood.  Any part that cannot be located is
        returned as ``"unknown"``; when that happens the file name and the parsed
        parts are printed so the troublesome name can be inspected.
    """
    unknown = "unknown"
    accession = patient = sample = date = tube = unknown

    # Locate the tube marker; "_WB_" takes precedence over "_PBMC_".
    tube_start = -1
    for marker in ("_WB_", "_PBMC_"):
        marker_pos = filename.find(marker)
        if marker_pos != -1:
            tube_start = marker_pos + 1  # skip the leading underscore
            break

    if tube_start != -1:
        # The tube runs up to the first period *after* the marker, so periods that
        # occur earlier in the name (e.g. inside the date) cannot truncate it.
        extension_pos = filename.find(".", tube_start)
        if extension_pos == -1:
            tube = filename[tube_start:]
        else:
            tube = filename[tube_start:extension_pos]
        head = filename[:tube_start - 1]
    else:
        # No tube marker: leave tube as "unknown" and parse everything before the
        # file extension as "<accession> <patient> [PB] <date>".
        extension_pos = filename.rfind(".")
        head = filename if extension_pos == -1 else filename[:extension_pos]

    first_space = head.find(" ")
    if first_space != -1:
        accession = head[:first_space]
        sample = "PB"  # All the cases in this study are peripheral blood.
        sample_pos = head.find(" PB ")
        if sample_pos != -1:
            patient = head[first_space + 1:sample_pos]
            date = head[sample_pos + len(" PB "):]
        else:
            # Troublesome case with no " PB " in the name.
            second_space = head.find(" ", first_space + 1)
            if second_space != -1:
                patient = head[first_space + 1:second_space]
                date = head[second_space + 1:]
            else:
                patient = head[first_space + 1:]

    parts = [accession, patient, sample, date, tube]
    if unknown in parts:
        print(filename)
        print(parts)
    return parts
        
# Parse every collected file name and assemble one table row per FCS file.
parsed_file_names = list(map(split_file_name_into_parts, fcs_file_names))

df = pd.DataFrame(parsed_file_names)
df.columns = ['Accession', 'Patient', 'Sample', 'Date', 'Tube']
# Keep the full path and bare file name alongside the parsed fields.
df = df.assign(File_Path=fcs_file_paths, File_Name=fcs_file_names)
# 'Date' is left as the text taken from the file name (no datetime conversion).

print(df.shape)
df.head(5)

In [None]:
# For the patient and tube columns, report how many distinct values there are
# and list them.
for column_name in ('Patient', 'Tube'):
    distinct_values = df[column_name].unique()
    print(len(distinct_values))
    print(distinct_values)

In [None]:
# Save the parsed file-name table inside the data folder.  Joining with Path keeps
# the target correct whether or not base_path ends with a separator, and
# index=False is the documented boolean for omitting the row index.
df.to_excel(Path(base_path) / "parsed_file_names.xlsx", index=False)

In [None]:
# Make a dataframe of the patient with the most samples.
# Series.mode() returns every tied value (in sorted order); keep at most the first
# so that a tie cannot mix several patients into what is meant to be one
# patient's time series.
most_sampled_patient = df['Patient'].mode().to_list()[:1]
patient_samples = (
    df[df['Patient'].isin(most_sampled_patient)]
    .sort_values(by='Date')
)
# NOTE(review): 'Date' is the text parsed from the file name, so this ordering is
# lexicographic -- confirm that it matches chronological order for the date
# format used in these file names.
patient_samples.head(1000)

In [None]:
# Distinct tube names recorded for the selected patient, in the order in which
# they first appear in patient_samples.
tube_types = patient_samples['Tube'].unique()
print(tube_types)

In [None]:
# Restrict the table to the Treg tube for the consecutive UMAPs.
# The tube is picked by name rather than by position: tube_types is ordered by
# first appearance, so index 0 is not guaranteed to be the Treg tube.  When no
# tube name contains "treg" the first tube type is used, as before.
# NOTE(review): the name match is a heuristic -- confirm against the printed tube_types.
treg_tube_candidates = [tube for tube in tube_types if 'treg' in str(tube).lower()]
treg_tube = treg_tube_candidates[0] if treg_tube_candidates else tube_types[0]

patient_Treg_samples = (
    patient_samples[patient_samples['Tube'] == treg_tube]
    .sort_values(by='Date')
)
patient_Treg_samples.head(10)

In [None]:
# Load the first Treg sample (first row after sorting by Date) and remove the
# channels that are not used for the embedding.
path = patient_Treg_samples['File_Path'].iloc[0]
meta, data = fcsparser.parse(path, reformat_meta=True)
meta['_channels_']

for unused_channel in ('Time', 'FSC-H', 'SSC-H'):
    del data[unused_channel]
print(data.shape)
data.head()

In [None]:
# Use every event in the file; the commented-out line is a disabled variant that
# keeps only the first 200000 rows.
#subdata = data.values[0:200000, :]
subdata = data.values
def transform_FCS_data(fcs_data_values):
    """Log10-transform raw FCS event values.

    Values below 1 are raised to 1 before taking the logarithm, so the result is
    never negative and zero/negative inputs do not produce -inf or NaN.

    Args:
        fcs_data_values: Numeric array (e.g. ``DataFrame.values``) of event data.

    Returns:
        A new array of the same shape holding ``log10(max(value, 1))``.  The input
        is left untouched; the earlier implementation clamped it in place through
        an alias, which modified the caller's array.
    """
    # np.maximum allocates a fresh array, so the caller's data is never modified.
    clamped_values = np.maximum(fcs_data_values, 1)
    return np.log10(clamped_values)

# Clamp values below 1 and take log10 before computing the embedding.
subdata = transform_FCS_data(subdata)

In [None]:
fit = umap.UMAP()
%time u = fit.fit(subdata)

In [None]:
# Plot the fitted embedding.  umap.plot is already imported in the notebook's
# first cell, so it is not re-imported here.
umap.plot.points(u)

In [None]:
%time u = fit.transform(subdata)
plt.scatter(u[:,0], u[:,1], s=0.01)
plt.title('UMAP');
plt.show()

from matplotlib.pyplot import figure
figure(num=None, figsize=(4, 3), dpi=150, facecolor='w', edgecolor='k')

#Plot it as a hexbin heatmap.
plt.hexbin(u[:,0], u[:,1], gridsize=(150,150), cmap=plt.cm.Purples)
plt.colorbar()
plt.title('UMAP, Treg tube, ' + patient_Treg_samples['Date'].to_list()[0])
plt.show()


In [None]:
#Now demonstrate the use of the embedding on a different set of data from the same specimen.

subdata = transform_FCS_data(data.values[200000:, :])
%time u = fit.transform(subdata)

plt.scatter(u[:,0], u[:,1], s=0.01)
plt.title('UMAP');
plt.show()

from matplotlib.pyplot import figure
figure(num=None, figsize=(4, 3), dpi=150, facecolor='w', edgecolor='k')

#Plot it as a hexbin heatmap.
plt.hexbin(u[:,0], u[:,1], gridsize=(150,150), cmap=plt.cm.Purples)
plt.colorbar()
plt.title('UMAP, Treg tube, ' + patient_Treg_samples['Date'].to_list()[0])
plt.show()


In [None]:
#Okay, now I am going to make the embedding using all points in the data.  
#Then, I will transform the other files and plot.

path = patient_Treg_samples['File_Path'].to_list()[0]
meta, data = fcsparser.parse(path, reformat_meta=True)
del data['Time']
del data['FSC-H']
del data['SSC-H']
print(data.shape)

data = transform_FCS_data(data.values[0:250000,:])

fit = umap.UMAP()
%time u = fit.fit(data)

#umap.plot.points(u)
#plt.show()

%time u = fit.transform(data)

"""
#Make a scatter plot.
figure(num=None, figsize=(6, 4.5), dpi=300, facecolor='w', edgecolor='k')
plt.scatter(u[:,0], u[:,1], s=0.005)
plt.title('UMAP, Treg tube, ' + patient_Treg_samples['Date'].to_list()[0])
plt.show()

#Plot it as a hexbin heatmap.
figure(num=None, figsize=(6, 4.5), dpi=300, facecolor='w', edgecolor='k')
plt.hexbin(u[:,0], u[:,1], gridsize=(100,100))#, cmap=plt.cm.Reds)
plt.colorbar()
plt.title('UMAP, Treg tube, ' + patient_Treg_samples['Date'].to_list()[0])
plt.show()


#
figure(num=None, figsize=(6, 4.5), dpi=300, facecolor='w', edgecolor='k')
plt.hist2d(u[:,0], u[:,1],bins=100)
plt.show()
"""
# Hex joint plot of the embedding coordinates.
# x and y are passed by keyword: current seaborn releases no longer accept them
# positionally, and the keyword form also works on older releases.
# NOTE(review): `bins` is presumably forwarded to the joint hexbin call rather
# than setting the number of hexagons (that is `gridsize`) -- confirm the intent.
with sns.axes_style("white"):
    sns.jointplot(x=u[:,0], y=u[:,1], kind="hex", color="k", bins=100);
  

In [None]:
for i, path in enumerate(patient_Treg_samples['File_Path'].to_list()):
    meta, data = fcsparser.parse(path, reformat_meta=True)
    del data['Time']
    del data['FSC-H']
    del data['SSC-H']
    data = transform_FCS_data(data.values)
    %time u = fit.transform(data)
    
    #Make a scatter plot.
    figure(num=None, figsize=(4, 3), dpi=150, facecolor='w', edgecolor='k')
    plt.scatter(u[:,0], u[:,1], s=0.01, alpha=0.5)
    plt.title('UMAP, Treg tube, ' + patient_Treg_samples['Date'].to_list()[i])
    plt.show()
    
    #Plot it as a hexbin heatmap.
    figure(num=None, figsize=(4, 3), dpi=150, facecolor='w', edgecolor='k')
    plt.hexbin(u[:,0], u[:,1], gridsize=(150,150), cmap=plt.cm.Purples)
    plt.colorbar()
    plt.title('UMAP, Treg tube, ' + patient_Treg_samples['Date'].to_list()[i])
    plt.show()



In [None]:
#Just plot the hexbins
for i, path in enumerate(patient_Treg_samples['File_Path'].to_list()):
    meta, data = fcsparser.parse(path, reformat_meta=True)
    del data['Time']
    del data['FSC-H']
    del data['SSC-H']
    data = transform_FCS_data(data.values)
    %time u = fit.transform(data)
    
    print('UMAP, Treg tube, ' + patient_Treg_samples['Date'].to_list()[i])
    with sns.axes_style("white"):
        sns.jointplot(u[:,0], u[:,1], kind="hex", color="k", bins=100);
    plt.show()
    

In [None]:
for i, path in enumerate(patient_Treg_samples['File_Path'].to_list()):
    meta, data = fcsparser.parse(path, reformat_meta=True)
    del data['Time']
    del data['FSC-H']
    del data['SSC-H']
    data = transform_FCS_data(data.values)
    %time u = fit.transform(data)
    

    #Plot it as a hexbin heatmap.
    figure(num=None, figsize=(4, 3), dpi=150, facecolor='w', edgecolor='k')
    plt.hexbin(u[:,0], u[:,1], gridsize=(100,100))
    plt.colorbar()
    plt.title('UMAP, Treg tube, ' + patient_Treg_samples['Date'].to_list()[i])
    plt.show()



In [None]:
# Approach 2: make the embedding from some cells of each sample instead of from a
# single file.  The first 50000 events of every Treg file are transformed and
# stacked row-wise into one array; the other cells then fit on it and plot.
data_list = []
for path in patient_Treg_samples['File_Path'].to_list():
    meta, data = fcsparser.parse(path, reformat_meta=True)
    # Drop the channels that are not used for the embedding.
    for unused_channel in ('Time', 'FSC-H', 'SSC-H'):
        del data[unused_channel]
    print(data.shape)
    data = transform_FCS_data(data.values[0:50000, :])
    data_list.append(data)

data_concatenated = np.vstack(data_list)
print(data_concatenated.shape)


In [None]:
fit = umap.UMAP()
%time u = fit.fit_transform(data_concatenated)

#umap.plot.points(u)
#plt.show()

#%time u = fit.transform(data)

#
with sns.axes_style("white"):
    sns.jointplot(u[:,0], u[:,1], kind="hex", color="k", bins=100);
  

In [None]:
# Hexbin heatmap of the current embedding coordinates in `u`.
plt.figure(figsize=(4, 3), dpi=150, facecolor='w', edgecolor='k')
embedding_x, embedding_y = u[:, 0], u[:, 1]
plt.hexbin(embedding_x, embedding_y, gridsize=(100, 100), cmap='inferno')
plt.colorbar()
plt.title('UMAP embedding, Treg')
plt.show()



In [None]:
#Just plot the hexbins
for i, path in enumerate(patient_Treg_samples['File_Path'].to_list()):
    meta, data = fcsparser.parse(path, reformat_meta=True)
    del data['Time']
    del data['FSC-H']
    del data['SSC-H']
    data = transform_FCS_data(data.values)
    %time u = fit.transform(data)
    
    print('UMAP, Treg tube, ' + patient_Treg_samples['Date'].to_list()[i])
    with sns.axes_style("white"):
        sns.jointplot(u[:,0], u[:,1], kind="hex", color="k", bins=150);
    plt.show()
    
    figure(num=None, figsize=(4, 3), dpi=150, facecolor='w', edgecolor='k')
    plt.hexbin(u[:,0], u[:,1], gridsize=(100,100), cmap='inferno')
    plt.colorbar()
    plt.title('UMAP, Treg, ' + patient_Treg_samples['Date'].to_list()[i])
    plt.show()