In [1]:
from scipy.spatial.distance import pdist, squareform
from itertools import combinations
from mpl_toolkits.mplot3d import Axes3D
from skimage.measure import compare_mse as mse
from skimage.measure import compare_ssim as ssim

import re
import os
import cv2
import time
import tarfile
import subprocess
import numpy as np
import pandas as pd
import ringity as rng
import urllib.request
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import xml.etree.ElementTree as ET

%matplotlib inline

# Lipids

In [2]:
# download
# Run the project's shell script for the lipid data set (presumably it fetches
# the spreadsheet read by the next cell - the script itself is not visible here).
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# download stops the notebook here instead of surfacing later as a missing file.
subprocess.run(['executables/get_lipid_data.sh'], check=True);

In [3]:
# process
# Read the lipid spreadsheet and compute the row-by-row correlation matrix.
lipid_xlsx = 'data/lipids/1-s2.0-S0092867415006418-mmc3.xlsx'

usecols  = range(1,11)   # spreadsheet columns 1..10; the first of them becomes the index
skiprows = [168]         # 0-based sheet row(s) to drop while parsing

read_options = dict(header    = 1,
                    index_col = 0,
                    usecols   = usecols,
                    skiprows  = lambda x : x in skiprows)
df = pd.read_excel(lipid_xlsx, **read_options)

# np.corrcoef treats each row of its input as one variable.
C = np.corrcoef(df.values)

In [4]:
# save data
# Write the lipid correlation matrix as plain text, six decimals per entry.
lipid_corr_file = 'data/lipids/lipid_corr.txt'
np.savetxt(lipid_corr_file, C, fmt='%1.6f')

# Genes 
<center><h3 style="color:darkred"> >>> CAUTION - Big Data! <<<  </h3></center>

In [5]:
# download
# Run the project's shell script for the gene data set (script contents are
# not visible here). check=True raises CalledProcessError on a non-zero exit
# status so a failed download of this large data set is noticed immediately.
subprocess.run(['executables/get_gene_data.sh'], check=True);

In [6]:
# unzip
# Run the project's unpack script for the gene archive. check=True raises
# CalledProcessError on a non-zero exit status, so a failed or partial
# extraction is reported here rather than by the read_csv call downstream.
subprocess.run(['executables/unzip_gene_data.sh'], check=True);

In [7]:
# process
# Load the space-separated gene matrix (first row = header, first column =
# row labels) and compute the row-by-row correlation matrix.
# NOTE(review): the input file is already called a correlation matrix, and
# np.corrcoef is applied on top of it - confirm that this is intended.
gene_matrix_file = 'data/genes/circadiaNET_correlation_matrices/arabidopsis_thaliana_correlation_matrix.txt'
df = pd.read_csv(gene_matrix_file, sep=' ', header=0, index_col=0)
C = np.corrcoef(df.values)

In [8]:
# save
# Write the gene correlation matrix as plain text, six decimals per entry.
gene_corr_file = 'data/genes/gene_corr.txt'
np.savetxt(gene_corr_file, C, fmt='%1.6f')

# Cells

In [9]:
# process
# Load one sheet of the cell workbook and turn the per-row 3D positions into
# a dense, symmetric matrix of pairwise distances.
name = 'FLS18 TNF'
position_columns = ['Position X','Position Y','Position Z']

df = pd.read_excel('data/cells/connectivity_FLS11_data.xls', sheet_name=name)

condensed = pdist(df[position_columns])   # pdist's default metric is Euclidean
D = squareform(condensed)                 # condensed vector -> square matrix

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw_data/cells/connectivity_FLS11_data.xls'

In [34]:
# save
# Write the cell distance matrix as plain text (np.savetxt's default format).
cell_distance_file = 'data/cells/cell_D.txt'
np.savetxt(cell_distance_file, D)

NameError: name 'D' is not defined

# Soil GIF

In [35]:
# download
# Run the project's shell script for the soil data (script contents are not
# visible here). check=True raises CalledProcessError on a non-zero exit
# status instead of silently continuing without the data.
subprocess.run(['executables/get_soil_data.sh'], check=True);

In [36]:
# extract frames
# Run the project's frame-extraction script (presumably it fills
# data/soil/frames, which the processing cell reads - not verifiable here).
# check=True raises CalledProcessError on a non-zero exit status so the
# hours-long comparison below is never started on a failed extraction.
subprocess.run(['executables/extract_frames.sh'], check=True);

### This might take a while...    
&nbsp;&nbsp;&nbsp;&nbsp;(~4h on my computer)

In [39]:
# process
# Compare every unordered pair of soil frames and fill two symmetric n x n
# matrices:
#   D_mse[a, b]  - mean squared error between the two grayscale frames
#   C_ssim[a, b] - structural similarity between the two grayscale frames
# The work is quadratic in the number of frames.
path = 'data/soil/frames'
pic_list = sorted(file for file in os.listdir(path) if file.endswith('jpg'))

n = len(pic_list)
n_pairs = n*(n-1)//2   # number of unordered pairs; drives the progress percentage

D_mse  = np.zeros([n,n])   # diagonal (frame vs. itself) stays 0
C_ssim = np.ones([n,n])    # diagonal (frame vs. itself) stays 1

# Decode and convert every frame exactly once instead of once per pair.
# This keeps all grayscale frames in memory at the same time.
nodes = {}   # file name -> matrix index
gray  = {}   # file name -> 2D grayscale image
for jpg in pic_list:
    # Matrix index = the number embedded in the file name, minus one.
    # int(*...) raises if the name holds no digit group or more than one.
    nodes[jpg] = int(*re.findall(r'\d+', jpg)) - 1
    # matplotlib returns colour images in RGB channel order, so the matching
    # OpenCV conversion is RGB2GRAY. The previous BGR2GRAY swapped the red and
    # blue weights; matrices saved by older runs therefore differ slightly.
    gray[jpg] = cv2.cvtColor(mpimg.imread(f'{path}/{jpg}'), cv2.COLOR_RGB2GRAY)

t1 = time.time()

for i, (jpg_a, jpg_b) in enumerate(combinations(pic_list, 2), 0):

    if i%1000==0:
        t2 = time.time()
        # Percentage of pairs done, derived from the actual number of frames.
        print(f'{100*i/n_pairs:.3f}% - {t2-t1:.3f}sec')

    node_a, node_b = nodes[jpg_a], nodes[jpg_b]
    img_a, img_b = gray[jpg_a], gray[jpg_b]

    # Both measures are stored symmetrically.
    D_mse[node_a, node_b] = mse( img_a, img_b)
    D_mse[node_b, node_a] = D_mse[node_a, node_b]

    C_ssim[node_a, node_b] = ssim(img_a, img_b)
    C_ssim[node_b, node_a] = C_ssim[node_a, node_b]

In [43]:
# save data
# Write both soil comparison matrices as plain text, six decimals per entry.
soil_outputs = (
    ('data/soil/soil_gray_mse.txt' , D_mse ),
    ('data/soil/soil_gray_ssim.txt', C_ssim),
)
for soil_file, soil_matrix in soil_outputs:
    np.savetxt(soil_file, soil_matrix, fmt='%1.6f')

# IFN

In [10]:
rootDir = "homo_sapiens.sbml"
# If SBML files are not yet in the working directory
if rootDir not in os.listdir():
    # Download all human reactions from Reactome in SBML format
    archive = "/tmp/reactome_smbl.tgz"
    urllib.request.urlretrieve("https://reactome.org/download/current/homo_sapiens.3.1.sbml.tgz", archive)
    # The with-block closes the archive even if extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # acceptable only as long as the download source is trusted.
    with tarfile.open(archive) as tar:
        tar.extractall(rootDir)

In [11]:
# Parse one Reactome pathway (SBML) into a directed graph G with an edge from
# every reactant to every product of each reaction.
pathwayFile = 'R-HSA-913531.sbml'

# SBML Level 3 Version 1 core namespace in ElementTree's "{uri}tag" notation.
sbmlNS = "{http://www.sbml.org/sbml/level3/version1/core}"

sbml = ET.parse(rootDir + '/' + pathwayFile)
model = sbml.getroot().find(sbmlNS + "model")
reactions = model.find(sbmlNS + "listOfReactions")

# List species annotated as "simple chemical" to remove them from networks, 
# to avoid creating star structures with the most common small molecules
smallMolec = {term.attrib['id'] for term in model.find(sbmlNS + "listOfSpecies")
              if term.attrib.get('sboTerm') == "SBO:0000247"}

pathwayName = model.attrib['name']
pathwayID = model.attrib['id']

G = nx.DiGraph() 

# For each reaction in the pathway
for reaction in reactions:
    products = reaction.find(sbmlNS + "listOfProducts")
    reagents = reaction.find(sbmlNS + "listOfReactants")
    # find() returns None when the list element is missing; len() == 0 covers
    # a list element that is present but empty. Written out explicitly because
    # truth-testing an Element is deprecated.
    if products is None or reagents is None or len(products) == 0 or len(reagents) == 0:
        # A reaction without products or reactants contributes no edges, but
        # the reactions after it still must be processed. The previous `break`
        # silently dropped every later reaction of the pathway.
        continue
    products = {product.attrib['species'] for product in products if product.attrib['species'] not in smallMolec}
    reagents = {reagent.attrib['species'] for reagent in reagents if reagent.attrib['species'] not in smallMolec}
    # Add edge from reagents to products
    G.add_edges_from([(r,p) for r in reagents for p in products])

In [12]:
# Save the pathway graph as an edge list, using write_edgelist's default
# delimiter and edge-data settings.
ifn_edgelist_file = 'data/IFNs/IFN_edgelist.csv'
nx.write_edgelist(G, ifn_edgelist_file)

# Arctic GIF

# Temperatures

# Image Processing (?)

# Watts-Strogatz

In [47]:
# generate data with time stamps
# Draw 7 Watts-Strogatz graphs for the parameters below, compute a ringity
# diagram for each, and store every diagram in its own CSV file named after
# the current time in microseconds.
N = 100
k = 6
p = 1.0

# The output folder encodes the parameters and is the same for every sample.
# Create it up front so a fresh checkout does not fail on the first save.
# (Whether rng.save_dgm creates missing folders itself is not visible here;
# exist_ok=True makes this harmless either way.)
out_dir = f'data/watts_strogatz/dgms/N{N}/k{k}/p{p:.6f}'
os.makedirs(out_dir, exist_ok=True)

for i in range(7):
    W = nx.watts_strogatz_graph(N,k,p)
    dgm = rng.diagram(W, induce=True)
    name = str(int(time.time()*10**6))+'.csv'
    rng.save_dgm(dgm, f'{out_dir}/{name}')

FileNotFoundError: [Errno 2] No such file or directory: '/usr/local/lib/python3.6/dist-packages/ringity/ripser/1565610208.5510364.csv'

In [19]:
# collect diagrams to a single score sheet
# Result: one column per rewiring probability p, one row per stored diagram,
# each cell holding that diagram's GGS value. Columns with fewer diagrams are
# padded with NaN by the column-wise concat.
N = 1024
k = 8
    
path = f'data/watts_strogatz/dgms/N{N}/k{k}'
# Sub-folders are named 'p<value>'; strip the leading 'p' to get the number.
p_list = sorted([float(file[1:]) for file in os.listdir(path) if file.startswith('p')])

df = pd.DataFrame()

for p in p_list:
    p_dir = f'{path}/p{p:.6f}'
    # sorted(): os.listdir returns entries in arbitrary order, which made the
    # row order of the saved sheet differ between runs and machines.
    scores_tmp = [rng.load_dgm(fname=f'{p_dir}/{file}').GGS
                  for file in sorted(os.listdir(p_dir))
                  if file.endswith('.csv')]

    df_tmp = pd.DataFrame({p:scores_tmp})
    df = pd.concat([df,df_tmp], axis=1)

# Make sure the target folder exists before writing the score sheet.
out_dir = f'data/watts_strogatz/GGS/N{N}/k{k}'
os.makedirs(out_dir, exist_ok=True)
df.to_csv(f'{out_dir}/GGS.csv')

# Erdos-Renyi

In [17]:
# Collect the Erdos-Renyi diagrams into a single score sheet: one column per
# edge probability p, one row per stored diagram, each cell holding that
# diagram's GGS value. Columns with fewer diagrams are padded with NaN by the
# column-wise concat.
N = 2**8
    
path = f'data/erdos_renyi/ER_annealing_10000/dgms/N{N}'
# Sub-folders are named 'p<value>'; strip the leading 'p' to get the number.
p_list = sorted([float(file[1:]) for file in os.listdir(path) if file.startswith('p')])

df = pd.DataFrame()

for p in p_list:
    p_dir = f'{path}/p{p:.6f}'
    # sorted(): os.listdir returns entries in arbitrary order, which made the
    # row order of the saved sheet differ between runs and machines.
    scores_tmp = [rng.load_dgm(fname=f'{p_dir}/{file}').GGS
                  for file in sorted(os.listdir(p_dir))
                  if file.endswith('.csv')]

    df_tmp = pd.DataFrame({p:scores_tmp})
    df = pd.concat([df,df_tmp], axis=1)

# Make sure the target folder exists before writing the score sheet.
out_dir = 'data/erdos_renyi/ER_annealing_10000/GGS'
os.makedirs(out_dir, exist_ok=True)
df.to_csv(f'{out_dir}/{N}.csv')