# Downloading & Preprocessing data


In [None]:
# Downloading raw data in data/raw folder
# Data is downloaded from NIH website -> GEO dataset -> Gene Expression Omnibus repository (curated gene expression DataSets
!bash ../data/download_data.sh 

In [1]:
import sys
path_to_module = '../src/data_preprocess.py'
path_to_module2 = '../src/graph_construction.py'
import os
sys.path.append(os.path.dirname(path_to_module))
sys.path.append(os.path.dirname(path_to_module2))
import data_preprocess,graph_construction
import pandas as pd


In [2]:
path_ctrl_data="../data/raw/ctrl_data.txt.gz" 
path_t2d_data="../data/raw/t2d_data.txt.gz" 
df_ctrl=data_preprocess.read_rawdata(path_ctrl_data)
df_t2d=data_preprocess.read_rawdata(path_t2d_data)

Shape check [#genes x # cells] : (57980, 1081)
Shape check [#genes x # cells] : (57980, 1081)


In [3]:
df_ctrl.head()

Unnamed: 0,ERR1630014,ERR1630015,ERR1630016,ERR1630017,ERR1630020,ERR1630021,ERR1630022,ERR1630023,ERR1630024,ERR1630025,...,ERR1632736,ERR1632739,ERR1632743,ERR1632744,ERR1632747,ERR1632750,ERR1632753,ERR1632756,ERR1632757,ERR1632758
TSPAN6,105,0,0,0,0,72,145,0,0,0,...,0,4,7,0,2,1,0,0,60,40
TNMD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DPM1,0,0,0,0,0,0,238,111,0,0,...,3,8,4,2,0,3,2,7,27,15
SCYL3,0,0,0,0,0,110,0,0,0,0,...,0,0,7,0,0,0,0,5,0,6
C1orf112,0,0,0,0,0,0,14,0,0,0,...,0,0,0,0,0,0,0,8,0,1


In [4]:
print(df_ctrl.equals(df_t2d)) #error in downloading files

True


## Preprocess

In [4]:
# Filtering

# Dead cells or bad quality (if expressed genes < (500-100))  (https://www.biorxiv.org/content/10.1101/483297v1.full)
df_filtered=data_preprocess.filter_genes_expressed(df_ctrl,1500) # chosen 500

# Filter non informative genes --> remove genes expressed in very few cells ( < 3)

df_filtered=data_preprocess.filter_genes_not_informative(df_filtered,1)


Cells remained after filtering: 1069
Genes remained after filtering: 41705


In [5]:
df_norm=data_preprocess.normalize(df_filtered,1e4)
df_reduced=data_preprocess.reduce_dimentionality(df_norm,50)

count    1069.00
mean     3040.41
std       515.90
min       474.00
25%      2744.78
50%      3060.82
75%      3387.00
max      4369.72
dtype: float64
New dimensions:  (1069, 50)


## Graph construction

In [6]:
# Nodes: Cells
# Edges: connect cells with similar expression profile
# use Pearson correlation to calculate correlation between cells. Threshold =0.3
# Steps: 1. Matrix correlation, 2. Apply threshold to detect cell connectivity (simlarity). 3. Check diagonal is 0

adj_matrix=graph_construction.adjacent_matrix(df_reduced,0.1)
G=graph_construction.createGraph(adj_matrix,df_reduced)


Dimension of matrix correlation: (50, 50)
Threshold 0.1: around 0 graph connections
Data(x=[1069, 50], edge_index=[2, 0])


In [None]:
# Visualize graph
