# Proof of concept: *PyGMQL*

In [1]:
import os
from datetime import datetime

import gmql as gl
import numpy as np
import pandas as pd

## Loading a dataset 

In [2]:
# Directory holding the narrowPeak samples to load.
# Set the GMQL_INPUT_PATH environment variable to point at your own copy of the
# data; the fallback is the original author's local path, which only exists on
# that machine.
input_path = os.environ.get(
    "GMQL_INPUT_PATH",
    "/home/luca/Scrivania/GMQL-Python/resources/hg_narrowPeaks/",
)

In [3]:
# Parser for tab-delimited BED-like region files.
# chrPos/startPos/stopPos are passed as None — presumably the parser then uses
# its default column positions; TODO confirm against the BedParser documentation.
bed_parser = gl.parsers.BedParser(delimiter="\t", chrPos=None, startPos=None, stopPos=None)

In [4]:
# Create a GMQLDataset configured with the parser above; the samples themselves
# are loaded by the load_from_path call that follows.
dataset = gl.GMQLDataset(parser=bed_parser)

In [5]:
# Load the samples found under input_path.
# NOTE: `dataset` is rebound to the object returned by load_from_path, so from
# here on it refers to the loaded dataset, not the one constructed earlier.
# In the recorded run this step took tens of seconds (see the log below).
dataset = dataset.load_from_path(path=input_path)

2017-03-23 17:18:46,616 - gmql_logger - INFO - loading metadata
2017-03-23 17:18:50,707 - gmql_logger - INFO - parsing metadata
2017-03-23 17:18:50,709 - gmql_logger - INFO - collecting metadata
2017-03-23 17:19:08,794 - gmql_logger - INFO - dataframe construction


100%|██████████| 115/115 [00:38<00:00,  3.03it/s]


## Visualize the metadata in a tabular form

In [6]:
# Preview the first rows of the metadata table: one row per sample id, with the
# attribute values shown as lists in the recorded output.
dataset.meta_dataset.head()

Unnamed: 0_level_0,ID,antibody,antibody_antibodyDescription,antibody_deprecated,antibody_lab,antibody_label,antibody_lots,antibody_orderUrl,antibody_tag,antibody_target,...,treatment_label,treatment_tag,treatment_type,type,url,view,view_description,view_label,view_tag,view_type
id_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9220584259719780958,[512],[E2F6],"[Rabbit polyclonal IgG, epitope corresponding ...",[],"[Farnham, Myers]",[E2F6 (sc-22823)],[D1206 (Myers)],[http://www.scbt.com/datasheet-22823-e2f-6-h-5...,[E2F6],[E2F6],...,[No treatment or prot],[NONE],[control],[narrowPeak],[http://hgdownload.cse.ucsc.edu/goldenPath/hg1...,[Peaks],[Regions of enriched signal in experiment],[Peaks],[PKS],[view]
-9219803729611809025,[1890],[CTCF],[Rabbit polyclonal. Antibody Target: CTCF],[],"[Myers, Hardison, Snyder]",[CTCF (07-729)],[1350637 DAM1472197],[http://www.millipore.com/catalogue/item/07-729],[CTCF],[CTCF],...,[No treatment or prot],[NONE],[control],[narrowPeak],[http://hgdownload.cse.ucsc.edu/goldenPath/hg1...,[Peaks],[Regions of enriched signal in experiment],[Peaks],[PKS],[view]
-9211745969161520790,[1394],[eGFP-HDAC8],[Goat polyclonal. Antibody Target: eGFP-HDAC8],[],[White],[HDAC8 (eGFP-HDAC8)],[],[http://www.genes.uchicago.edu/white.html],[HDAC8e],[eGFP-HDAC8],...,[],[],[],[narrowPeak],[http://hgdownload.cse.ucsc.edu/goldenPath/hg1...,[Peaks],[Regions of enriched signal in experiment],[Peaks],[PKS],[view]
-9210823539931286134,[266],[GABP],"[Mouse monoclonal, GABPa(G-1), IgG1. Antibody ...",[],[Myers],[GABPA (sc-28312)],[F1804],[http://www.scbt.com/product.php?datasheet=28312],[GABP],[GABPA],...,[No treatment or prot],[NONE],[control],[narrowPeak],[http://hgdownload.cse.ucsc.edu/goldenPath/hg1...,[Peaks],[Regions of enriched signal in experiment],[Peaks],[PKS],[view]
-9197969371849812436,[1570],[],[],[],[],[],[],[],[],[],...,[No treatment or prot],[NONE],[control],[narrowPeak],[http://hgdownload.cse.ucsc.edu/goldenPath/hg1...,[Peaks],[Regions of enriched signal in experiment],[Peaks],[PKS],[view]


## Select rows of the metadata based on a logical predicate

In [7]:
# Keep only the samples whose 'antibody' metadata contains 'CTCF'.
# The predicate receives one metadata row; because values are displayed as
# lists (e.g. [CTCF]), `in` presumably tests list membership rather than a
# substring match — TODO confirm.
filtered_dataset = dataset.meta_select(lambda row: 'CTCF' in row['antibody'])

In [11]:
# Number of metadata rows that passed the CTCF filter.
filtered_dataset.meta_dataset.shape[0]

236

## Project metadata based on an attribute list

In [12]:
# Restrict the metadata to the 'antibody' and 'cell' attributes ...
filtered_proj_data = filtered_dataset.meta_project(['antibody', 'cell'])
# ... and preview the projected table.
filtered_proj_data.meta_dataset.head()

Unnamed: 0_level_0,antibody,cell
id_sample,Unnamed: 1_level_1,Unnamed: 2_level_1
-9219803729611809025,[CTCF],[AG04449]
-9120762041249846625,[CTCF],[MCF-7]
-9118037537398139811,[CTCF],[MCF-7]
-8760850962206896694,[CTCF],[MCF-7]
-8556045950597285261,[CTCF],[A549]


### Add a new column

In [13]:
# Attach a 'creator' attribute with the same value for every sample.
# NOTE(review): `filtered_proj_data` is rebound to the result, so re-running
# this cell calls add_meta on a dataset that already has 'creator' —
# presumably harmless, but confirm.
filtered_proj_data = filtered_proj_data.add_meta('creator', 'luca')
# Preview the table with the new column.
filtered_proj_data.meta_dataset.head()

Unnamed: 0_level_0,antibody,cell,creator
id_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9219803729611809025,[CTCF],[AG04449],[luca]
-9120762041249846625,[CTCF],[MCF-7],[luca]
-9118037537398139811,[CTCF],[MCF-7],[luca]
-8760850962206896694,[CTCF],[MCF-7],[luca]
-8556045950597285261,[CTCF],[A549],[luca]


In [15]:
# Collect the metadata attribute names currently in the dataset; they are
# reused below as the attr_list argument of meta_project.
all_attributes = filtered_proj_data.get_meta_attributes()
all_attributes

['antibody', 'cell', 'creator']

### Project and also compute new columns based on complex functions

In [16]:
def complex_function(row):
    """Compute the value of a new metadata column from one metadata row.

    Parameters
    ----------
    row : mapping
        A metadata row providing iterable 'antibody' and 'cell' entries.

    Returns
    -------
    list
        A new list with the row's 'antibody' values followed by its 'cell'
        values.
    """
    antibodies = row['antibody']
    cells = row['cell']
    return [*antibodies, *cells]

In [17]:
# Each key is the name of a new metadata attribute; each value is the function
# that computes it from a metadata row.
new_attr_dict = {'extended': complex_function}

# Keep every current attribute and add the computed 'extended' one.
extended_dataset = filtered_proj_data.meta_project(
    new_attr_dict=new_attr_dict,
    attr_list=all_attributes,
)

## Example: deriving a new attribute from date metadata

In [47]:
# Fixed example dates that the following cells attach to every sample.
# They are built with the datetime constructor instead of strptime("%b"),
# because month-name parsing depends on the active locale.
# `datetime` is imported in the imports cell at the top of the notebook.
born_date = datetime(1935, 11, 30)
death_date = datetime(1999, 11, 30)

In [67]:
# Attach both example dates to every sample, one add_meta call after the other.
example_dataset = (
    filtered_proj_data
    .add_meta('born_date', born_date)
    .add_meta('death_date', death_date)
)

# Refresh the attribute list so it includes the two new date attributes.
all_attributes = example_dataset.get_meta_attributes()
all_attributes

['antibody', 'cell', 'creator', 'born_date', 'death_date']

In [68]:
def calculate_age(row, days_per_year=365):
    """Return the time between a row's birth and death dates, in years.

    Parameters
    ----------
    row : mapping
        A metadata row whose 'born_date' and 'death_date' entries are
        sequences; the first element of each is used and the two must support
        subtraction yielding an object with a ``days`` attribute
        (e.g. ``datetime``).
    days_per_year : int or float, optional
        Number of days counted as one year. The default of 365 keeps the
        original behaviour, which ignores leap days: an exact 64-year span
        (30 Nov 1935 to 30 Nov 1999) yields about 64.04. Pass 365.25 for a
        leap-year-aware approximation.

    Returns
    -------
    float
        Whole days between the two dates divided by ``days_per_year``.
    """
    born = row['born_date'][0]
    died = row['death_date'][0]
    elapsed_days = (died - born).days
    return elapsed_days / days_per_year

In [69]:
# New attribute 'age', computed per metadata row by calculate_age.
new_attr_dict = {'age': calculate_age}

# Keep every attribute listed in all_attributes and add the computed 'age'.
example_dataset = example_dataset.meta_project(
    new_attr_dict=new_attr_dict,
    attr_list=all_attributes,
)

In [71]:
# Preview the metadata, which now includes the computed 'age' column.
example_dataset.meta_dataset.head()

Unnamed: 0_level_0,antibody,cell,creator,born_date,death_date,age
id_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-9219803729611809025,[CTCF],[AG04449],[luca],[1935-11-30 00:00:00],[1999-11-30 00:00:00],64.043836
-9120762041249846625,[CTCF],[MCF-7],[luca],[1935-11-30 00:00:00],[1999-11-30 00:00:00],64.043836
-9118037537398139811,[CTCF],[MCF-7],[luca],[1935-11-30 00:00:00],[1999-11-30 00:00:00],64.043836
-8760850962206896694,[CTCF],[MCF-7],[luca],[1935-11-30 00:00:00],[1999-11-30 00:00:00],64.043836
-8556045950597285261,[CTCF],[A549],[luca],[1935-11-30 00:00:00],[1999-11-30 00:00:00],64.043836
