# Proof of concept: *PyGMQL*

In [1]:
import os

import gmql as gl
import numpy as np
import pandas as pd

## Loading a dataset 

In [2]:
# Folder containing the narrowPeak dataset to load.
# The default is the original author's local folder; set the GMQL_INPUT_PATH
# environment variable to point the notebook at a copy of the data elsewhere
# without having to edit this cell.
input_path = os.environ.get(
    "GMQL_INPUT_PATH",
    "/home/luca/Scrivania/GMQL-Python/resources/hg_narrowPeaks/",
)

In [8]:
# Parser object that is handed to the dataset in the next cell.
# The `np_` prefix stands for "narrow peak"; it is unrelated to the `np` numpy alias.
# NOTE(review): the class name suggests it reads narrowPeak region files — confirm
# against the gmql documentation.
np_parser = gl.parsers.NarrowPeakParser()

In [9]:
# Create the dataset object configured with the parser; the location of the data
# is supplied separately through load_from_path.
dataset = gl.GMQLDataset(parser=np_parser)

In [10]:
# Load the data found at `input_path`. The result is rebound to `dataset`, so the
# object created before loading is no longer reachable under this name.
# NOTE(review): judging from the captured log, this step reads both metadata and
# region data and takes tens of seconds — expect a pause when re-running.
dataset = dataset.load_from_path(path=input_path)

2017-03-27 17:03:37,311 - gmql_logger - INFO - loading metadata
2017-03-27 17:03:37,837 - gmql_logger - INFO - parsing metadata
2017-03-27 17:03:37,839 - gmql_logger - INFO - collecting metadata
2017-03-27 17:03:46,511 - gmql_logger - INFO - dataframe construction


100%|██████████| 115/115 [00:38<00:00,  3.05it/s]

2017-03-27 17:04:25,223 - gmql_logger - INFO - loading region data





2017-03-27 17:04:25,806 - gmql_logger - INFO - parsing region data


## Visualize the metadata in a tabular form

In [17]:
# Index labels of the metadata table, kept as a list under `m`.
# Presumably these are the sample identifiers — verify against the loader.
m = dataset.meta_dataset.index.tolist()

# The predicate below is presumably evaluated once per region record, so look the
# identifier up in a set (constant time on average) instead of scanning the list
# on every call.
sample_ids = set(m)

# Keep only the region records whose 'id_sample' also appears in the metadata index.
r = dataset.reg_dataset.filter(lambda sample: sample['id_sample'] in sample_ids)

## Select rows of the metadata based on a logical predicate

In [None]:
# Select the metadata rows for which the predicate holds.
# `in` is a membership test if the 'antibody' value is a list and a substring test
# if it is a plain string — presumably the former; verify against the metadata layout.
filtered_dataset = dataset.meta_select(lambda row: 'CTCF' in row['antibody'])

In [None]:
# Display the head of the filtered metadata (the last expression of a cell is rendered).
filtered_dataset.meta_dataset.head()

## Project metadata based on an attribute list

In [None]:
# Project the filtered metadata onto the listed attributes only.
filtered_proj_data = filtered_dataset.meta_project(['antibody', 'cell'])
# Display the head of the projected metadata.
filtered_proj_data.meta_dataset.head()

### Add a new column

In [None]:
# Add a 'creator' metadata attribute with the value 'luca'.
# NOTE(review): the result is rebound to the same name, so re-running this cell calls
# add_meta on a dataset that already went through it — confirm add_meta tolerates an
# existing 'creator' attribute.
filtered_proj_data = filtered_proj_data.add_meta('creator', 'luca')
# Display the head of the metadata after the addition.
filtered_proj_data.meta_dataset.head()

In [None]:
# Collect the metadata attributes of the current dataset; the result is passed as
# `attr_list` to a later meta_project call.
all_attributes = filtered_proj_data.get_meta_attributes()
# Show the collected attributes.
all_attributes

### Project and also compute new columns based on complex functions

In [None]:
def complex_function(row):
    """Compute the value of a new metadata column from one row of the metadata.

    The 'antibody' and 'cell' entries of the row are each expanded into their
    elements and concatenated into a single list, antibody elements first.
    """
    return [*row['antibody'], *row['cell']]

In [None]:
# Map the name of the new metadata attribute to the function that computes it from a row.
new_attr_dict = {'extended': complex_function}

# Project on the attributes collected so far and add the computed 'extended' attribute.
extended_dataset = filtered_proj_data.meta_project(
    attr_list=all_attributes,
    new_attr_dict=new_attr_dict,
)

In [None]:
# Display the head of the metadata, now including the computed 'extended' attribute.
extended_dataset.meta_dataset.head()

## Example: computing an `age` attribute from two date attributes

In [None]:
from datetime import datetime

# Both dates are written as "day abbreviated-month year" and parsed with the same format.
born_date, death_date = (
    datetime.strptime(text, "%d %b %Y")
    for text in ("30 Nov 1935", "30 Nov 1999")
)

In [None]:
# Attach both dates as metadata attributes, one add_meta call after the other.
example_dataset = (
    filtered_proj_data
    .add_meta('born_date', born_date)
    .add_meta('death_date', death_date)
)

# Refresh the attribute list so that it reflects the dataset with the two dates added.
all_attributes = example_dataset.get_meta_attributes()
all_attributes

In [None]:
def calculate_age(row):
    """Return the span between the row's birth and death dates, expressed in years.

    Only the first element of row['born_date'] and of row['death_date'] is used.
    A year is taken as a flat 365 days, so leap days are not accounted for and
    the result is an approximation.
    """
    born = row['born_date'][0]
    died = row['death_date'][0]
    lifespan = died - born
    return lifespan.days / 365

In [None]:
# Map the name of the new metadata attribute to the function that computes it from a row.
new_attr_dict = {'age': calculate_age}

# Project on the current attributes and add the computed 'age' attribute.
example_dataset = example_dataset.meta_project(
    attr_list=all_attributes,
    new_attr_dict=new_attr_dict,
)

In [None]:
# Display the head of the metadata, now including the computed 'age' attribute.
example_dataset.meta_dataset.head()