Skip to content

Commit

Permalink
Merge branch 'omnibem' of https://github.com/CancerRxGene/gdsctools i…
Browse files Browse the repository at this point in the history
…nto omnibem
  • Loading branch information
elisabeth-chen committed Jun 1, 2016
2 parents 06d129f + 8d37905 commit 67b5ac3
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 30 deletions.
4 changes: 4 additions & 0 deletions doc/source/glossary.rst
Expand Up @@ -29,6 +29,10 @@ Glossary
response halfway between the baseline and maximum after a specified
amount of time. It is used as a measure of drug's potency

MoBEM
Data structure used in GDSC to store genomic information. This is a
CSV file with specific header: (SAMPLE, TISSUE, TYPE, GENE)

MSI
Microsatellite Instability (MSI) are markers indicating the
presence or absence of a MSI shift, allele
Expand Down
5 changes: 5 additions & 0 deletions doc/source/references.rst
Expand Up @@ -55,6 +55,11 @@ Readers
:members:
:undoc-members:

.. automodule:: gdsctools.omnibem
:members:
:undoc-members:


Visualisation
---------------
Volcano plot
Expand Down
4 changes: 4 additions & 0 deletions gdsctools/data/README.rst
Expand Up @@ -20,4 +20,8 @@ Here is a short explanation for some of them
988 cell lines


* test_omnibem files are used to test OmniBEM but the files are also
scientifically interesting since they contain e.g. Methylation that are not
included in other official files such as genomic_features_v17

github-gist.css and highlight.pack.js were found at https://highlightjs.org/
Binary file not shown.
121 changes: 100 additions & 21 deletions gdsctools/omnibem.py
Expand Up @@ -8,30 +8,62 @@


class OmniBEMBuilder(object):
"""Create a BEM (Genomic Feature) file
"""Utility to create :class:`~gdsctools.readers.GenomicFeatures` instance from BEM data
Starting from an example provided in GDSCTools
(test_omnibem_genomic_alteration.csv.gz), here is the code to get a data
structure compatible with the :class:`GenomicFeature`, which can then be
used as input to the :class:`~gdsctools.anova.ANOVA` class.
::
bem = OmniBEM("data.csv")
# optional
from gdsctools import gdsctools_data, OmniBEM, GenomicFeatures
input_data = gdsctools_data("test_omnibem_genomic_alteration.csv.gz")
bem = OmniBEM(input_data)
# You may filter the data for instance to keep only a set of genes
bem.filter_by_gene_list(gene_list)
mobem = bem.get_mobem()
# Sort genes by number of times they have an alteration
bem.unified.sort_values(by="IDENTIFIER")
# Finally, create a MoBEM dataframe
mobem = bem.get_mobem()
# You may filter with other functions(e.g., to keep only Methylation
# features
bem.filter_by_type_list(["Methylation"])
mobem = bem.get_mobem()
# top genes represented multiple times i
bem.unified.sort_values(by="IDENTIFIER", ascending=False).head(20)
# Then, let us create a dataframe that is compatible with
# GenomicFeature. We just need to make sure the columns are correct
mobem.columns = [x.replace("TISSUE_TYPE", "TISSUE_FACTOR") for x in mobem.columns]
mobem[[x for x in mobem.columns if x!="SAMPLE"]]
gf = GenomicFeatures(mobem[[x for x in mobem.columns if x!="SAMPLE"]])
You can filter by, gene, cosmic identifiers, sample name, type of
alterations.
# Now, we can perform an ANOVA analysis:
an = ANOVA(ic50_test, gf)
results = an.anova_all()
results.volcano()
.. note:: The underlying data is stored in the attribute :attr:`df`.
- merge features or not.
"""
def __init__(self, genomic_alteration):
""".. rubric:: Constructor
The input must be a CSV file (gzipped or not) with the following
columns::
COSMIC_ID:
TISSUE_TYPE: e.g. Methylation,
SAMPLE: this should correspond to the COSMID_ID
GENE:
IDENTIFIER: required but may be removed (rows can be used as
identifier indeed
"""
self.df = Reader(genomic_alteration).df
# Note that if we keep NA, the groupby will not work as expected.
self.df.fillna("", inplace=True)
Expand All @@ -40,6 +72,12 @@ def __init__(self, genomic_alteration):
self._update_unified()

def _update_unified(self):
"""# Sort genes by number of times they have an alteration
bem.unified.sort_values(by="IDENTIFIER")
# top genes represented multiple times i
bem.unified.sort_values(by="IDENTIFIER", ascending=False).head(20)
"""
# In R this is an aggregate function. In Pandas a groupby + aggregate
# http://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html
groups = self.df.groupby(by=["COSMIC_ID", "GENE", "SAMPLE",
Expand All @@ -55,14 +93,19 @@ def _update_unified(self):
df.columns = ['total', 'grouped', 'fraction']
self.summary = df

def get_mobem(self, minimum_gene=3):
def get_mobem(self):
"""Return a dataframe compatible with ANOVA analysis
"""
# Select gene that appear at least a minimum number of times
agg = self.unified.groupby("GENE")["GENE"].count()
self.selection = agg[agg>=minimum_gene]
#agg = self.unified.groupby("GENE")["GENE"].count()
#self.selection = agg[agg>=minimum_gene]

# keep only gene in the selection
df = self.unified.query("GENE in @self.selection.index")

#df = self.unified.query("GENE in @self.selection.index")
df = self.unified
this = pd.crosstab(df['GENE'], columns=[
df["COSMIC_ID"], df['TISSUE_TYPE'], df["SAMPLE"]])
this = this.T
Expand All @@ -73,37 +116,60 @@ def get_significant_genes(self, N=20):
df = self.unified.groupby("GENE")["GENE"]
return df.count().sort_values(ascending=False).head(N)

def filter_by_gene_list(self, genes):
"""
:param genes: a list of genes or a filename
def filter_by_gene_list(self, genes, minimum_gene=3):
"""Keeps only required genes
:param genes: a list of genes or a filename (a CSV file with a column
named GENE).
:param minimum_gene: genes with not enough entries are remove
(defaults to 3)
"""
if isinstance(genes, str):
df = pd.read_csv(genes)
genes = df.GENE.values.tolist()
self.df = self.df.query("GENE in @genes")
# Update unified matrix
self._update_unified()
agg = self.unified.groupby("GENE")["GENE"].count()
self.selection = agg[agg>=minimum_gene]

# keep only gene in the selection
self.unified = self.unified.query("GENE in @self.selection.index")
#self._update_unified()

def filter_by_tissue_list(self, tissue_list):
self.df = self.df.query("TISSUE_TYPE in @tissue_list")
self._update_unified()

def filter_by_type_list(self, type_list):
"""Keeps only required type"""
self.df = self.df.query("TYPE in @type_list")
self._update_unified()

def filter_by_sample_list(self, sample_list):
"""Filter data to keep sample in sample_list"""
self.df = self.df.query("SAMPLE in @sample_list")
self._update_unified()

def filter_by_cosmic_list(self, cosmic_list):
"""Filter data to keep cosmic ids in cosmic_list"""
self.df = self.df.query("COSMIC_ID in @cosmic_list")
self._update_unified()

def plot_number_alteration_by_tissue(self):
"""Plot number of alterations
.. plot::
:width: 50%
:include-source:
from gdsctools import *
data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz")
bem = OmniBEMBuilder(data)
bem.filter_by_gene_list(gdsctools_data("test_omnibem_genes.txt"))
bem.plot_number_alteration_by_tissue()
"""
count = self.unified.groupby(['TISSUE_TYPE'])['GENE'].count()
count.sort_values(inplace=True, ascending=False)
count.plot(kind="bar")
Expand All @@ -114,9 +180,22 @@ def plot_number_alteration_by_tissue(self):
except:pass

def plot_alterations_per_cellline(self):
"""Plot number of alterations
.. plot::
:width: 50%
:include-source:
from gdsctools import *
data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz")
bem = OmniBEMBuilder(data)
bem.filter_by_gene_list(gdsctools_data("test_omnibem_genes.txt"))
bem.plot_alterations_per_cellline()
"""
df = self.summary.sort_values("fraction", ascending=False)
df.plot(y="fraction", legend=False, kind='bar', fontsize=10, width=0.8)
pylab.ylabel("Alterations per cell line")
pylab.grid()
try:pylab.tight_layout()
except:pass

Expand Down
23 changes: 14 additions & 9 deletions test/gdsctools/test_omnibem.py
@@ -1,36 +1,41 @@
from gdsctools.omnibem import OmniBEMBuilder
from gdsctools import gdsctools_data

omnibem_data = gdsctools_data("test_omnibem_genomic_alteration.csv.gz")
omnibem_data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz")
omnibem_genes = gdsctools_data("test_omnibem_genes.txt")


def test_omnibem():
ob = OmniBEMBuilder(omnibem_data)
ob.filter_by_gene_list(omnibem_genes)
mobem = ob.get_mobem()
assert mobem[mobem.columns[3:]].sum().sum() == 4971
assert mobem[mobem.columns[3:]].sum().sum() == 57678

#
ob.plot_number_alteration_by_tissue()
ob.plot_alterations_per_cellline()
ob.get_significant_genes()


# filter by cosmic id
ob = OmniBEMBuilder(omnibem_data)
ob.filter_by_cosmic_list([910916])
mobem = ob.get_mobem()
assert mobem.shape == (1,105)
assert mobem.ix[0,3:].sum() == 102


ob = OmniBEMBuilder(omnibem_data)
ob.filter_by_gene_list(omnibem_genes)
ob.filter_by_type_list(["Methylation"])
mobem = ob.get_mobem()
assert mobem[mobem.columns[3:]].sum().sum() == 127
assert mobem[mobem.columns[3:]].sum().sum() == 12964

ob = OmniBEMBuilder(omnibem_data)
ob.filter_by_gene_list(omnibem_genes)
ob.filter_by_tissue_list(["HNSC"])
mobem = ob.get_mobem()
assert mobem[mobem.columns[3:]].sum().sum() == 115
assert mobem[mobem.columns[3:]].sum().sum() == 4124

ob = OmniBEMBuilder(omnibem_data)
ob.filter_by_gene_list(omnibem_genes)
ob.filter_by_sample_list(["SNU-423"])
#mobem = ob.get_mobem()
#assert mobem[mobem.columns[3:]].sum().sum() == 4971
mobem = ob.get_mobem()
assert mobem[mobem.columns[3:]].sum().sum() == 63

0 comments on commit 67b5ac3

Please sign in to comment.