Merge branch 'omnibem' of https://github.com/CancerRxGene/gdsctools i…

…nto omnibem
CancerRxGene · Jun 1, 2016 · 67b5ac3 · 67b5ac3
2 parents 06d129f + 8d37905
commit 67b5ac3
Show file tree

Hide file tree

Showing 6 changed files with 127 additions and 30 deletions.
diff --git a/doc/source/glossary.rst b/doc/source/glossary.rst
@@ -29,6 +29,10 @@ Glossary
         response halfway between the baseline and maximum after a specified
         amount of time. It is used as a measure of drug's potency
 
+    MoBEM
+        Data structure used in GDSC to store genomic information. This is a
+        CSV file with specific header: (SAMPLE, TISSUE, TYPE, GENE)
+
     MSI
         Microsatellite Instability (MSI) are markers indicating the
         presence or absence of a MSI shift, allele

diff --git a/doc/source/references.rst b/doc/source/references.rst
@@ -55,6 +55,11 @@ Readers
     :members:
     :undoc-members:
 
+.. automodule:: gdsctools.omnibem
+    :members:
+    :undoc-members:
+
+
 Visualisation
 ---------------
 Volcano plot

diff --git a/gdsctools/data/README.rst b/gdsctools/data/README.rst
@@ -20,4 +20,8 @@ Here is a short explanation for some of them
   988 cell lines
 
 
+* test_omnibem files are used to test OmniBEM but the files are also
+  scientifically interesting since they contain e.g. Methylation that are not
+  included in other official files such as genomic_features_v17
+
 github-gist.css and highlight.pack.js were found at https://highlightjs.org/
diff --git a/gdsctools/data/test_omnibem_genomic_alterations.csv.gz b/gdsctools/data/test_omnibem_genomic_alterations.csv.gz
diff --git a/gdsctools/omnibem.py b/gdsctools/omnibem.py
@@ -8,30 +8,62 @@
 
 
 class OmniBEMBuilder(object):
-    """Create a BEM (Genomic Feature) file
+    """Utility to create :class:`~gdsctools.readers.GenomicFeatures` instance from BEM data
+
+    Starting from an example provided in GDSCTools
+    (test_omnibem_genomic_alteration.csv.gz), here is the code to get a data
+    structure compatible with the :class:`GenomicFeature`, which can then be
+    used as input to the :class:`~gdsctools.anova.ANOVA` class.
 
     ::
 
-        bem = OmniBEM("data.csv")
-        # optional
+        from gdsctools import gdsctools_data, OmniBEM, GenomicFeatures
+
+        input_data = gdsctools_data("test_omnibem_genomic_alteration.csv.gz")
+        bem = OmniBEM(input_data)
+
+        # You may filter the data for instance to keep only a set of genes
         bem.filter_by_gene_list(gene_list)
-        mobem = bem.get_mobem()
 
-        # Sort genes by number of times they have an alteration
-        bem.unified.sort_values(by="IDENTIFIER")
+        # Finally, create a MoBEM dataframe
+        mobem = bem.get_mobem()
 
+        # You may filter with other functions(e.g., to keep only Methylation
+        # features
+        bem.filter_by_type_list(["Methylation"])
+        mobem = bem.get_mobem()
 
-        # top genes represented multiple times i
-        bem.unified.sort_values(by="IDENTIFIER", ascending=False).head(20)
+        # Then, let us create a dataframe that is compatible with
+        # GenomicFeature. We just need to make sure the columns are correct
+        mobem.columns = [x.replace("TISSUE_TYPE", "TISSUE_FACTOR") for x in mobem.columns]
+        mobem[[x for x in mobem.columns if x!="SAMPLE"]]
+        gf = GenomicFeatures(mobem[[x for x in mobem.columns if x!="SAMPLE"]])
 
 
-    You can filter by, gene, cosmic identifiers, sample name, type of
-    alterations.
+        # Now, we can perform an ANOVA analysis:
+        an = ANOVA(ic50_test, gf)
+        results = an.anova_all()
+        results.volcano()
 
+    .. note:: The underlying data is stored in the attribute :attr:`df`.
 
-    - merge features or not.
     """
     def __init__(self, genomic_alteration):
+        """.. rubric:: Constructor
+
+
+        The input must be a CSV file (gzipped or not) with the following
+        columns::
+
+            COSMIC_ID:
+            TISSUE_TYPE: e.g. Methylation,
+            SAMPLE: this should correspond to the COSMID_ID
+            GENE:
+            IDENTIFIER: required but may be removed (rows can be used as
+                identifier indeed
+
+
+        """
         self.df = Reader(genomic_alteration).df
         # Note that if we keep NA, the groupby will not work as expected.
         self.df.fillna("", inplace=True)
@@ -40,6 +72,12 @@ def __init__(self, genomic_alteration):
         self._update_unified()
 
     def _update_unified(self):
+        """# Sort genes by number of times they have an alteration
+        bem.unified.sort_values(by="IDENTIFIER")
+
+        # top genes represented multiple times i
+        bem.unified.sort_values(by="IDENTIFIER", ascending=False).head(20)
+        """
         # In R this is an aggregate function. In Pandas a groupby + aggregate
         # http://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html
         groups = self.df.groupby(by=["COSMIC_ID", "GENE", "SAMPLE",
@@ -55,14 +93,19 @@ def _update_unified(self):
         df.columns = ['total', 'grouped', 'fraction']
         self.summary = df
 
-    def get_mobem(self, minimum_gene=3):
+    def get_mobem(self):
+        """Return a dataframe compatible with ANOVA analysis
+
+
+
+        """
         # Select gene that appear at least a minimum number of times
-        agg = self.unified.groupby("GENE")["GENE"].count()
-        self.selection = agg[agg>=minimum_gene]
+        #agg = self.unified.groupby("GENE")["GENE"].count()
+        #self.selection = agg[agg>=minimum_gene]
 
         # keep only gene in the selection
-        df = self.unified.query("GENE in @self.selection.index")
-
+        #df = self.unified.query("GENE in @self.selection.index")
+        df = self.unified
         this = pd.crosstab(df['GENE'], columns=[
             df["COSMIC_ID"], df['TISSUE_TYPE'], df["SAMPLE"]])
         this = this.T
@@ -73,37 +116,60 @@ def get_significant_genes(self, N=20):
         df = self.unified.groupby("GENE")["GENE"]
         return df.count().sort_values(ascending=False).head(N)
 
-    def filter_by_gene_list(self, genes):
-        """
-
-        :param genes: a list of genes or a filename
+    def filter_by_gene_list(self, genes, minimum_gene=3):
+        """Keeps only required genes
 
+        :param genes: a list of genes or a filename (a CSV file with a column
+            named GENE).
+        :param minimum_gene: genes with not enough entries are remove
+            (defaults to 3)
 
         """
         if isinstance(genes, str):
             df = pd.read_csv(genes)
             genes = df.GENE.values.tolist()
         self.df = self.df.query("GENE in @genes")
         # Update unified matrix
-        self._update_unified()
+        agg = self.unified.groupby("GENE")["GENE"].count()
+        self.selection = agg[agg>=minimum_gene]
+
+        # keep only gene in the selection
+        self.unified = self.unified.query("GENE in @self.selection.index")
+        #self._update_unified()
 
     def filter_by_tissue_list(self, tissue_list):
         self.df = self.df.query("TISSUE_TYPE in @tissue_list")
         self._update_unified()
 
     def filter_by_type_list(self, type_list):
+        """Keeps only required type"""
         self.df = self.df.query("TYPE in @type_list")
         self._update_unified()
 
     def filter_by_sample_list(self, sample_list):
+        """Filter data to keep sample in sample_list"""
         self.df = self.df.query("SAMPLE in @sample_list")
         self._update_unified()
 
     def filter_by_cosmic_list(self, cosmic_list):
+        """Filter data to keep cosmic ids in cosmic_list"""
         self.df = self.df.query("COSMIC_ID in @cosmic_list")
         self._update_unified()
 
     def plot_number_alteration_by_tissue(self):
+        """Plot number of alterations
+
+        .. plot::
+            :width: 50%
+            :include-source:
+
+            from gdsctools import *
+            data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz")
+            bem = OmniBEMBuilder(data)
+            bem.filter_by_gene_list(gdsctools_data("test_omnibem_genes.txt"))
+            bem.plot_number_alteration_by_tissue()
+
+        """
         count = self.unified.groupby(['TISSUE_TYPE'])['GENE'].count()
         count.sort_values(inplace=True, ascending=False)
         count.plot(kind="bar")
@@ -114,9 +180,22 @@ def plot_number_alteration_by_tissue(self):
         except:pass
 
     def plot_alterations_per_cellline(self):
+        """Plot number of alterations
+
+        .. plot::
+            :width: 50%
+            :include-source:
+
+            from gdsctools import *
+            data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz")
+            bem = OmniBEMBuilder(data)
+            bem.filter_by_gene_list(gdsctools_data("test_omnibem_genes.txt"))
+            bem.plot_alterations_per_cellline()
+        """
         df = self.summary.sort_values("fraction", ascending=False)
         df.plot(y="fraction", legend=False, kind='bar', fontsize=10, width=0.8)
         pylab.ylabel("Alterations per cell line")
+        pylab.grid()
         try:pylab.tight_layout()
         except:pass
 

diff --git a/test/gdsctools/test_omnibem.py b/test/gdsctools/test_omnibem.py
@@ -1,36 +1,41 @@
 from gdsctools.omnibem import OmniBEMBuilder
 from gdsctools import gdsctools_data
 
-omnibem_data = gdsctools_data("test_omnibem_genomic_alteration.csv.gz")
+omnibem_data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz")
 omnibem_genes = gdsctools_data("test_omnibem_genes.txt")
 
 
 def test_omnibem():
     ob = OmniBEMBuilder(omnibem_data)
     ob.filter_by_gene_list(omnibem_genes)
     mobem = ob.get_mobem()
-    assert mobem[mobem.columns[3:]].sum().sum() == 4971
+    assert mobem[mobem.columns[3:]].sum().sum() == 57678
 
     #
     ob.plot_number_alteration_by_tissue()
     ob.plot_alterations_per_cellline()
     ob.get_significant_genes()
 
 
+    # filter by cosmic id
+    ob = OmniBEMBuilder(omnibem_data)
+    ob.filter_by_cosmic_list([910916])
+    mobem = ob.get_mobem()
+    assert mobem.shape == (1,105)
+    assert mobem.ix[0,3:].sum() == 102
+
+
     ob = OmniBEMBuilder(omnibem_data)
-    ob.filter_by_gene_list(omnibem_genes)
     ob.filter_by_type_list(["Methylation"])
     mobem = ob.get_mobem()
-    assert mobem[mobem.columns[3:]].sum().sum() == 127
+    assert mobem[mobem.columns[3:]].sum().sum() == 12964
 
     ob = OmniBEMBuilder(omnibem_data)
-    ob.filter_by_gene_list(omnibem_genes)
     ob.filter_by_tissue_list(["HNSC"])
     mobem = ob.get_mobem()
-    assert mobem[mobem.columns[3:]].sum().sum() == 115
+    assert mobem[mobem.columns[3:]].sum().sum() == 4124
 
     ob = OmniBEMBuilder(omnibem_data)
-    ob.filter_by_gene_list(omnibem_genes)
     ob.filter_by_sample_list(["SNU-423"])
-    #mobem = ob.get_mobem()
-    #assert mobem[mobem.columns[3:]].sum().sum() == 4971
+    mobem = ob.get_mobem()
+    assert mobem[mobem.columns[3:]].sum().sum() == 63