In [1]:
from __future__ import absolute_import
import utils
import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype , is_numeric_dtype
import plotly.graph_objects as go
from utils import _get_hover_text
import pickle
import dash
import dash_core_components as dcc
import dash_html_components as html
from jupyter_dash import JupyterDash
import os
import sys
from bioservices.kegg import KEGG

## Main functions that draw Manhattan plot

In [10]:
SUGGESTIVE_LINE_LABEL = "suggestive line"
GENOMEWIDE_LINE_LABEL = "genomewide line"


def ManhattanPlot(
        dataframe,
        chrm="CHR",
        bp="BP",
        p="P",
        snp="SNP",
        gene="GENE",
        annotation=None,
        logp=True,
        title="Manhattan Plot",
        showgrid=True,
        xlabel=None,
        ylabel='-log10(p)',
        point_size=5,
        showlegend=True,
        col=None,
        suggestiveline_value= -np.log10(1e-8), #160
        suggestiveline_color='#636efa',
        suggestiveline_width=1,
        genomewideline_value=  -np.log10(5e-8), #100, #
        genomewideline_color='#EF553B',
        genomewideline_width=1,
        highlight=True,
        highlight_color="red",
):
    """Returns a figure for a manhattan plot.
Keyword arguments:
- dataframe (dataframe; required): A pandas dataframe which must contain at
    least the following three columns:
            - the chromosome number
            - genomic base-pair position
            - a numeric quantity to plot such as a p-value or zscore
- chrm (string; default 'CHR'): A string denoting the column name for
    the chromosome. This column must be float or integer. Minimum
    number of chromosomes required is 1. If you have X, Y, or MT
    chromosomes, be sure to renumber these 23, 24, 25, etc.
- bp (string; default 'BP'): A string denoting the column name for the
    chromosomal position.
- p (string; default 'P'): A string denoting the column name for the
    float quantity to be plotted on the y-axis. This column must be
    numeric. It does not have to be a p-value. It can be any numeric
    quantity such as peak heights, Bayes factors, test statistics. If
    it is not a p-value, make sure to set logp = False.
- snp (string; default 'SNP'): A string denoting the column name for
    the SNP names (e.g., rs number). More generally, this column could
    be anything that identifies each point being plotted. For example,
    in an Epigenomewide association study (EWAS), this could be the
    probe name or cg number. This column should be a character. This
    argument is optional, however it is necessary to specify it if you
    want to highlight points on the plot, using the highlight argument
    in the figure method.
- gene (string; default 'GENE'): A string denoting the column name for
    the GENE names. This column could be a string or a float. More
    generally, it could be any annotation information that you want
    to include in the plot.
- annotation (string; optional): A string denoting the column to use
    as annotations. This column could be a string or a float. It
    could be any annotation information that you want to include in
    the plot (e.g., zscore, effect size, minor allele frequency).
- logp (bool; optional): If True, the -log10 of the p-value is
    plotted. It isn't very useful to plot raw p-values; however,
    plotting the raw value could be useful for other genome-wide plots
    (e.g., peak heights, Bayes factors, test statistics, other
    "scores", etc.)
- title (string; default 'Manhattan Plot'): The title of the graph.
- showgrid (bool; default true): Boolean indicating whether gridlines
    should be shown.
- xlabel (string; optional): Label of the x axis.
- ylabel (string; default '-log10(p)'): Label of the y axis.
- point_size (number; default 5): Size of the points of the Scatter
    plot.
- showlegend (bool; default true): Boolean indicating whether legends
    should be shown.
- col (string; optional): A string representing the color of the
    points of the scatter plot. Can be in any color format accepted by
    plotly.graph_objects.
- suggestiveline_value (bool | float; default 8): A value which must
    be either False to deactivate the option, or a numerical value
    corresponding to the p-value at which the line should be drawn.
    The line has no influence on the data points.
- suggestiveline_color (string; default 'grey'): Color of the suggestive
  line.
- suggestiveline_width (number; default 2): Width of the suggestive
    line.
- genomewideline_value (bool | float; default -log10(5e-8)): A boolean
    which must be either False to deactivate the option, or a numerical value
    corresponding to the p-value above which the data points are
    considered significant.
- genomewideline_color (string; default 'red'): Color of the genome-wide
    line. Can be in any color format accepted by plotly.graph_objects.
- genomewideline_width (number; default 1): Width of the genome-wide
  line.
- highlight (bool; default True): turning on/off the highlighting of
    data points considered significant.
- highlight_color (string; default 'red'): Color of the data points
    highlighted because they are significant. Can be in any color
    format accepted by plotly.graph_objects.
    # ...
    Example 1: Random Manhattan Plot
    '''
    dataframe = pd.DataFrame(
        np.random.randint(0,100,size=(100, 3)),
        columns=['P', 'CHR', 'BP'])
    fig = create_manhattan(dataframe, title='XYZ Manhattan plot')
    plotly.offline.plot(fig, image='png')
    '''
    """

    mh = _ManhattanPlot(
        dataframe,
        chrm=chrm,
        bp=bp,
        p=p,
        snp=snp,
        gene=gene,
        annotation=annotation,
        logp=logp
    )

    return mh.figure(
        title=title,
        showgrid=showgrid,
        xlabel=xlabel,
        ylabel=ylabel,
        point_size=point_size,
        showlegend=showlegend,
        col=col,
        suggestiveline_value=suggestiveline_value,
        suggestiveline_color=suggestiveline_color,
        suggestiveline_width=suggestiveline_width,
        genomewideline_value=genomewideline_value,
        genomewideline_color=genomewideline_color,
        genomewideline_width=genomewideline_width,
        highlight=highlight,
        highlight_color=highlight_color
    )


class _ManhattanPlot():

    def __init__(
            self,
            x,
            chrm="CHR",
            bp="BP",
            p="P",
            snp="SNP",
            gene="GENE",
            annotation=None,
            logp=True
    ):
        """
        Keyword arguments:
        - dataframe (dataframe; required): A pandas dataframe which
        must contain at least the following three columns:
            - the chromosome number
            - genomic base-pair position
            - a numeric quantity to plot such as a p-value or zscore
        - chrm (string; default 'CHR'): A string denoting the column name for the
        chromosome.  This column must be float or integer.  Minimum number
        of chromosomes required is 1. If you have X, Y, or MT chromosomes,
        be sure to renumber these 23, 24, 25, etc.
        - bp (string; default 'BP'): A string denoting the column name for the
        chromosomal position.
        - p (string; default 'P'): A string denoting the column name for the
        float quantity to be plotted on the y-axis. This column must be
        numeric. This does not have to be a p-value. It can be any
        numeric quantity such as peak heights, bayes factors, test
        statistics. If it is not a p-value, make sure to set logp = FALSE.
        - snp (string; default 'SNP'): A string denoting the column name for the
        SNP names (e.g. rs number). More generally, this column could be
        anything that identifies each point being plotted. For example, in
        an Epigenomewide association study (EWAS) this could be the probe
        name or cg number. This column should be a character. This
        argument is optional, however it is necessary to specify if you
        want to highlight points on the plot using the highlight argument
        in the figure method.
        - gene (string; default 'GENE'): A string denoting the column name for the
        GENE names. This column could be a string or a float. More
        generally, it could be any annotation information that you want
        to include in the plot.
        - annotation (string; optional): A string denoting the column name for
        an annotation. This column could be a string or a float.  This
        could be any annotation information that you want to include in
        the plot (e.g. zscore, effect size, minor allele frequency).
        - logp (bool; default True): If True, the -log10 of the p-value is
        plotted.  It isn't very useful to plot raw p-values; however,
        plotting the raw value could be useful for other genome-wide plots
        (e.g., peak heights, Bayes factors, test statistics, other
        "scores", etc.).
        Returns:
        - A ManhattanPlot object."""

        # checking the validity of the arguments

        # Make sure you have chrm, bp and p columns and that they are of
        # numeric type
        if chrm not in x.columns.values:
            raise KeyError("Column %s not found in 'x' data.frame" % chrm)
        else:
            if not is_numeric_dtype(x[chrm].dtype):
                raise TypeError("%s column should be numeric. Do you have "
                                "'X', 'Y', 'MT', etc? If so change to "
                                "numbers and try again." % chrm)

        if bp not in x.columns.values:
            raise KeyError("Column %s not found in 'x' data.frame" % bp)
        else:
            if not is_numeric_dtype(x[bp].dtype):
                raise TypeError("%s column should be numeric type" % bp)

        if p not in x.columns.values:
            raise KeyError("Column %s not found in 'x' data.frame" % p)
        else:
            if not is_numeric_dtype(x[p].dtype):
                raise TypeError("%s column should be numeric type" % p)

        # Create a new DataFrame with columns named after chrm, bp, and p.
        self.data = pd.DataFrame(data=x[[chrm, bp, p]])

        if snp is not None:
            if snp not in x.columns.values:
                # Warn if you don't have a snp column
                raise KeyError(
                    "snp argument specified as %s but column not found in "
                    "'x' data.frame" % snp)
            else:
                # If the input DataFrame has a snp column, add it to the new
                # DataFrame
                self.data[snp] = x[snp]

        if gene is not None:
            if gene not in x.columns.values:
                # Warn if you don't have a gene column
                raise KeyError(
                    "gene argument specified as %s but column not found in "
                    "'x' data.frame" % gene)
            else:
                # If the input DataFrame has a gene column, add it to the new
                # DataFrame
                self.data[gene] = x[gene]

        if annotation is not None:
            if annotation not in x.columns.values:
                # Warn if you don't have an annotation column
                raise KeyError(
                    "annotation argument specified as %s but column not "
                    "found in 'x' data.frame" % annotation
                )
            else:
                # If the input DataFrame has a gene column, add it to the new
                # DataFrame
                self.data[annotation] = x[annotation]

        self.xlabel = ""
        self.ticks = []
        self.ticksLabels = []
        self.nChr = len(x[chrm].unique())
        self.chrName = chrm
        self.pName = p
        self.snpName = snp
        self.geneName = gene
        self.annotationName = annotation
        self.logp = logp

        # Set positions, ticks, and labels for plotting

        self.index = 'INDEX'
        self.pos = 'POSITION'

        # Fixes the bug where one chromosome is missing by adding a sequential
        # index column.
        idx = 0
        for i in self.data[chrm].unique():
            idx = idx + 1
            self.data.loc[self.data[chrm] == i, self.index] = int(idx)
        # Set the type to be the same as provided for chrm column
        self.data[self.index] = \
            self.data[self.index].astype(self.data[chrm].dtype)

        # This section sets up positions and ticks. Ticks should be placed in
        # the middle of a chromosome. The new pos column is added that keeps
        # a running sum of the positions of each successive chromosome.
        # For example:
        # chrm bp pos
        # 1   1  1
        # 1   2  2
        # 2   1  3
        # 2   2  4
        # 3   1  5

        if self.nChr == 1:
            # For a single chromosome
            self.data[self.pos] = self.data[bp]
            self.ticks.append(int(len(self.data[self.pos]) / 2.) + 1)
            self.xlabel = "Chromosome %s position" % (self.data[chrm].unique())
            self.ticksLabels = self.ticks
        else:
            # For multiple chromosomes
            lastbase = 0
            for i in self.data[self.index].unique():
                if i == 1:
                    self.data.loc[self.data[self.index] == i, self.pos] = \
                        self.data.loc[self.data[self.index] == i, bp].values
                else:
                    prevbp = self.data.loc[self.data[self.index] == i - 1, bp]
                    # Shift the basepair position by the largest bp of the
                    # current chromosome
                    lastbase = lastbase + prevbp.iat[-1]

                    self.data.loc[self.data[self.index] == i, self.pos] = \
                        self.data.loc[self.data[self.index] == i, bp].values \
                        + lastbase

                tmin = min(self.data.loc[self.data[self.index] == i, self.pos])
                tmax = max(self.data.loc[self.data[self.index] == i, self.pos])
                self.ticks.append(int((tmin + tmax) / 2.) + 1)

            self.xlabel = 'Chromosome'
            self.data[self.pos] = self.data[self.pos].astype(
                self.data[bp].dtype)

#             if self.nChr > 10:  # To avoid crowded labels
#                 self.ticksLabels = [
#                     t if np.mod(int(t), 2)  # Only every two ticks
#                     else ''
#                     for t in self.data[chrm].unique()
#                 ]
#             else:
            self.ticksLabels = self.data[chrm].unique()  # All the ticks

    def figure(
            self,
            title="Manhattan Plot",
            showgrid=True,
            xlabel=None,
            ylabel='-log10(p)',
            point_size=3,
            showlegend=True,
            col=None,
            suggestiveline_value= -np.log10(1e-8), #160, 
            suggestiveline_color='blue',
            suggestiveline_width=1,
            genomewideline_value= -np.log10(1e-6), #100,#
            genomewideline_color='red',
            genomewideline_width=1,
            highlight=True,
            highlight_color="red",
    ):
        """Keyword arguments:
    - title (string; default 'Manhattan Plot'): The title of the
        graph.
    - showgrid (bool; default True): Boolean indicating whether
        gridlines should be shown.
    - xlabel (string; optional): Label of the x axis.
    - ylabel (string; default '-log10(p)'): Label of the y axis.
    - point_size (number; default 5): Size of the points of the
        scatter plot.
    - showlegend (bool; default True): Boolean indicating whether
        legends should be shown.
    - col (string; optional): A string representing the color of the
        points of the Scatter plot. Can be in any color format
        accepted by plotly.graph_objects.
    - suggestiveline_value (bool | float; default 8): A value which
        must be either False to deactivate the option, or a numerical value
        corresponding to the p-value at which the line should be
        drawn. The line has no influence on the data points.
    - suggestiveline_color (string; default 'grey'): Color of the
        suggestive line.
    - suggestiveline_width (number; default 2): Width of the
        suggestive line.
    - genomewideline_value (bool | float; default -log10(5e-8)): A
        boolean which must be either False to deactivate the option, or a
        numerical value corresponding to the p-value above which the
        data points are considered significant.
    - genomewideline_color (string; default 'red'): Color of the
        genome-wide line. Can be in any color format accepted by
        plotly.graph_objects.
    - genomewideline_width (number; default 1): Width of the genome
      wide line.
    - highlight (bool; default True): Whether to turn on or off the
        highlighting of data points considered significant.
    - highlight_color (string; default 'red'): Color of the data
        points highlighted because they are significant. Can be in any
        color format accepted by plotly.graph_objects.
    Returns:
    - A figure formatted for plotly.graph_objects.
        """

        xmin = min(self.data[self.pos].values) # min pos value along x axis
        xmax = max(self.data[self.pos].values) # Max Pos value along x axis

        horizontallines = []

        if suggestiveline_value:
            suggestiveline = go.layout.Shape(
                name=SUGGESTIVE_LINE_LABEL,
                type="line",
                fillcolor=suggestiveline_color,
                line=dict(
                    color=suggestiveline_color,
                    width=suggestiveline_width
                ),
                x0=xmin, x1=xmax, xref="x",
                y0=suggestiveline_value, y1=suggestiveline_value, yref="y"
            )
            horizontallines.append(suggestiveline)

        if genomewideline_value:
            genomewideline = go.layout.Shape(
                name=GENOMEWIDE_LINE_LABEL,
                type="line",
                fillcolor=genomewideline_color,
                line=dict(
                    color=genomewideline_color,
                    width=genomewideline_width
                ),
                x0=xmin, x1=xmax, xref="x",
                y0=genomewideline_value, y1=genomewideline_value, yref="y"
            )
            horizontallines.append(genomewideline)
       
        data_to_plot = []  # List to contain the data traces
        tmp = pd.DataFrame()  # Empty DataFrame to contain the highlighted data
        pathways_tmp_empty = []# hold unique pathways from the temp dataFrame 

        if highlight:
            if not isinstance(highlight, bool):
                if self.snpName not in self.data.columns.values:
                    raise KeyError(
                        "snp argument specified for highlight as %s but "
                        "column not found in the data.frame" % self.snpName
                    )
            else:
                if not genomewideline_value:
                    raise Warning(
                        "The genomewideline_value you entered is not a "
                        "positive value, or False, you cannot set highlight "
                        "to True in that case.")
                tmp = self.data

                # Sort the p-values (or -log10(p-values) above the line
                if genomewideline_value:
                    if self.logp:
                        tmp = tmp.loc[-np.log10(tmp[self.pName])
                                      > genomewideline_value]
                    else:
                        tmp = tmp.loc[tmp[self.pName] > genomewideline_value]

                highlight_hover_text = _get_hover_text(
                    tmp,
                    snpname=self.snpName,
                    genename=self.geneName,
                    annotationname=self.annotationName
                )

                if not tmp.empty:
                    #### Containing the significnat ploting points 
                    data_to_plot.append(
                        go.Scattergl(
                            x=tmp[self.pos].values,
                            y=-np.log10(tmp[self.pName].values) if self.logp
                            else tmp[self.pName].values,
                            mode= "markers",
                            text=highlight_hover_text,
                            marker=dict(
                                color=highlight_color,
                                size=point_size
                            ),
                            name="Point(s) of interest"
                        )
                    )

        # Remove the highlighted data from the DataFrame if not empty
        if tmp.empty:
            data = self.data
        else:
            data = self.data.drop(self.data.index[tmp.index])

        if self.nChr == 1:

            if col is None:
                col = ['black']

            # If single chromosome, ticks and labels automatic.
            layout = go.Layout(
                title=title,
                xaxis={
                    'title': self.xlabel if xlabel is None else xlabel,
                    'showgrid': showgrid,
                    'range': [xmin, xmax],
                },
                yaxis={'title': ylabel},
                hovermode='closest'
            )

            hover_text = _get_hover_text(
                data,
                snpname=self.snpName,
                genename=self.geneName,
                annotationname=self.annotationName
            )

            data_to_plot.append(
                go.Scattergl(
                    x=data[self.pos].values,
                    y=-np.log10(data[self.pName].values) if self.logp
                    else data[self.pName].values,
                    mode="markers",
                    showlegend=showlegend,
                    marker={
                        'color': col[0],
                        'size': point_size,
                        'name': "chr%i" % data[self.chrName].unique()
                    },
                    text=hover_text
                )
            )
        else:
            # if multiple chrms, use the ticks and labels you created above.
            layout = go.Layout(
                title=title,
                xaxis={
                    'title': self.xlabel if xlabel is None else xlabel,
                    'showgrid': showgrid,
                    'range': [xmin, xmax],
                    'tickmode': "array",
                    'tickvals': self.ticks,
                    'ticktext': self.ticksLabels,
                    'ticks': "outside"
                },
                yaxis={'title': ylabel},
                hovermode='closest'
            )
            # Color variable
            icol = 0
            if col is None:
                col = [
                    'black' if np.mod(i, 2)
                    else 'grey' for i in range(self.nChr)
                ]

            for i in data[self.index].unique():

                tmp = data[data[self.index] == i]

                chromo = tmp[self.chrName].unique()  # Get chromosome name

                hover_text = _get_hover_text(
                    data,
                    snpname=self.snpName,
                    genename=self.geneName,
                    annotationname=self.annotationName
                )

                data_to_plot.append(
                    go.Scattergl(
                        x=tmp[self.pos].values,
                        y=-np.log10(tmp[self.pName].values) if self.logp
                        else tmp[self.pName].values,
                        mode="markers",
                        showlegend=showlegend,
                        name="Chr%i" % chromo,
                        marker={
                            'color': col[icol],
                            'size': point_size
                        },
                        text=hover_text
                    )
                )

                icol = icol + 1

        layout.shapes = horizontallines

        return go.Figure(data=data_to_plot, layout=layout)

#### Testing datasets

In [23]:
df = pd.read_csv( 'https://raw.githubusercontent.com/plotly/dash-bio-docs-files/master/' +
                 'manhattan_data.csv')

In [24]:
df.describe()

Unnamed: 0,CHR,BP,P,ZSCORE,EFFECTSIZE,DISTANCE
count,14412.0,14412.0,14412.0,14412.0,14412.0,14412.0
mean,9.24882,78270180.0,0.4959331,0.813815,0.000756,89970.08
std,6.285682,56439740.0,0.288276,0.639327,0.11056,196998.1
min,1.0,92220.0,6.7501e-10,0.0001,-1.7873,-1.0
25%,4.0,31598270.0,0.2483034,0.32725,-0.066925,0.0
50%,8.0,68379400.0,0.4944957,0.6832,0.0002,2783.5
75%,14.0,115426800.0,0.743478,1.154475,0.0675,80041.0
max,23.0,246586400.0,0.9999448,6.1718,2.6549,1968847.0


In [None]:
geneList = df['GENE'].tolist()
geneList = list(dict.fromkeys(geneList))

In [None]:
df 

In [24]:
#fig = ManhattanPlot(df, title='Manhattan plot')
#fig.show()

![](Man_plot.png)

### Reading the GWAS Datasets to a DF 

In [None]:
GWAS_DF2 = pd.read_excel('GWASData\GWAS\hum0014.v3.T2DM-2.v1.xlsx', keep_default_na=False, na_values=[""]) 
GWAS_rename_DF2 = GWAS_DF2.rename(columns={"#SNPID": "SNP",
                        "chr": "CHR",
                        "chrloc　(hg18)":"BP",
                       "ptrend":"P",
                        "maf_total":"annotation"
                       }, errors="raise")

with open('GWAS_rename_DF2.pickle', 'wb') as GWAS_rename_DF_Obj2: # Creating pickle file object
    pickle.dump(GWAS_rename_DF2,GWAS_rename_DF_Obj2) # Dumping the object into the pickle file

In [None]:
GWAS_DF2

##### Deseralising the pickle objec from directory

In [11]:
with open('GWAS_rename_DF2.pickle', 'rb') as GWAS_rename_DF_Obj2:

    GWAS_rename_DF_Ob2 = pickle.load(GWAS_rename_DF_Obj2)

    GWAS_rename_DF_Obj2.close()
    

In [12]:
GWAS_rename_DF_Ob2

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,CHR,BP
0,rs2237896,683,2631,2332,0,3359,9351,6696,14,2.640000e-30,1.289090,1.234070,1.346570,0.353967,0.414021,0.400487,A,G,11,2815016
1,rs2299620,745,2686,2180,35,3516,9448,6294,162,4.070000e-26,1.263870,1.210050,1.320080,0.372126,0.427874,0.415296,A,G,11,2814871
2,rs2237892,653,2599,2393,1,3043,9333,7039,5,5.960000e-23,1.247430,1.193710,1.303560,0.345881,0.397090,0.385555,T,C,11,2796327
3,rs2383208,2163,2663,819,1,6400,9416,3601,3,7.440000e-19,0.823873,0.789288,0.859973,0.380957,0.427924,0.417345,A,G,9,22122076
4,rs234853,2068,2743,827,8,6166,9582,3637,35,1.740000e-17,0.829544,0.794549,0.866081,0.389943,0.434769,0.424669,T,C,11,2807404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479083,rs4244872,838,2669,2139,0,2908,9126,7382,4,9.999920e-01,1.000000,0.957967,1.043880,0.384786,0.384786,0.384786,T,C,12,33684341
479084,rs4693013,2640,2470,534,2,9165,8335,1919,1,9.999930e-01,1.000000,0.955791,1.046250,0.313430,0.313430,0.313430,A,G,4,83134726
479085,rs6988928,83,1142,4421,0,276,3947,15197,0,9.999940e-01,1.000000,0.936889,1.067360,0.115834,0.115834,0.115834,A,G,8,13781245
479086,rs372143,4341,1215,90,0,14916,4208,295,1,9.999970e-01,1.000000,0.938331,1.065720,0.123539,0.123539,0.123539,A,G,14,67657091


In [13]:
GWAS_rename_DF_Ob2.describe()

Unnamed: 0,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,CHR,BP
count,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0,479088.0
mean,1793.8434,1859.663166,1991.130588,1.362846,6167.959431,6400.690276,6846.888221,4.462072,0.4867605,1.000542,0.942809,1.062603,0.246531,0.246535,0.246539,8.783136,77648980.0
std,1667.189797,815.539372,1705.398125,3.816454,5734.408601,2805.966913,5866.219741,12.771642,0.2918356,0.034712,0.039922,0.049094,0.141741,0.141734,0.141732,5.780515,56713480.0
min,0.0,81.0,0.0,0.0,0.0,374.0,0.0,0.0,2.64e-30,0.657528,0.518163,0.76768,0.007173,0.010015,0.009575,1.0,11244.0
25%,289.0,1208.0,394.0,0.0,992.0,4159.0,1355.0,0.0,0.2312057,0.980746,0.926618,1.03361,0.12263,0.12276,0.122736,4.0,30568960.0
50%,1262.0,2058.0,1586.0,0.0,4337.0,7082.0,5450.0,1.0,0.482017,0.999938,0.948604,1.05406,0.241808,0.241798,0.24182,8.0,67898110.0
75%,3079.0,2610.0,3405.0,1.0,10584.0,8988.0,11704.0,3.0,0.739506,1.01952,0.967411,1.07901,0.368402,0.368317,0.368412,13.0,114311800.0
max,5565.0,2955.0,5565.0,56.0,19037.0,9957.0,19037.0,194.0,1.0,1.52085,1.30383,1.92989,0.5,0.5,0.5,22.0,247177300.0


#### Removing all rows with Null value of SNPS

In [14]:
#GWAS_rename_DF.isnull().sum()
#Creaties a series of of null values in SNPS columns  
#gapminder_no_NA = gapminder[gapminder.year.notnull()]
GWAS_rename_DF_bool_series = GWAS_rename_DF_Ob2[GWAS_rename_DF_Ob2['SNP'].isnull()] 
# filtering data  
# displaying data only with Gender = NaN  
#GWAS_rename_DF_Ob2[GWAS_rename_DF_bool_series] 
GWAS_rename_DF_bool_series

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,CHR,BP


In [15]:
# creating bool series True for NaN values  
GWAS_rename_DF_Ob2_bool_series = pd.notnull(GWAS_rename_DF_Ob2["SNP"])  
    
# filtering data  
# displayind data only with Gender = Not NaN  
GWAS_rename_DF_Ob2 = GWAS_rename_DF_Ob2[GWAS_rename_DF_Ob2_bool_series]  
GWAS_rename_DF_Ob2 = GWAS_rename_DF_Ob2.dropna()
GWAS_rename_DF_Ob2 =GWAS_rename_DF_Ob2.drop_duplicates(['SNP'], keep = 'first')
GWAS_rename_DF_Ob2

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,CHR,BP
0,rs2237896,683,2631,2332,0,3359,9351,6696,14,2.640000e-30,1.289090,1.234070,1.346570,0.353967,0.414021,0.400487,A,G,11,2815016
1,rs2299620,745,2686,2180,35,3516,9448,6294,162,4.070000e-26,1.263870,1.210050,1.320080,0.372126,0.427874,0.415296,A,G,11,2814871
2,rs2237892,653,2599,2393,1,3043,9333,7039,5,5.960000e-23,1.247430,1.193710,1.303560,0.345881,0.397090,0.385555,T,C,11,2796327
3,rs2383208,2163,2663,819,1,6400,9416,3601,3,7.440000e-19,0.823873,0.789288,0.859973,0.380957,0.427924,0.417345,A,G,9,22122076
4,rs234853,2068,2743,827,8,6166,9582,3637,35,1.740000e-17,0.829544,0.794549,0.866081,0.389943,0.434769,0.424669,T,C,11,2807404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479083,rs4244872,838,2669,2139,0,2908,9126,7382,4,9.999920e-01,1.000000,0.957967,1.043880,0.384786,0.384786,0.384786,T,C,12,33684341
479084,rs4693013,2640,2470,534,2,9165,8335,1919,1,9.999930e-01,1.000000,0.955791,1.046250,0.313430,0.313430,0.313430,A,G,4,83134726
479085,rs6988928,83,1142,4421,0,276,3947,15197,0,9.999940e-01,1.000000,0.936889,1.067360,0.115834,0.115834,0.115834,A,G,8,13781245
479086,rs372143,4341,1215,90,0,14916,4208,295,1,9.999970e-01,1.000000,0.938331,1.065720,0.123539,0.123539,0.123539,A,G,14,67657091


In [16]:
duplicateRowsDF2 = GWAS_rename_DF_Ob2[GWAS_rename_DF_Ob2.duplicated(['BP'])]
duplicateRowsDF2_droped = GWAS_rename_DF_Ob2.drop_duplicates(subset='BP', keep="first")
#df = df.drop_duplicates('column_name', keep='last')
duplicateRowsDF2

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,CHR,BP
23046,rs7302233,4169,1358,119,0,14052,4942,425,1,0.039738,0.939128,0.884556,0.997067,0.141339,0.149132,0.147377,A,C,12,80976062
30344,rs1323271,898,2692,2055,1,3256,9320,6840,4,0.053046,1.043010,0.999444,1.088470,0.397520,0.407705,0.405411,T,C,9,834487
33436,rs12056091,1457,2854,1335,0,5269,9693,4458,0,0.058940,1.041280,0.998471,1.085930,0.489196,0.479119,0.481389,T,C,7,96596607
40155,rs11082439,117,1284,4244,1,321,4328,14769,2,0.071858,0.945065,0.888668,1.005040,0.134455,0.127974,0.129434,T,C,18,41124301
40160,500301,146,1506,3994,0,482,4949,13988,1,0.071864,0.948827,0.896072,1.004690,0.159228,0.152248,0.153820,A,G,1,224098852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475385,rs944269,3907,1570,168,1,13467,5344,607,2,0.991710,0.999706,0.945764,1.056730,0.168822,0.168864,0.168854,T,C,20,59647962
475498,rs10757447,4011,1474,161,0,13718,5228,474,0,0.991976,1.000290,0.944600,1.059270,0.159051,0.159011,0.159020,A,G,9,23720654
475668,rs6942837,520,2368,2758,0,1801,8122,9497,0,0.992359,1.000220,0.955743,1.046770,0.301807,0.301854,0.301843,T,C,7,47737034
476312,rs6768455,188,1717,3739,2,690,5820,12909,1,0.993736,0.999784,0.947409,1.055060,0.185418,0.185385,0.185393,A,G,3,85123339


In [17]:
duplicateRowsDF2_droped

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,CHR,BP
0,rs2237896,683,2631,2332,0,3359,9351,6696,14,2.640000e-30,1.289090,1.234070,1.346570,0.353967,0.414021,0.400487,A,G,11,2815016
1,rs2299620,745,2686,2180,35,3516,9448,6294,162,4.070000e-26,1.263870,1.210050,1.320080,0.372126,0.427874,0.415296,A,G,11,2814871
2,rs2237892,653,2599,2393,1,3043,9333,7039,5,5.960000e-23,1.247430,1.193710,1.303560,0.345881,0.397090,0.385555,T,C,11,2796327
3,rs2383208,2163,2663,819,1,6400,9416,3601,3,7.440000e-19,0.823873,0.789288,0.859973,0.380957,0.427924,0.417345,A,G,9,22122076
4,rs234853,2068,2743,827,8,6166,9582,3637,35,1.740000e-17,0.829544,0.794549,0.866081,0.389943,0.434769,0.424669,T,C,11,2807404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479083,rs4244872,838,2669,2139,0,2908,9126,7382,4,9.999920e-01,1.000000,0.957967,1.043880,0.384786,0.384786,0.384786,T,C,12,33684341
479084,rs4693013,2640,2470,534,2,9165,8335,1919,1,9.999930e-01,1.000000,0.955791,1.046250,0.313430,0.313430,0.313430,A,G,4,83134726
479085,rs6988928,83,1142,4421,0,276,3947,15197,0,9.999940e-01,1.000000,0.936889,1.067360,0.115834,0.115834,0.115834,A,G,8,13781245
479086,rs372143,4341,1215,90,0,14916,4208,295,1,9.999970e-01,1.000000,0.938331,1.065720,0.123539,0.123539,0.123539,A,G,14,67657091


#### Retrieving rs SNPS

In [18]:
#GWAS_rename_DF.loc[GWAS_rename_DF.str.startswith('rs', na=False)]
duplicateRowsDF2_droped_rs = duplicateRowsDF2_droped[duplicateRowsDF2_droped['SNP'].str.match('rs', na = False)]
#duplicateRowsDF2_droped_rs
#sorting by chromosomes
duplicateRowsDF2_droped_rs_sorted_DF=duplicateRowsDF2_droped_rs.sort_values(by=['CHR'])
duplicateRowsDF2_droped_rs_sorted_DF

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,CHR,BP
310870,rs10159099,3552,1849,245,0,12268,6337,814,1,0.634300,1.012640,0.961567,1.06642,0.207138,0.205083,0.205546,T,C,1,56573324
38939,rs4518905,965,2711,1970,0,3523,9291,6605,1,0.069637,1.039860,0.996864,1.08470,0.410999,0.420645,0.418472,A,G,1,226164963
172262,rs4657733,382,2167,3097,0,1345,7566,10509,0,0.339431,1.023560,0.975807,1.07365,0.259564,0.264058,0.263046,A,G,1,166353058
172254,rs1337444,206,1689,3751,0,644,5785,12991,0,0.339413,0.974084,0.922996,1.02800,0.186061,0.182106,0.182997,A,G,1,163856159
352610,rs761430,38,854,4754,0,148,2943,16327,2,0.724795,1.013720,0.939649,1.09364,0.082359,0.083402,0.083167,T,G,1,29551361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439591,rs4819804,383,2192,3071,0,1380,7394,10645,1,0.913927,0.997388,0.951188,1.04583,0.261955,0.261445,0.261560,A,G,22,17912378
214011,rs2041607,1087,2756,1803,0,3845,9429,6143,3,0.428184,1.017110,0.975310,1.06070,0.436592,0.440825,0.439872,A,G,22,15830515
214030,rs1034420,87,1182,4377,0,261,4036,15123,0,0.428221,0.974218,0.913254,1.03925,0.120085,0.117353,0.117969,T,G,22,33660152
439408,rs5760680,34,783,4825,4,104,2709,16605,2,0.913555,0.995607,0.919526,1.07798,0.075416,0.075111,0.075180,T,C,22,23541793


#### retrieving SNPS without rs

In [19]:
# That dosent strats with rs
#df[~df['col'].str.startswith('t')]
#GWAS_sorted_DF_Not_rs = GWAS_sorted_DF[~GWAS_sorted_DF['SNP'].str.match('rs')]
duplicateRowsDF2_droped_sorted_DF_Not_rs = duplicateRowsDF2_droped[~duplicateRowsDF2_droped['SNP'].str.startswith('rs',na = False)].reset_index(drop = True)
duplicateRowsDF2_droped_sorted_DF_Not_rs

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,CHR,BP
0,200519,637,2433,2576,0,2266,8679,8475,0,0.01956,1.05425,1.00851,1.10206,0.328286,0.340139,0.337469,T,C,6,33274012
1,500694,2845,2315,486,0,10075,7785,1555,5,0.029647,1.05247,1.00506,1.10212,0.291091,0.280582,0.28295,T,C,16,86433796
2,200521,1691,2772,1183,0,5566,9637,4217,0,0.054923,0.959727,0.920275,1.00087,0.455012,0.465268,0.462958,T,C,6,33281976
3,200262,2643,2418,585,0,9302,8207,1910,1,0.104876,1.03777,0.992292,1.08534,0.317747,0.309671,0.31149,T,C,1,20866804
4,200124,2896,2293,457,0,9773,7956,1689,2,0.107534,0.962757,0.919267,1.00831,0.284006,0.291843,0.290077,T,C,7,150564216
5,200260,1204,2800,1639,3,4335,9566,5508,11,0.120311,1.03373,0.991355,1.07792,0.461457,0.469782,0.467907,G,C,1,20865406
6,500556,596,2401,2644,5,2086,8489,8840,5,0.131119,1.03507,0.989771,1.08244,0.318472,0.326062,0.324353,A,G,17,17350246
7,200070,1532,2791,1321,2,5152,9611,4656,1,0.270142,0.976788,0.936852,1.01843,0.481308,0.487229,0.485896,G,C,16,16174099
8,400630,4362,1200,84,0,15144,3994,282,0,0.273082,1.03657,0.972083,1.10533,0.121148,0.117353,0.118208,A,G,11,118530141
9,500715,3545,1867,234,0,12424,6117,875,4,0.334218,1.02564,0.974269,1.07972,0.206784,0.202591,0.203535,A,C,14,22723148


### Sample/Trim the Datasets to be used

In [20]:
DATASET_smaple = duplicateRowsDF2_droped_rs_sorted_DF.sample(n=None, frac=0.1, replace=False, weights=None, random_state=1, axis=0)
#DATASET_smaple
SampleData = DATASET_smaple.groupby(['CHR','BP']).apply(lambda u:u[:])#.reset_index(drop =True) #.head(30000))
#SampleData1 = DATASET_smaple.groupby(['CHR','BP']).apply(lambda u:u)
SampleData1 = SampleData.droplevel(['BP','CHR']).reset_index(drop =True)
#SampleData1 = SampleData.sort_values(by ="CHR").reset_index(drop =True)
# SampleData1
# SampleData1 = SampleData.droplevel('CHR').reset_index(drop =True)
# SampleData1
SampleData1

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,CHR,BP
0,rs4970420,3640,1790,212,4,12501,6165,732,22,0.917874,0.997219,0.945806,1.05143,0.196207,0.196644,0.196546,T,C,1,1096336
1,rs13303016,3472,1925,247,2,11988,6577,850,5,0.793916,1.006870,0.956453,1.05995,0.214298,0.213160,0.213416,A,G,1,1936451
2,rs2254669,26,664,4956,0,76,2403,16941,0,0.367653,1.040220,0.954697,1.13340,0.063408,0.065783,0.065248,A,G,1,1995968
3,rs424079,3536,1861,245,4,11987,6479,927,27,0.139649,0.962079,0.913970,1.01272,0.208348,0.214846,0.213381,A,C,1,2061200
4,rs7512482,4637,951,58,0,15912,3322,185,1,0.856273,0.993432,0.925055,1.06686,0.094492,0.095061,0.094933,T,C,1,2136826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47838,rs11703226,2494,2543,603,6,8787,8502,2098,33,0.332329,1.022280,0.977734,1.06886,0.332358,0.327487,0.328585,T,C,22,49000551
47839,rs5771242,208,1644,3793,1,632,5913,12869,6,0.565587,1.016040,0.962358,1.07271,0.182462,0.184841,0.184305,A,G,22,49003996
47840,rs131816,5172,461,13,0,17706,1657,52,5,0.318233,0.949418,0.857404,1.05131,0.043128,0.045352,0.044851,T,C,22,49304328
47841,rs470119,362,2173,3111,0,1359,7428,10632,1,0.319039,1.024600,0.976775,1.07476,0.256553,0.261239,0.260184,T,C,22,49313780


In [21]:
groupbyDF = DATASET_smaple.groupby(['CHR'])

In [22]:
groupbyDF.count()

Unnamed: 0_level_0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,BP
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,3759,3759,3759,3759,3759,3759,3759,3759,3759,3759,3759,3759,3759,3759,3759,3759,3759,3759,3759
2,3928,3928,3928,3928,3928,3928,3928,3928,3928,3928,3928,3928,3928,3928,3928,3928,3928,3928,3928
3,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355
4,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940
5,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065
6,3304,3304,3304,3304,3304,3304,3304,3304,3304,3304,3304,3304,3304,3304,3304,3304,3304,3304,3304
7,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668
8,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712
9,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392
10,2535,2535,2535,2535,2535,2535,2535,2535,2535,2535,2535,2535,2535,2535,2535,2535,2535,2535,2535


In [25]:
#fig2 = ManhattanPlot(SampleData1, title='Manhattan plot', annotation = 'annotation', gene= None)
#fig2.show()

### Selecting the significant SNPS from the sampled datasest

In [16]:
grouped_dataFrame_sign_snps = SampleData.loc[SampleData['P'] <= 0.005]
grouped_dataFrame_sign_snps

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,OR_logistic,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,CHR,BP
CHR,BP,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,3755615,2382,rs4648347,1777,2766,1103,0,6407,9531,3479,3,0.002954,1.066250,1.022080,1.112330,0.440312,0.424602,0.428141,T,C,1,3755615
1,11737218,2074,rs1133398,2321,2601,721,3,8400,8711,2299,10,0.002519,1.069770,1.023960,1.117640,0.358231,0.342839,0.346306,T,C,1,11737218
1,21688247,2124,rs904927,191,1579,3876,0,702,5828,12889,1,0.002591,1.087590,1.029750,1.148680,0.173663,0.186209,0.183383,T,C,1,21688247
1,28084237,1179,rs1467464,1965,2713,968,0,6389,9400,3627,4,0.001238,0.932670,0.894036,0.972974,0.411707,0.428873,0.425006,A,G,1,28084237
1,33019027,463,rs2282293,4029,1486,131,0,14340,4675,405,0,0.000310,1.113240,1.050170,1.180110,0.154800,0.141220,0.144279,T,C,1,33019027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22,33742833,2867,rs2008519,2661,2460,524,1,8850,8505,2061,4,0.003776,0.935371,0.894015,0.978641,0.310717,0.325170,0.321915,T,C,22,33742833
22,35839033,1803,rs11914132,111,1435,4100,0,353,4555,14512,0,0.002154,0.910620,0.857731,0.966770,0.146741,0.135453,0.137996,T,C,22,35839033
22,46513031,254,rs132198,465,2318,2863,0,1845,8220,9355,0,0.000112,1.094980,1.045710,1.146580,0.287637,0.306643,0.302362,A,C,22,46513031
22,46547970,868,rs4315633,567,2419,2660,0,2155,8563,8700,2,0.000828,1.079540,1.032160,1.129090,0.314648,0.331471,0.327681,A,G,22,46547970


### Creating a list of ref significnat SNPS from the Sample Datasets
and plicling the object for use in GENE extraction from Ensemble DB

In [17]:
# Create the lists of selected snips. 
rsfsnps = list(grouped_dataFrame_sign_snps['SNP'])
#rsfsnps
# Creating a pickle object for the list
with open('selected_rsf_snps.pickle', 'wb') as selected_rsf_snps_obj: # Creating pickle file object
    pickle.dump(rsfsnps,selected_rsf_snps_obj) # Dumping the object into the pickle file

In [18]:
print(rsfsnps)

['rs4648347', 'rs1133398', 'rs904927', 'rs1467464', 'rs2282293', 'rs4927309', 'rs3820577', 'rs12116456', 'rs6679774', 'rs1983967', 'rs10494267', 'rs10796927', 'rs10908498', 'rs1148821', 'rs8972', 'rs861581', 'rs549890', 'rs12072845', 'rs7554672', 'rs729386', 'rs7606197', 'rs556902', 'rs548032', 'rs10928585', 'rs12999294', 'rs7586173', 'rs6745954', 'rs13383927', 'rs6435979', 'rs7608896', 'rs1860700', 'rs11674589', 'rs1355070', 'rs1601375', 'rs4488811', 'rs9837421', 'rs259557', 'rs259489', 'rs13098412', 'rs1353322', 'rs6799309', 'rs1798802', 'rs4973937', 'rs7650709', 'rs28758977', 'rs6785239', 'rs17790790', 'rs1159290', 'rs260176', 'rs12639377', 'rs9859887', 'rs9851621', 'rs6805459', 'rs6809037', 'rs6438385', 'rs13074860', 'rs6794836', 'rs1421425', 'rs11929676', 'rs6809208', 'rs6786711', 'rs4130791', 'rs4692128', 'rs17653012', 'rs885443', 'rs13147707', 'rs7654525', 'rs4860532', 'rs4860707', 'rs11735016', 'rs2319691', 'rs11945668', 'rs4695789', 'rs1485936', 'rs4635969', 'rs13167887', 'rs1

##### Reading  CSV  files of extrcated genes from Ensemble DB

In [18]:
genes_b_data = pd.read_table("genes_b_data.csv", sep = '\s+')

genes_a_data = pd.read_csv('genes_a_data.csv', sep = '\s+')

genes_c_data = pd.read_csv('genes_c_data.csv', sep ='\s+')

In [19]:
genes_b_data

Unnamed: 0,refsnp_id,associated_gene
0,rs929250,HOTTIP
1,rs1557643,
2,rs2237731,
3,rs1206367,
4,rs3808081,
...,...,...
127,rs10756819,
128,rs13288849,
129,rs12342334,
130,rs10963522,


In [20]:
genes_a_data

Unnamed: 0,refsnp_id,associated_gene
0,rs8972,
1,rs259489,
2,rs259557,
3,rs260176,
4,rs549890,
...,...,...
127,rs7654525,
128,rs11945668,
129,rs13147707,
130,rs17653012,


In [21]:
genes_c_data

Unnamed: 0,refsnp_id,associated_gene
0,rs745299,
1,rs935214,"RPL28P4,LIPC"
2,rs2173288,
3,rs4334271,
4,rs7181383,
...,...,...
101,rs12967286,
102,rs16941039,
103,rs9965472,
104,rs9952696,


### Concatnating the two data frames which contain the genes, and renaming the columns

In [22]:
genes_data_merged = pd.concat([genes_a_data, genes_b_data,genes_c_data ], 
                              axis = 0).rename(columns = {'refsnp_id': 'SNP','associated_gene': 'GENE' }).reset_index(drop = True)
genes_data_merged

Unnamed: 0,SNP,GENE
0,rs8972,
1,rs259489,
2,rs259557,
3,rs260176,
4,rs549890,
...,...,...
365,rs12967286,
366,rs16941039,
367,rs9965472,
368,rs9952696,


#### The original genes with nan values dropped

In [51]:
genes_data_merged_nanDropped = genes_data_merged[genes_data_merged['GENE'].notna()].reset_index(drop = True)
#df = df[df['EPS'].notna()]
genes_data_merged_nanDropped

Unnamed: 0,SNP,GENE
0,rs4488811,SYN2
1,rs12116456,"ZNF326,BARHL2"
2,rs9837421,SH3BP5
3,rs6785239,"FAM19A4,C3orf64"
4,rs7554672,LOC101929750
5,rs6579767,HMGXB3xEPC1
6,rs17061327,"MRP63P6,CCNG1"
7,rs13167887,LPCAT1
8,rs4635969,"TERT,CLPTM1L"
9,rs4635969,CLPTM1L


#### Spliting the Genes with comma str and droping the  rows with nan GENE columns

In [24]:
genes_data_merged_split = genes_data_merged.join(pd.DataFrame(genes_data_merged.GENE.str.split(',', expand=True)
                                                              .stack().reset_index(level=1, drop=True)
                                                               ,columns=['GENE '])).drop('GENE',1).rename(columns=str.strip).reset_index(drop=True)
# spliting GENEs containg 'x', eg: 'ANP32DxHNF1B'
genes_data_merged_split = genes_data_merged_split.join(pd.DataFrame(genes_data_merged_split.GENE.str.split('x', expand=True)
                                                              .stack().reset_index(level=1, drop=True)
                                                              ,columns=['GENE '])).drop('GENE',1).rename(columns=str.strip).reset_index(drop=True)

genes_data_merged_split_na_droped = genes_data_merged_split[genes_data_merged_split['GENE'].notna()].reset_index(drop = True)

In [25]:
genes_data_merged_split 

Unnamed: 0,SNP,GENE
0,rs8972,
1,rs259489,
2,rs259557,
3,rs260176,
4,rs549890,
...,...,...
398,rs12967286,
399,rs16941039,
400,rs9965472,
401,rs9952696,


#### DF with nan values droped 

In [26]:
genes_data_merged_split_na_droped

Unnamed: 0,SNP,GENE
0,rs4488811,SYN2
1,rs12116456,ZNF326
2,rs12116456,BARHL2
3,rs9837421,SH3BP5
4,rs6785239,FAM19A4
...,...,...
85,rs8141797,SUSD2
86,rs8141797,SUSD2
87,rs8141797,CABIN1
88,rs8141797,GGT5


### Merge the Genes DF with the original sampled datasets

In [27]:
SampleData_Merged_Genes = pd.merge(left =SampleData , right = genes_data_merged,
                                   how='left', left_on='SNP', right_on='SNP')

In [28]:
SampleData_Merged_Genes

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,...,OR_lower,OR_upper,maf_case,maf_ctrl,annotation,alleleA,alleleB,CHR,BP,GENE
0,rs4970420,3640,1790,212,4,12501,6165,732,22,0.917874,...,0.945806,1.05143,0.196207,0.196644,0.196546,T,C,1,1096336,
1,rs13303016,3472,1925,247,2,11988,6577,850,5,0.793916,...,0.956453,1.05995,0.214298,0.213160,0.213416,A,G,1,1936451,
2,rs2254669,26,664,4956,0,76,2403,16941,0,0.367653,...,0.954697,1.13340,0.063408,0.065783,0.065248,A,G,1,1995968,
3,rs424079,3536,1861,245,4,11987,6479,927,27,0.139649,...,0.913970,1.01272,0.208348,0.214846,0.213381,A,C,1,2061200,
4,rs7512482,4637,951,58,0,15912,3322,185,1,0.856273,...,0.925055,1.06686,0.094492,0.095061,0.094933,T,C,1,2136826,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47854,rs11703226,2494,2543,603,6,8787,8502,2098,33,0.332329,...,0.977734,1.06886,0.332358,0.327487,0.328585,T,C,22,49000551,
47855,rs5771242,208,1644,3793,1,632,5913,12869,6,0.565587,...,0.962358,1.07271,0.182462,0.184841,0.184305,A,G,22,49003996,
47856,rs131816,5172,461,13,0,17706,1657,52,5,0.318233,...,0.857404,1.05131,0.043128,0.045352,0.044851,T,C,22,49304328,
47857,rs470119,362,2173,3111,0,1359,7428,10632,1,0.319039,...,0.976775,1.07476,0.256553,0.261239,0.260184,T,C,22,49313780,


In [26]:
#fig3 = ManhattanPlot(SampleData_Merged_Genes, title='Manhattan plot', annotation = 'annotation', gene= 'GENE')
#fig3.show()

In [29]:
unique_gene = list(set(genes_data_merged_split_na_droped['GENE']))
len(unique_gene)

71

### Looking for pathway (by genes i.e., IDs or usual name)

In [30]:
k = KEGG()
k.organism = "hsa"
human_Path = k.pathwayIds

In [32]:
run_gene_list = unique_gene #genelist_pickl_object

    #run_gene_list = ['FADS1', 'FADS2', 'FADS3']
pathway_dict = {}    
list_pathways = []
list_res = []
list_entry = []  
Genes_WithPath_hit =[] # Genes With Pathway hits 

for a_gene in run_gene_list:
    pathways = k.get_pathway_by_gene(a_gene, "hsa")
        #print(pathways)
        
    if pathways != None:
        
        pathway_dict[a_gene] = pathways
        list_pathways.append(pathways)
        Genes_WithPath_hit.append(a_gene)
        
        for a_pathway in pathways.keys():
           # search for pathways that contain the required gene Id and relations
            res = k.parse_kgml_pathway(a_pathway) 
            list_res.append(a_pathway)  
            #print(res.keys())
            
            gene_id = None
            
            for entry in res['entries']:  
                
                if entry['gene_names'] != None:
                    
                    if a_gene in entry['gene_names'].split(', '):
                        list_entry.append(entry)

                           
print ("____Done Execution!____")

____Done Execution!____


### Serialise the objects from KEGG Database

In [39]:
pickle_pathway_Obj = list_pathways

pickle_dict_res_Obj = list_res

pickle_list_entries = list_entry

pickle_pathway_dict= pathway_dict

with open('pathways.obj', 'wb') as file_pathway_Obj: # Creating pickle file object
    pickle.dump(pickle_pathway_Obj,file_pathway_Obj) # Dumping the object into the pickle file
    file_pathway_Obj.close()

    # Pathway IDS
with open('res.obj','wb') as file_dict_res_Obj:
        pickle.dump(pickle_dict_res_Obj,file_dict_res_Obj)
        file_dict_res_Obj.close()

with open('entries.obj','wb') as file_list_entry_Obj:
    pickle.dump(pickle_list_entries,file_list_entry_Obj)
    file_list_entry_Obj.close()
    
with open('pathway_dict.obj','wb') as pathway_dict_Obj:
    pickle.dump(pickle_pathway_dict,pathway_dict_Obj)
    pathway_dict_Obj.close()


### Deseralise the stored pickle files for processing 

In [41]:
with open('pathways.obj', 'rb') as ds_file_pathway_Obj:

    pickle_pathway_Obj = pickle.load(ds_file_pathway_Obj)

    ds_file_pathway_Obj.close()

with open('res.obj','rb') as ds_file_dict_res_Obj:

    pickle_dict_res_Obj = pickle.load(ds_file_dict_res_Obj)

    ds_file_dict_res_Obj.close()


with open("entries.obj",'rb') as ds_file_list_entry_Obj:

    pickle_list_entry_Obj = pickle.load(ds_file_list_entry_Obj)

    ds_file_list_entry_Obj.close()

with open('pathway_dict.obj','rb') as ds_pathway_dict_Obj:
    pickle_pathway_dict_Obj = pickle.load(ds_pathway_dict_Obj)
    ds_pathway_dict_Obj.close()

#return (pickle_pathway_Obj, pickle_dict_res_Obj, pickle_list_entry_Obj)

#### Creating the DF from the deseralised objs

In [46]:
pathways_DF = pd.DataFrame(pathway_dict)
# #list_pathways1 = pd.DataFrame(list_pathways).T
# new_df = list_pathway.stack().reset_index(level=1, drop=True).to_frame()
pathways_DF 

Unnamed: 0,HHEX,LPCAT1,TERT,KCNQ1,ADCY9,GGT5,OR2AG1,SGCG,DOCK1,ACACA,SOCS7,KL,TCF7L2,BRCA1,CCNG1,HNF1B,LIPC,ABCA1,PAX4,OR2AG2
hsa04950,Maturity onset diabetes of the young,,,,,,,,,,,,,,,Maturity onset diabetes of the young,,,Maturity onset diabetes of the young,
hsa05202,Transcriptional misregulation in cancer,,,,,,,,,,,,,,,,,,,
hsa00564,,Glycerophospholipid metabolism,,,,,,,,,,,,,,,,,,
hsa00565,,Ether lipid metabolism,,,,,,,,,,,,,,,,,,
hsa01100,,Metabolic pathways,,,Metabolic pathways,Metabolic pathways,,,,Metabolic pathways,,Metabolic pathways,,,,,Metabolic pathways,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
hsa04115,,,,,,,,,,,,,,,p53 signaling pathway,,,,,
hsa00561,,,,,,,,,,,,,,,,,Glycerolipid metabolism,,,
hsa04979,,,,,,,,,,,,,,,,,Cholesterol metabolism,Cholesterol metabolism,,
hsa02010,,,,,,,,,,,,,,,,,,ABC transporters,,


##### Converting the dictionary of pathways to a DF

In [58]:
Pathway_DF = pd.concat({k: pd.Series(v) for k, v in pathway_dict.items()}).reset_index()
Pathway_DF = Pathway_DF.rename(columns ={'level_0':'GENE','level_1':'Pathway_ID',0:'Pathway_Name'})
#pd.DataFrame(pathway_dict)

In [59]:
Pathway_DF

Unnamed: 0,GENE,Pathway_ID,Pathway_Name
0,HHEX,hsa04950,Maturity onset diabetes of the young
1,HHEX,hsa05202,Transcriptional misregulation in cancer
2,LPCAT1,hsa00564,Glycerophospholipid metabolism
3,LPCAT1,hsa00565,Ether lipid metabolism
4,LPCAT1,hsa01100,Metabolic pathways
...,...,...,...
127,ABCA1,hsa02010,ABC transporters
128,ABCA1,hsa04975,Fat digestion and absorption
129,ABCA1,hsa04979,Cholesterol metabolism
130,PAX4,hsa04950,Maturity onset diabetes of the young


In [69]:
Pathway_DF_Duplicate_removed = Pathway_DF.drop_duplicates(keep= 'first').reset_index(drop =True)
Pathway_DF_Duplicate_removed

Unnamed: 0,GENE,Pathway_ID,Pathway_Name
0,HHEX,hsa04950,Maturity onset diabetes of the young
1,HHEX,hsa05202,Transcriptional misregulation in cancer
2,LPCAT1,hsa00564,Glycerophospholipid metabolism
3,LPCAT1,hsa00565,Ether lipid metabolism
4,LPCAT1,hsa01100,Metabolic pathways
...,...,...,...
127,ABCA1,hsa02010,ABC transporters
128,ABCA1,hsa04975,Fat digestion and absorption
129,ABCA1,hsa04979,Cholesterol metabolism
130,PAX4,hsa04950,Maturity onset diabetes of the young


In [61]:
genes_data_merged_split_na_droped

Unnamed: 0,SNP,GENE
0,rs4488811,SYN2
1,rs12116456,ZNF326
2,rs12116456,BARHL2
3,rs9837421,SH3BP5
4,rs6785239,FAM19A4
...,...,...
85,rs8141797,SUSD2
86,rs8141797,SUSD2
87,rs8141797,CABIN1
88,rs8141797,GGT5


#### Merging the DFs with Genes and SNPs, keeping both occurances 

In [75]:
Genes_rsSNP_Merged = pd.merge(left =Pathway_DF , right = genes_data_merged_split_na_droped,
                                   how='outer', left_on='GENE', right_on='GENE').drop_duplicates(keep = 'first')
Genes_rsSNP_Merged_renamed =Genes_rsSNP_Merged.rename(columns ={'GENE':'GENE_Name'})
Genes_rsSNP_Merged_renamed

Unnamed: 0,GENE_Name,Pathway_ID,Pathway_Name,SNP
0,HHEX,hsa04950,Maturity onset diabetes of the young,rs7923837
2,HHEX,hsa05202,Transcriptional misregulation in cancer,rs7923837
4,LPCAT1,hsa00564,Glycerophospholipid metabolism,rs13167887
5,LPCAT1,hsa00565,Ether lipid metabolism,rs13167887
6,LPCAT1,hsa01100,Metabolic pathways,rs13167887
...,...,...,...,...
196,GPR179,,,rs757210
197,ARHGAP23,,,rs757210
198,BCAS3,,,rs11651885
199,CABIN1,,,rs8141797


#### Creating Dataframes from the List of dictionaries 

In [63]:
#pathways_list = 
DF_entry = pd.DataFrame(pickle_list_entry_Obj)
DF_entry

Unnamed: 0,id,name,type,link,gene_names
0,34,hsa:3087,gene,http://www.kegg.jp/dbget-bin/www_bget?hsa:3087,"HHEX, HEX, HMPH, HOX11L-PEN, PRH, PRHX"
1,444,hsa:3087,gene,http://www.kegg.jp/dbget-bin/www_bget?hsa:3087,"HHEX, HEX, HMPH, HOX11L-PEN, PRH, PRHX"
2,131,hsa:7015,gene,http://www.kegg.jp/dbget-bin/www_bget?hsa:7015,"TERT, CMM9, DKCA2, DKCB4, EST2, PFBMFT1, TCS1,..."
3,326,hsa:7015,gene,http://www.kegg.jp/dbget-bin/www_bget?hsa:7015,"TERT, CMM9, DKCA2, DKCB4, EST2, PFBMFT1, TCS1,..."
4,151,hsa:7015,gene,http://www.kegg.jp/dbget-bin/www_bget?hsa:7015,"TERT, CMM9, DKCA2, DKCB4, EST2, PFBMFT1, TCS1,..."
5,634,hsa:7015,gene,http://www.kegg.jp/dbget-bin/www_bget?hsa:7015,"TERT, CMM9, DKCA2, DKCB4, EST2, PFBMFT1, TCS1,..."
6,159,hsa:7015,gene,http://www.kegg.jp/dbget-bin/www_bget?hsa:7015,"TERT, CMM9, DKCA2, DKCB4, EST2, PFBMFT1, TCS1,..."
7,110,hsa:7015,gene,http://www.kegg.jp/dbget-bin/www_bget?hsa:7015,"TERT, CMM9, DKCA2, DKCB4, EST2, PFBMFT1, TCS1,..."
8,140,hsa:7015,gene,http://www.kegg.jp/dbget-bin/www_bget?hsa:7015,"TERT, CMM9, DKCA2, DKCB4, EST2, PFBMFT1, TCS1,..."
9,116,hsa:3784 hsa:9992,gene,http://www.kegg.jp/dbget-bin/www_bget?hsa:3784...,"KCNQ1, ATFB1, ATFB3, JLNS1, KCNA8, KCNA9, KVLQ..."


#### Drop Duplicates in the name column

In [65]:
pathways_list_Duplicate_removed = DF_entry.drop_duplicates(['gene_names'], 
                                                                keep= 'first').reset_index(drop =True).drop(['type','id'], axis = 1)

pathways_list_Duplicate_removed
#pathways_list

Unnamed: 0,name,link,gene_names
0,hsa:3087,http://www.kegg.jp/dbget-bin/www_bget?hsa:3087,"HHEX, HEX, HMPH, HOX11L-PEN, PRH, PRHX"
1,hsa:7015,http://www.kegg.jp/dbget-bin/www_bget?hsa:7015,"TERT, CMM9, DKCA2, DKCB4, EST2, PFBMFT1, TCS1,..."
2,hsa:3784 hsa:9992,http://www.kegg.jp/dbget-bin/www_bget?hsa:3784...,"KCNQ1, ATFB1, ATFB3, JLNS1, KCNA8, KCNA9, KVLQ..."
3,hsa:3784,http://www.kegg.jp/dbget-bin/www_bget?hsa:3784,"KCNQ1, ATFB1, ATFB3, JLNS1, KCNA8, KCNA9, KVLQ..."
4,hsa:6445,http://www.kegg.jp/dbget-bin/www_bget?hsa:6445,"SGCG, 35DAG, A4, DAGA4, DMDA, DMDA1, LGMD2C, L..."
5,hsa:1793,http://www.kegg.jp/dbget-bin/www_bget?hsa:1793,"DOCK1, DOCK180, ced5"
6,hsa:31 hsa:32,http://www.kegg.jp/dbget-bin/www_bget?hsa:31+h...,"ACACA, ACAC, ACACAD, ACC, ACC1, ACCA..."
7,hsa:31,http://www.kegg.jp/dbget-bin/www_bget?hsa:31,"ACACA, ACAC, ACACAD, ACC, ACC1, ACCA"
8,hsa:9365,http://www.kegg.jp/dbget-bin/www_bget?hsa:9365,"KL, HFTC3"
9,hsa:672,http://www.kegg.jp/dbget-bin/www_bget?hsa:672,"BRCA1, BRCAI, BRCC1, BROVCA1, FANCS, IRIS, PNC..."


#### Creating clickable links

In [70]:
def make_Link_clickable(val):
    return '<a href="{}">{}</a>'.format(val, val)

pathways_list_Duplicate_removed.style.format({'link': make_Link_clickable})

Unnamed: 0,name,link,gene_names
0,hsa:3087,http://www.kegg.jp/dbget-bin/www_bget?hsa:3087,"HHEX, HEX, HMPH, HOX11L-PEN, PRH, PRHX"
1,hsa:7015,http://www.kegg.jp/dbget-bin/www_bget?hsa:7015,"TERT, CMM9, DKCA2, DKCB4, EST2, PFBMFT1, TCS1, TP2, TRT, hEST2, hTRT"
2,hsa:3784 hsa:9992,http://www.kegg.jp/dbget-bin/www_bget?hsa:3784+hsa:9992,"KCNQ1, ATFB1, ATFB3, JLNS1, KCNA8, KCNA9, KVLQT1, Kv1.9, Kv7.1, LQT, LQT1, RWS, SQT2, WRS..."
3,hsa:3784,http://www.kegg.jp/dbget-bin/www_bget?hsa:3784,"KCNQ1, ATFB1, ATFB3, JLNS1, KCNA8, KCNA9, KVLQT1, Kv1.9, Kv7.1, LQT, LQT1, RWS, SQT2, WRS"
4,hsa:6445,http://www.kegg.jp/dbget-bin/www_bget?hsa:6445,"SGCG, 35DAG, A4, DAGA4, DMDA, DMDA1, LGMD2C, LGMDR5, MAM, SCARMD2, SCG3, gamma-SG"
5,hsa:1793,http://www.kegg.jp/dbget-bin/www_bget?hsa:1793,"DOCK1, DOCK180, ced5"
6,hsa:31 hsa:32,http://www.kegg.jp/dbget-bin/www_bget?hsa:31+hsa:32,"ACACA, ACAC, ACACAD, ACC, ACC1, ACCA..."
7,hsa:31,http://www.kegg.jp/dbget-bin/www_bget?hsa:31,"ACACA, ACAC, ACACAD, ACC, ACC1, ACCA"
8,hsa:9365,http://www.kegg.jp/dbget-bin/www_bget?hsa:9365,"KL, HFTC3"
9,hsa:672,http://www.kegg.jp/dbget-bin/www_bget?hsa:672,"BRCA1, BRCAI, BRCC1, BROVCA1, FANCS, IRIS, PNCA4, PPP1R53, PSCP, RNF53"


### Final DataFrame for drawing Manhattan plot 

In [81]:
Final_DataFrame = pd.merge(left =SampleData_Merged_Genes, right = Genes_rsSNP_Merged_renamed,
                                   how='left', left_on='SNP', right_on='SNP')
Final_DataFrame

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,...,maf_ctrl,annotation,alleleA,alleleB,CHR,BP,GENE,GENE_Name,Pathway_ID,Pathway_Name
0,rs4970420,3640,1790,212,4,12501,6165,732,22,0.917874,...,0.196644,0.196546,T,C,1,1096336,,,,
1,rs13303016,3472,1925,247,2,11988,6577,850,5,0.793916,...,0.213160,0.213416,A,G,1,1936451,,,,
2,rs2254669,26,664,4956,0,76,2403,16941,0,0.367653,...,0.065783,0.065248,A,G,1,1995968,,,,
3,rs424079,3536,1861,245,4,11987,6479,927,27,0.139649,...,0.214846,0.213381,A,C,1,2061200,,,,
4,rs7512482,4637,951,58,0,15912,3322,185,1,0.856273,...,0.095061,0.094933,T,C,1,2136826,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48145,rs11703226,2494,2543,603,6,8787,8502,2098,33,0.332329,...,0.327487,0.328585,T,C,22,49000551,,,,
48146,rs5771242,208,1644,3793,1,632,5913,12869,6,0.565587,...,0.184841,0.184305,A,G,22,49003996,,,,
48147,rs131816,5172,461,13,0,17706,1657,52,5,0.318233,...,0.045352,0.044851,T,C,22,49304328,,,,
48148,rs470119,362,2173,3111,0,1359,7428,10632,1,0.319039,...,0.261239,0.260184,T,C,22,49313780,,,,


In [91]:
kk2 = Final_DataFrame.dropna().reset_index(drop = True)
kk2

Unnamed: 0,SNP,case11,case12,case22,case??,ctrl11,ctrl12,ctrl22,ctrl??,P,...,maf_ctrl,annotation,alleleA,alleleB,CHR,BP,GENE,GENE_Name,Pathway_ID,Pathway_Name
0,rs4635969,68,1123,4455,0,197,3574,15649,0,0.004226,...,0.102163,0.104265,T,C,5,1361552,"TERT,CLPTM1L",TERT,hsa05165,Human papillomavirus infection
1,rs4635969,68,1123,4455,0,197,3574,15649,0,0.004226,...,0.102163,0.104265,T,C,5,1361552,"TERT,CLPTM1L",TERT,hsa05166,Human T-cell leukemia virus 1 infection
2,rs4635969,68,1123,4455,0,197,3574,15649,0,0.004226,...,0.102163,0.104265,T,C,5,1361552,"TERT,CLPTM1L",TERT,hsa05200,Pathways in cancer
3,rs4635969,68,1123,4455,0,197,3574,15649,0,0.004226,...,0.102163,0.104265,T,C,5,1361552,"TERT,CLPTM1L",TERT,hsa05225,Hepatocellular carcinoma
4,rs4635969,68,1123,4455,0,197,3574,15649,0,0.004226,...,0.102163,0.104265,T,C,5,1361552,"TERT,CLPTM1L",TERT,hsa05226,Gastric cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,rs8141797,4707,903,33,3,15896,3327,187,10,0.002310,...,0.095338,0.093202,A,G,22,22912041,"CABIN1,SUSD2",GGT5,hsa01100,Metabolic pathways
214,rs8141797,4707,903,33,3,15896,3327,187,10,0.002310,...,0.095338,0.093202,A,G,22,22912041,"SUSD2,CABIN1,GGT5",GGT5,hsa00430,Taurine and hypotaurine metabolism
215,rs8141797,4707,903,33,3,15896,3327,187,10,0.002310,...,0.095338,0.093202,A,G,22,22912041,"SUSD2,CABIN1,GGT5",GGT5,hsa00480,Glutathione metabolism
216,rs8141797,4707,903,33,3,15896,3327,187,10,0.002310,...,0.095338,0.093202,A,G,22,22912041,"SUSD2,CABIN1,GGT5",GGT5,hsa00590,Arachidonic acid metabolism


In [94]:
#Final_DataFrame.columns

In [27]:
#fig4 = ManhattanPlot(Final_DataFrame, title='Manhattan plot', annotation = 'Pathway_Name', gene= 'GENE')
#fig4.show()

### Creating Sliders in Dash

In [None]:
#!pip install jupyter-dash

In [28]:
app = JupyterDash(__name__)
#n_chr = 23  # number of chromosome pairs in humans
assert 'CHR' in Final_DataFrame.columns
#assert Final_DataFrame['CHR'].max() == n_chr

# # Trim down the data
# DATASET = df.groupby('CHR').apply(lambda u: u.head(50))
# DATASET = DATASET.droplevel('CHR').reset_index(drop=True)

manhattanplot = ManhattanPlot(
    dataframe=Final_DataFrame,
    suggestiveline_color='#AA00AA',
    genomewideline_color='#AA5500'
)
app.layout = html.Div([
    ' - log10(P-Value Threshold)',
    dcc.Slider(
        id='manhattanplot-input',
        min=1,
        max=10,
        marks={
            i: 'P-Value{}'.format(i) for i in range(10)
        },
        value=6
    ),
     html.Br(),
    html.Div(
        dcc.Graph(
            id='my-dashbio-manhattanplot',
            figure=ManhattanPlot(
                dataframe=Final_DataFrame
            )
        )
    )
])

@app.callback(
    dash.dependencies.Output('my-dashbio-manhattanplot', 'figure'),
    [dash.dependencies.Input('manhattanplot-input', 'value')]
)
def update_manhattanplot(threshold):

    return ManhattanPlot(
        dataframe=Final_DataFrame,
        genomewideline_value=threshold
    )


if __name__ == '__main__':
    app.run_server(mode = 'inline')  #Turn off reloader if inside Jupyter

In [None]:
# # Load Data
# df = px.data.tips()
# # Build App
# app = JupyterDash(__name__)
# app.layout = html.Div([
#     html.H1("JupyterDash Demo"),
#     dcc.Graph(id='graph'),
#     html.Label([
#         "colorscale",
#         dcc.Dropdown(
#             id='colorscale-dropdown', clearable=False,
#             value='plasma', options=[
#                 {'label': c, 'value': c}
#                 for c in px.colors.named_colorscales()
#             ])
#     ]),
# ])
# # Define callback to update graph
# @app.callback(
#     Output('graph', 'figure'),
#     [Input("colorscale-dropdown", "value")]
# )
# def update_figure(colorscale):
#     return px.scatter(
#         df, x="total_bill", y="tip", color="size",
#         color_continuous_scale=colorscale,
#         render_mode="webgl", title="Tips"
#     )
# # Run app and display result inline in the notebook
# app.run_server(mode='inline')

In [None]:
with open('GWAS_rename_DF.pickle', 'rb') as GWAS_rename_DF_Obj:

    GWAS_rename_DF_Ob = pickle.load(GWAS_rename_DF_Obj)

    GWAS_rename_DF_Obj.close()
    

In [None]:
#GWAS_rename_DF_Ob

In [None]:
#get the snps as a list
snps_lists = GWAS_rename_DF_Ob['SNP']

# remove duplicates if any
snps_lists_duplicate_removed = list(set(snps_lists))
#pickling the Snps for use in biomartRt
with open('snps_list.pickle', 'wb') as fsnps_list_Obj: # Creating pickle file object
    pickle.dump(snps_lists_duplicate_removed,fsnps_list_Obj) # Dumping the object into the pickle file

In [None]:
# grouped_dataFrame2 = pd.DataFrame()

# for num, (name, group) in enumerate(DATASET_trimgroup):
#     #px.scatter(group, x='ind', y='p_adj', color="Chr")
#     grouped_dataFrame2 = grouped_dataFrame2.append(group)

## .Second Datasets just in Case

In [None]:
gwas_DF4 = pd.read_csv('GWASData/GWAS/DIAGRAMv3.2012DEC17.txt', sep='\t')

In [None]:
gwas_DF4

In [None]:
gwas_DF4.dtypes

In [None]:
gwas_DF4_sorted_DF_Not_rs = gwas_DF4[gwas_DF4['SNP'].str.startswith('rs')]

In [None]:
gwas_DF4_sorted_DF_Not_rs

In [None]:
gwas_DF4_sorted_DF_Not_rs.isnull().sum()

In [None]:
DATASET_trim4 = gwas_DF4_sorted_DF_Not_rs.groupby(['CHROMOSOME', 'POSITION']).apply(lambda u: u.head(1500))
DATASET_trim4 = DATASET_trim4.droplevel(['CHROMOSOME','POSITION']).reset_index(drop=True)

In [None]:
DATASET_trim4

In [None]:
DATASET_trim4 = DATASET_trim4.rename(columns={
                        "CHROMOSOME": "CHR",
                        "POSITION":"BP",
                       "P_VALUE":"P"
                        
                       }, errors="raise")

### Groupby Testings

In [190]:
aList = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
'one', 'two', 'three', 'four', 'five',
'six', 'seven', 'eight', 'nine', 'ten']

# Open the file for writing
dataFile = open('writetest.txt', 'w')

# Loop through each item in the list
# and write it to the output file.
for eachitem in aList:
    dataFile.write(str(eachitem)+'\n')

# Close the output file
dataFile.close()

In [193]:
kf = pd.read_csv('writetest.txt')

##### Coverting list of dicts to dictionary 

In [126]:
from collections import ChainMap
#data = dict(ChainMap(*data))
dicts = dict(ChainMap(*list_pathways))

In [2]:
import plotly.graph_objs as go
import numpy as np

In [29]:
layout1= go.Layout(title=go.layout.Title(text="A graph",x=0.5),
        xaxis={'title':'x[m]'},
        yaxis={'title':'y[m]','range':[-10,10]})

point_plot=[
            go.Scatter(x=[1,2,3,3,4],y=[1,1,np.nan, 1,2],name="V0"),
            go.Scatter(x=[5,6],y=[2,3],name="GT")
    ]
      
      
      
go.Figure(data=point_plot, layout=layout1).show()


In [6]:
lake = pd.DataFrame({'co tp': ['DE Lake', 'Forest', 'FR Lake', 'Forest'], 
                 'area': [10, 20, 30, 40], 
                 'count': [7, 5, 2, 3]})
lake.set_index('co tp', inplace=True)

In [8]:
lake

Unnamed: 0_level_0,area,count
co tp,Unnamed: 1_level_1,Unnamed: 2_level_1
DE Lake,10,7
Forest,20,5
FR Lake,30,2
Forest,40,3


In [19]:
# to get key value using pandas
area_dict = lake.set_index('area').to_dict()['count']#('records')[0]
print(area_dict)


{10: 7, 20: 5, 30: 2, 40: 3}


In [20]:
area_dict

{10: 7, 20: 5, 30: 2, 40: 3}

In [10]:
df = pd.DataFrame(np.random.randint(32, 120, 100000).reshape(50000,2),columns=list('AB'))
df['A'] = df['A'].apply(chr)

# %timeit dict(zip(df.A,df.B))
# %timeit pd.Series(df.A.values,index=df.B).to_dict()
# %timeit df.set_index('A').to_dict()['B']

In [11]:
df.head()

Unnamed: 0,A,B
0,B,75
1,U,112
2,`,107
3,P,50
4,s,46


In [15]:
k = dict(zip(df.A,df.B))

k1= pd.Series(df.A.values,index=df.B).to_dict()

k2= df.set_index('A').to_dict()['B']


In [21]:
print(k2)

{'B': 118, 'U': 56, '`': 68, 'P': 113, 's': 39, 'A': 117, 'W': 60, '^': 96, '1': 57, '5': 75, 'j': 98, 'f': 110, '(': 51, '*': 119, 'G': 37, '+': 38, 'L': 61, 't': 70, '?': 94, 'b': 90, 'S': 43, '.': 113, 'r': 54, 'd': 36, '4': 33, 'l': 78, '<': 73, 'i': 104, '%': 74, 'n': 53, 'J': 99, '$': 119, 'D': 84, 'k': 44, 'K': 83, '=': 64, 'h': 35, '!': 105, 'a': 54, 'N': 107, 'p': 89, 'E': 76, 'C': 69, "'": 82, 'v': 45, '\\': 68, 'M': 32, '[': 74, '6': 71, ' ': 47, 'T': 45, '_': 44, 'X': 60, '-': 69, 'u': 94, '8': 72, '"': 51, 'o': 96, ';': 114, 'V': 114, 'R': 32, '2': 47, '0': 49, 'g': 47, 'e': 92, '&': 115, 'H': 54, 'Z': 113, '/': 58, '#': 62, '@': 40, ':': 113, 'O': 86, 'm': 47, 'Q': 115, ']': 48, '9': 70, ')': 111, '>': 37, 'F': 67, 'I': 85, ',': 101, '3': 119, '7': 43, 'Y': 61, 'q': 71, 'c': 110, 'w': 102}


In [13]:
a, b = 20, 2
while a < 150:
    
    a, b = b, a+b
    print(a)

2
22
24
46
70
116
186


In [14]:
range(5)

range(0, 5)

In [15]:
for i in range(5):
    print (i)

0
1
2
3
4


In [30]:
# sample data
d={'A':[3,3,2,1,5],
   'B':[4,4,1,4,7],
   'A_info':['nothing', '', '', 'bad', 'good'],
   'B_info':['', '', 'bad', 'better', 'best']}

# pandas dataframe
df=pd.DataFrame(d, index=[10,11,12,13,14])

# set up plotly figure
fig = go.Figure()

# add line / trace 1 to figure
fig.add_trace(go.Scatter(
    x=df.index,
    y=df['A'],
    hovertext=df['A_info'],
    hoverinfo="text",
    marker=dict(
        color="blue"
    ),
    showlegend=False
))

# add line / trace 2 to figure
fig.add_trace(go.Scatter(
    x=df.index,
    y=df['B'],
    hovertext=df['B_info'],
    hoverinfo="text",
    marker=dict(
        color="green"
    ),
    showlegend=False
))

fig.show()

In [5]:
df

Unnamed: 0,A,B,A_info,B_info
10,3,4,nothing,
11,3,4,,
12,2,1,,bad
13,1,4,bad,better
14,5,7,good,best
