# Extracting metadata for identified carriers (from GP2 NBA, CES and WGS)

## Exploring the Global Landscape of Rare Causal and Common High-Risk Variants in Parkinson’s Disease

`GP2 ❤️ Open Science 😍`

## Description:

This notebook contains the code and workflow used in the study: **“Exploring the Global Landscape of Rare Causal and Common High-Risk Variants in Parkinson’s Disease”**.

In this notebook we extract metadata for identified carriers (from GP2 NBA, CES and WGS).

### Outline:

* **0. Set Up**

* **1. Read in and prepare clinical data**

* **2. Read in file with your carrier/individual IDs and merge with clinical data**

## 0. Set Up

In [None]:
# Use the os package to interact with the environment
import os

# Bring in Pandas for Dataframe functionality
import pandas as pd

# Numpy for basics
import numpy as np

# Use StringIO for working with file contents
from io import StringIO

# Enable IPython to display matplotlib graphs
import matplotlib.pyplot as plt
%matplotlib inline

# Enable interaction with the FireCloud API
#from firecloud import api as fapi

# Import the iPython HTML rendering for displaying links to Google Cloud Console
from IPython.core.display import display, HTML

# Import urllib modules for building URLs to Google Cloud Console
import urllib.parse

# BigQuery for querying data
from google.cloud import bigquery

# Import sys (used to write progress messages to stderr)
import sys as sys

import openpyxl
import glob

In [None]:
# Utility routine for printing a shell command before executing it
def shell_do(command):
    """Echo a shell command to stderr, then run it through IPython's shell escape.

    Args:
        command: The command line to run, as a single string.

    Returns:
        None. The command's output is not captured by this function.

    Note:
        The ``!`` line below is IPython syntax, so this helper only works
        inside an IPython/Jupyter session, not in plain Python.
    """
    print(f'Executing: {command}', file=sys.stderr)
    # IPython shell escape: `$command` is replaced by the value of `command`.
    !$command
    
def shell_return(command):
    """Echo a shell command to stderr, run it, and return what it printed.

    Args:
        command: The command line to run, as a single string.

    Returns:
        The captured output lines joined with newline characters (so there is
        no trailing newline after the last line).

    Note:
        Relies on IPython's ``!`` shell escape, so it only works inside an
        IPython/Jupyter session.
    """
    print(f'Executing: {command}', file=sys.stderr)
    # Assigning from a `!` escape captures the command's output as a list of lines.
    output = !$command
    return '\n'.join(output)

# Helper that echoes a BigQuery SQL statement before running it
def bq_query(query):
    """Log ``query`` to stderr and return its result as a DataFrame.

    Args:
        query: SQL text, interpreted with BigQuery's standard SQL dialect.

    Returns:
        Whatever ``pd.read_gbq`` returns for the query.

    Note:
        The query is run under the project named by the notebook-level
        ``BILLING_PROJECT_ID`` variable, which must be defined before calling.
    """
    print(f'Executing: {query}', file=sys.stderr)
    gbq_options = {'project_id': BILLING_PROJECT_ID, 'dialect': 'standard'}
    return pd.read_gbq(query, **gbq_options)

# Utility routine for displaying a message and a link
def display_html_link(description, link_text, url):
    """Render a short HTML snippet: ``description`` followed by a hyperlink.

    The three values are interpolated into the markup as-is; no HTML escaping
    is applied.

    Args:
        description: Text placed before the link.
        link_text: Visible text of the anchor.
        url: Target of the anchor (the anchor carries ``target=_blank``).
    """
    # The snippet starts with a newline, every line is indented four spaces,
    # and it ends with an indented empty line.
    markup_lines = [
        '',
        '    <p>',
        '    </p>',
        '    <p>',
        f'    {description}',
        f'    <a target=_blank href="{url}">{link_text}</a>.',
        '    </p>',
        '    ',
    ]
    display(HTML('\n'.join(markup_lines)))

# Utility routines for reading files from Google Cloud Storage
def gcs_read_file(path):
    """Return the contents of a file in GCS as a single string.

    Args:
        path: Location of the object, passed unchanged to ``gsutil cat``.

    Returns:
        The captured output lines joined with newline characters (no trailing
        newline after the last line).

    Note:
        Uses IPython's ``!`` shell escape, so it only works inside an
        IPython/Jupyter session. ``BILLING_PROJECT_ID`` must already be
        defined in the notebook namespace; it is not set in this section.
    """
    # `{...}` inside a `!` escape is replaced by the Python value; gsutil's
    # `-u` flag names the project to bill for the request.
    contents = !gsutil -u {BILLING_PROJECT_ID} cat {path}
    return '\n'.join(contents)
    
def gcs_read_csv(path, sep=None):
    """Load a delimited text file stored in GCS into a DataFrame.

    Args:
        path: Location of the object, forwarded to ``gcs_read_file``.
        sep: Column delimiter handed to ``pd.read_csv``. The default ``None``
            is passed through together with the Python parsing engine.

    Returns:
        The DataFrame parsed from the file's text.
    """
    # Wrap the downloaded text in a file-like object so pandas can parse it.
    text_buffer = StringIO(gcs_read_file(path))
    return pd.read_csv(text_buffer, sep=sep, engine='python')

# Utility routine for displaying a message and link to Cloud Console
def link_to_cloud_console_gcs(description, link_text, gcs_path):
    """Display a message with a link to a GCS location in the Cloud Console.

    Args:
        description: Text shown before the link.
        link_text: Visible text of the link.
        gcs_path: GCS location; any ``gs://`` marker is stripped before the
            path is appended to the storage-browser URL.

    Note:
        The link carries ``userProject=BILLING_PROJECT_ID`` as its query
        string, so ``BILLING_PROJECT_ID`` must be defined in the notebook
        namespace before calling.
    """
    base_url = 'https://console.cloud.google.com/storage/browser'

    # URLs always use '/', so join by hand instead of with os.path.join, which
    # uses the OS path separator and drops the base entirely when the second
    # component starts with '/'.
    object_path = gcs_path.replace("gs://", "").lstrip('/')
    query_string = urllib.parse.urlencode({'userProject': BILLING_PROJECT_ID})
    url = f'{base_url}/{object_path}?{query_string}'

    display_html_link(description, link_text, url)

## 1. Read in and prepare clinical data

In [None]:
# Read the master key table into a DataFrame and report its (rows, columns) shape.
# Edit this path if you want to look into data from previous releases.
master_key_path = '/home/jupyter/workspace/path/to/release9/clinical_data/master_key_release9_final_vwb.csv'
master_key_df = pd.read_csv(master_key_path)
print(master_key_df.shape)

In [None]:
# Print the available data variables: a heading, a blank line, then the
# DataFrame.info() summary of the master key
print('Basic information:', end='\n\n')
master_key_df.info()

In [None]:
# Rename some variables (Optional!)
# Maps master-key column names to the shorter names used in the cells below.
# Each source column appears exactly once: Python silently collapses a key
# repeated in a dict literal, which can hide a typo.
column_renames = {
    'GP2ID': 'IID',
    'family_history_for_qc': 'FH',
    'biological_sex_for_qc': 'gender',
    'age_at_sample_collection': 'age',
    'age_of_onset': 'AAO',
    'age_at_diagnosis': 'AAD',
    'baseline_GP2_phenotype_for_qc': 'phenotype',
    'nba_prune_reason': 'nba_prune',
    'wgs_prune_reason': 'wgs_prune',
    'race_for_qc': 'race',
}

# Rename in place so that later cells see the new names on `master_key_df`.
master_key_df.rename(columns=column_renames, inplace=True)

In [None]:
# Keep only the columns of 'master_key_df' needed for the carrier metadata.
# Adjust this list according to the data you're interested in; every name in
# it must be a column of the master key.
columns_to_extract = [
    'IID',
    'wgs_GP2ID_r8',
    'nba',
    'wgs',
    'clinical_exome',
    'nba_label',
    'wgs_label',
    'study_type',
    'gender',
    'age',
    'AAO',
    'AAD',
    'FH',
    'diagnosis',
    'phenotype',
]

master_key_subset = master_key_df[columns_to_extract]

print(master_key_subset)

## 2. Read in file with your carrier/individual IDs and merge with clinical data

In [None]:
# Create a working folder in your workspace
print("Making a working directory")

# Single definition of the working directory, used for the creation below.
workdir = "/home/jupyter/workspace/ws_files/meta"

# Like `mkdir -p`: creates missing parent folders and does nothing if the
# folder already exists. Unlike a shell escape, it raises an error if the
# folder cannot be created.
os.makedirs(workdir, exist_ok=True)

In [None]:
# Load the table of variant carriers.
# The sample-ID column in this file must use the same header ('IID') that you
# chose for the master key subset, because the merge below joins on it.
carrier_file = '/home/jupyter/workspace/ws_files/meta/carrier_ID_input_file.csv'
carrier = pd.read_csv(carrier_file)
print(carrier)

In [None]:
# Attach the clinical columns to each carrier by sample ID ('IID').
# A left join keeps every row of 'carrier', including IDs that have no match
# in the master key subset.
carrier_meta = carrier.merge(master_key_subset, how='left', on='IID')

print(carrier_meta)