# Correlation Analysis

In bioinformatics, correlation can be used to identify co-regulated genes, or to identify an association between gene expression and the effect of a drug. We will explore both in this example notebook.

### 1. Import Libraries we need for our analysis

In [None]:
import requests
import pandas as pd
import string
import random
import pprint
import json
import time
import os
# Pretty-printer for readable dumps of nested structures
# (compact layout, lines wrapped at 79 characters).
pp = pprint.PrettyPrinter(compact=True, width=79)

### Define your credentials

Credentials can be found on [your Lumin Workspace page](https://database.championsoncology.com/lumin/workstation/)

In [None]:
# Define user credentials.
# Every value can be supplied through an environment variable so that real
# credentials never have to be saved inside the notebook:
#   LUMIN_USERNAME, LUMIN_CLIENT_ID, LUMIN_USER_ID, LUMIN_TOKEN
# The identifiers fall back to the example workspace values; the access token
# is a secret and is therefore read from the environment only.
username = os.environ.get('LUMIN_USERNAME', '5036-99-ca8c10')
client_id = int(os.environ.get('LUMIN_CLIENT_ID', 99))
user_id = int(os.environ.get('LUMIN_USER_ID', 5036))
token = os.environ.get('LUMIN_TOKEN', '')

if not token:
    # Warn instead of raising so the cached-CSV cells further down stay usable
    # without any credentials.
    print("Warning: LUMIN_TOKEN is not set. Export your access token as the "
          "LUMIN_TOKEN environment variable before requesting data.")

Below is a function we use later to retrieve data. It abstracts away the work of making a data request. You shouldn't need to change anything here.

In [None]:
def request_data(table="expression", cancer_type = ["all"], genes = [], samples = [], drugs = None, source = "PDX", data_set_name = None, poll_interval = 20):
    """Request a data set from the API and combine the result into one CSV.

    Posts a data request, polls the task until it reports ``SUCCESS``, renames
    the result folder inside the user's home directory to ``data_set_name``,
    reads every file of that folder with ``pd.read_json``, concatenates them
    and writes the combined table to
    ``<home>/<data_set_name>/<data_set_name>.csv``.

    Authentication and workspace details come from the notebook-level
    variables ``username``, ``client_id``, ``user_id`` and ``token``.

    Parameters
    ----------
    table : str
        Sent as ``request_data_type``.
    cancer_type : list
        Sent as ``request_cancer_type``.
    genes : list
        Sent as ``request_genes``.
    samples : list
        Sent as ``request_models``.
    drugs : list or None
        Sent as ``request_agents``.
    source : str
        Sent as ``request_dataset``.
    data_set_name : str or None
        Name of the output folder and CSV file. Non-alphanumeric characters
        are removed; a random ``Data-XXXXXX`` name is generated when no usable
        name remains.
    poll_interval : int or float
        Seconds to wait between two status polls.

    Returns
    -------
    tuple
        ``(data, csv_path)`` on success, ``(None, None)`` when no task id was
        returned or the task reported an error.
    """
    # The list defaults above are never mutated, so sharing them is harmless.

    # Remove any special characters from a user supplied name
    if data_set_name is not None:
        data_set_name = ''.join(e for e in data_set_name if e.isalnum())
    if not data_set_name:
        # Nothing usable was given: generate a folder name. Without this an
        # empty name would make the rename below target the home directory.
        res = ''.join(random.choices(string.ascii_uppercase +
                             string.digits, k = 6))
        data_set_name = 'Data-'+res

    # Define the data request dictionary
    d = {
        "request_data_type": table,
        "request_cancer_type": cancer_type,
        "request_genes": genes,
        "request_models": samples,
        "request_agents": drugs,  # was hard-coded to None, ignoring `drugs`
        "request_dataset": source,
        "request_workspace_id": username,
        "request_client": client_id,
        "request_user": user_id,
        "request_mode": True,
        "request_display_error": False,
        "preview": True
    }

    # Create the request
    api_root = 'https://stag.lumin-fast-api.championsoncology.com'
    headers={"authorization":"Bearer "+token}
    response = requests.post(api_root+'/workstation/',json=d, headers=headers)

    # .get() so that a reply without a task id reaches the message below
    # instead of raising KeyError
    task_id = response.json().get('task_id')
    if task_id is None:
        print("Error: No task_id returned. Please check the request details")
        # Two values, because callers unpack `df, location = request_data(...)`
        return None, None

    print('')
    print("Data Request ID: {}".format(task_id))

    # Check the status of the request until it's complete
    poll_url = api_root+'/tasks/poll/'+task_id+'?old_progress=0'
    while True:
        response = requests.get(poll_url,json=d, headers=headers)
        payload = response.json()
        state = payload['state']
        if state == 'SUCCESS':
            rd = json.loads(payload['result'])
            folder = rd['name']
            break
        if state == 'error':
            # NOTE(review): other terminal states (e.g. an upper-case failure
            # state) are not recognised here and would keep the loop polling
            # - confirm against the API.
            print("Error: {}".format(payload['error']))
            return None, None
        if state == 'PROGRESS':
            print("Still loading...")
        else:
            print("Status: {}".format(state))
        # Wait after every unfinished poll; previously the PROGRESS branch
        # re-polled immediately in a tight loop.
        time.sleep(poll_interval)

    # Change the folder name to the data set name
    directory = os.path.expanduser("~")
    target_folder = os.path.join(directory, data_set_name)
    os.rename(os.path.join(directory, folder), target_folder)

    # Read every file once (sorted for a reproducible row order) and
    # concatenate in a single step instead of growing a DataFrame in the loop
    frames = []
    for i, file in enumerate(sorted(os.listdir(target_folder)), start=1):
        print(f'reading file {i} {file} ...')
        frames.append(pd.read_json(os.path.join(target_folder, file)))
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Save the combined table as a CSV next to the downloaded files
    csv_path = os.path.join(target_folder, data_set_name+'.csv')
    data.to_csv(csv_path, encoding='utf-8')

    # Return the data together with the file name and location
    return data, csv_path

### 2. Request Data

In [None]:
# First run only: download the TGI data through the API (uncomment to use)
#df, location = request_data(table="TGI", drugs=["Olaparib"])
#print("Data saved to {} and available as df".format(location))

# Keep just the columns needed for the analysis and store them as a CSV
#data_df = df[['model_name','agent','TGI','tumor_type']]
#print(data_df)

#data_df.to_csv(location, encoding='utf-8')


# Every later run reloads the saved CSV instead of querying the API again
location = '../Data/TGI.csv'
load_data_df = pd.read_csv(filepath_or_buffer=location)
print(load_data_df.head(n=5))

# Example of an expression request restricted to two genes
#df, location = request_data(table="expression", genes = ['BRCA1', 'BRCA2'], data_set_name="BRCA!_BRCA2")
#print("Data saved to {} and available as df".format(location))

#### 2.1 Define the data we want

In [None]:
# List every distinct value of the `agent` column, in order of first appearance
unique_samples = load_data_df['agent'].drop_duplicates().tolist()
print(unique_samples)

In [None]:
# Keep only the rows whose agent is Olaparib
is_olaparib = load_data_df['agent'].eq('Olaparib')
selected_rows = load_data_df[is_olaparib]
print(selected_rows)

#### 2.2 Make our request

In [None]:
# Collect the distinct model names among the selected rows as a list
unique_samples = selected_rows['model_name'].unique().tolist()
#print(unique_samples)
#df, location = request_data(table="expression", genes = [], samples = unique_samples, data_set_name="Olaparib_expression")
#print("Data saved to {} and available as df".format(location))

# Load the saved expression CSV.
# NOTE: this rebinds `load_data_df`, which held the TGI table until now.
location = '../Data/Olaparib_expression.csv'
load_data_df = pd.read_csv(filepath_or_buffer=location)
print(load_data_df.head(n=5))

# Keep only the columns used by the analysis below
expression_columns = ['model_name','log.tpm','tumor_type','gene']
exp_df = load_data_df[expression_columns]
print(exp_df)

### 3. Analysis

In [None]:
# Build a matrix with one row per model and one column per gene; repeated
# (model, gene) entries are combined by taking the mean of log.tpm
pivot_df = pd.pivot_table(
    exp_df.reset_index(),
    index="model_name",
    columns="gene",
    values="log.tpm",
    aggfunc='mean',
)
print(pivot_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only a random subset of (at most) 40 genes so the example runs fast and
# the heatmap stays readable. The fixed seed makes the selection, and with it
# the figure, reproducible from run to run.
n_genes = min(40, pivot_df.shape[1])  # sample() raises if n exceeds the columns
small_pivot_df = pivot_df.sample(n=n_genes, axis='columns', random_state=42)

# Pairwise correlation between the sampled genes across models
corr = small_pivot_df.corr()
plt.figure(figsize = (16,16))
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_title('Gene-gene expression correlation (random gene subset)')
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Add Olaparib response (TGI) data and correlate it with gene expression

# Turn the model_name index into a regular column so it can act as join key
new_small_pivot_df = small_pivot_df.reset_index()
#print(new_small_pivot_df.shape)

# Mean TGI per model. Only the TGI column is averaged: the remaining columns
# include text (e.g. the agent name), which cannot be averaged and makes
# recent pandas versions raise.
TGI_data = selected_rows.groupby('model_name')[['TGI']].mean()
#print(TGI_data.shape)

# Kept for any later use; no longer used to build corr_data
tgi_values = TGI_data['TGI'].tolist()

# Attach each model's TGI by model name. The previous positional assignment
# (`tgi_values[:-1]`) dropped the last value to force the lengths to match and
# could pair TGI values with the wrong models. The inner join keeps only the
# models that have both expression and TGI data.
corr_data = new_small_pivot_df.merge(
    TGI_data.reset_index(), on='model_name', how='inner'
)
#print(corr_data)

# Correlation of every gene with TGI. The text column model_name is excluded
# explicitly, and the self-correlation of TGI is dropped by label.
results = corr_data.drop(columns='model_name').corr()['TGI'].drop('TGI')
print(results)

#plt.figure(figsize = (16,16))
##ax = sns.heatmap(
#    results, 
#    vmin=-1, vmax=1, center=0,
#    cmap=sns.diverging_palette(20, 220, n=200),
#    square=True
#)
#ax.set_xticklabels(
#    ax.get_xticklabels(),
#    rotation=45,
#    horizontalalignment='right'
#);
