## Process GTEx RNA-Seq data

Data from https://www.gtexportal.org/home/

NOTE: Large files, takes some time (~20-30 mins) to run.

In [2]:
import os
import pandas as pd
import re
import numpy as np
import scipy.stats as stats

def get_data_path(folders, fname):
    """Build the normalised path to `fname` below $3RD_PARTY_DIR/<folders...>."""
    root = os.environ['3RD_PARTY_DIR']
    return os.path.normpath(root + '/' + '/'.join(folders) + '/' + fname)


def get_local_data_path(folders, fname):
    """Build the normalised path to `fname` below ../../local_data/<folders...>."""
    return os.path.normpath('../../local_data/' + '/'.join(folders) + '/' + fname)


# Inputs (pandas reads the gzipped GCT file directly, no manual unzip needed)
file_gtex_expr = get_data_path(['GTEx'], 'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz')
file_paralog_pairs = get_local_data_path(['processed', 'ensembl93'], 'all_pairs.csv')

# Output
file_gtex_expr_features = get_local_data_path(['processed', 'paralog_features'], 'gtex_expr_features.csv')

### Extract expression for genes that are in a paralog pair
Only needs to run once, filtered data is saved

In [3]:
# Load the paralog pairs; only the two Ensembl ID columns are needed here
all_pairs = pd.read_csv(file_paralog_pairs)[['A1_ensembl', 'A2_ensembl']]
print('N pairs:', all_pairs.shape[0])

# Collect the genes from BOTH columns: a gene that only ever appears as A2
# would otherwise be filtered out of the expression matrix and end up with
# missing features. When every pair is listed in both orientations this
# yields the same set of genes as looking at A1 alone.
paralog_genes = pd.unique(all_pairs[['A1_ensembl', 'A2_ensembl']].values.ravel())
print('N genes:', len(paralog_genes))
all_pairs.head(1)

N pairs: 73296
N genes: 13320


Unnamed: 0,A1_ensembl,A2_ensembl
0,ENSG00000092850,ENSG00000163060


In [4]:
# Skip the first 2 rows of metadata in the gct file.
# Read the file 1000 rows at a time and only keep expression data for paralogs.
# The filtered chunks are collected in a list and concatenated once at the end:
# concatenating onto a growing frame inside the loop re-copies all rows gathered
# so far on every iteration.
paralog_chunks = []
for chunk in pd.read_csv(file_gtex_expr, sep='\t', skiprows=2, chunksize=1000):
    # 'Name' holds versioned IDs (e.g. ENSG00000186092.4); keep the part before the first '.'
    ensembl_ids = chunk['Name'].str.split('.').str[0]
    is_paralog = ensembl_ids.isin(paralog_genes)
    paralog_chunks.append(
        chunk.loc[is_paralog]
             .drop(columns=['Description'])
             .assign(ensembl_id=ensembl_ids[is_paralog])
             .set_index('ensembl_id')
    )

# An input with no data rows produces no chunks; fall back to an empty frame
df = pd.concat(paralog_chunks) if paralog_chunks else pd.DataFrame()

In [6]:
# The versioned 'Name' column is redundant now that ensembl_id is the index.
# errors='ignore' keeps this cell re-runnable: `df` is overwritten in place, so
# a second execution would otherwise raise KeyError for the missing column.
df = df.drop(columns=['Name'], errors='ignore')

### Clean up expression data

In [8]:
# Rows are genes, columns are samples
n_genes, n_samples = df.shape
print('N genes:', n_genes)
print('N samples:', n_samples)
df.head(1)

N genes: 13299
N samples: 17382


Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000186092,0.0,0.1025,0.07434,0.0,0.04233,0.05609,0.1743,0.0,0.0,0.0591,...,0.129,0.1367,0.05729,0.1269,0.1198,0.06767,0.1135,0.0,0.148,0.0


In [9]:
# A few genes appear on more than one row; one of the copies seems to be all
# zeros, so only the copy with the larger values is kept.
# Possibly the same gene annotated on both the X and the Y chromosome?
is_repeat = df.index.duplicated()
duplicate_indices = df.index[is_repeat]
display(duplicate_indices)

Index(['ENSG00000182378', 'ENSG00000167393', 'ENSG00000185960',
       'ENSG00000198223', 'ENSG00000185291', 'ENSG00000169100',
       'ENSG00000169093', 'ENSG00000182162', 'ENSG00000196433',
       'ENSG00000169084', 'ENSG00000168939'],
      dtype='object', name='ensembl_id')

In [10]:
# Show every row stored for one of the duplicated genes to compare the copies
example_duplicate_id = 'ENSG00000182378'
df.loc[example_duplicate_id]

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000182378,6.936,0.7066,1.764,3.83,1.889,6.063,8.98,11.41,8.254,11.9,...,1.889,2.098,8.393,8.124,2.48,16.34,12.83,0.9995,0.5816,10.58
ENSG00000182378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Drop duplicate rows where expression is always 0
is_all_zero = df.eq(0).all(axis=1)
is_duplicated_gene = df.index.isin(duplicate_indices)
expr_data = df[~(is_all_zero & is_duplicated_gene)]

# The filter above only removes all-zero copies, so it does not by itself
# guarantee one row per gene. Later cells look genes up with expr_data.loc[gene]
# and expect a single row back, so fail loudly here if duplicates remain.
assert expr_data.index.is_unique, 'expr_data still contains duplicated genes after dropping all-zero copies'

In [13]:
# Remove the unfiltered frame `df` from the namespace without prompting (-f);
# the cleaned data lives on in `expr_data`.
# NOTE(review): %reset_selective interprets its argument as a regex, so this
# presumably also clears any other name containing "df" (e.g. chunk_df) - confirm that is intended.
%reset_selective -f "df"

In [14]:
# Number of genes left after removing the all-zero duplicate rows
print('N genes:', len(expr_data))
expr_data.head(1)

N genes: 13288


Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000186092,0.0,0.1025,0.07434,0.0,0.04233,0.05609,0.1743,0.0,0.0,0.0591,...,0.129,0.1367,0.05729,0.1269,0.1198,0.06767,0.1135,0.0,0.148,0.0


### Calculate expression features of paralog pairs
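Mean expression of each gene in the pair (A1 and A2, from which min/max mean can be derived), plus the Spearman and Pearson correlation of the two genes' expression across samples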

In [15]:
# Mean expression of every gene across all samples, as a two-column table
# (ensembl_id, mean_expr) ready to be merged onto the pairs
mean_expr = expr_data.mean(axis=1).to_frame('mean_expr').reset_index()
mean_expr.head(1)

Unnamed: 0,ensembl_id,mean_expr
0,ENSG00000186092,0.084124


In [16]:
# Merge the per-gene mean onto both members of every pair.
# The join keys are spelled out, and validate='many_to_one' makes pandas raise
# if a gene occurs more than once in mean_expr - a duplicated gene row would
# otherwise silently multiply the affected pairs.
a1_means = mean_expr.rename(columns={'ensembl_id': 'A1_ensembl', 'mean_expr': 'A1_mean_expr'})
a2_means = mean_expr.rename(columns={'ensembl_id': 'A2_ensembl', 'mean_expr': 'A2_mean_expr'})
pairs_expr = (
    all_pairs
    .merge(a1_means, how='left', on='A1_ensembl', validate='many_to_one')
    .merge(a2_means, how='left', on='A2_ensembl', validate='many_to_one')
)

# Left joins leave NaN where a gene has no expression data
has_both_means = pairs_expr.A1_mean_expr.notna() & pairs_expr.A2_mean_expr.notna()
print('Pairs where both genes have expr:',
      int(has_both_means.sum()), '/', pairs_expr.shape[0])
pairs_expr.head(1)

Pairs where both genes have expr: 72982 / 73296


Unnamed: 0,A1_ensembl,A2_ensembl,A1_mean_expr,A2_mean_expr
0,ENSG00000092850,ENSG00000163060,4.524118,1.181367


In [17]:
# Keep only the pairs in which both genes have a usable mean expression,
# i.e. the mean is neither missing nor exactly zero
a1_expressed = pairs_expr.A1_mean_expr.notna() & pairs_expr.A1_mean_expr.ne(0)
a2_expressed = pairs_expr.A2_mean_expr.notna() & pairs_expr.A2_mean_expr.ne(0)
pairs_expr_subset = pairs_expr[a1_expressed & a2_expressed]
print('N:', len(pairs_expr_subset))

N: 72282


In [18]:
def compute_spearman_corr(pair):
    """Return the Spearman correlation between the two genes of `pair`.

    `pair` must expose `A1_ensembl` and `A2_ensembl`; both IDs are looked up
    as row labels in the notebook-level `expr_data` frame, and the two rows
    are correlated with each other.
    """
    expr_a1 = expr_data.loc[pair.A1_ensembl]
    expr_a2 = expr_data.loc[pair.A2_ensembl]
    rho, _pvalue = stats.spearmanr(expr_a1, expr_a2)
    return rho

# One Spearman correlation per pair, computed row by row, then stored as a new column
spearman_by_pair = pairs_expr_subset.apply(compute_spearman_corr, axis=1)
features = pairs_expr_subset.assign(spearman_corr=spearman_by_pair)
features.head(1)

Unnamed: 0,A1_ensembl,A2_ensembl,A1_mean_expr,A2_mean_expr,spearman_corr
0,ENSG00000092850,ENSG00000163060,4.524118,1.181367,0.457234


In [19]:
def compute_pearson_corr(pair):
    """Return the Pearson correlation between the two genes of `pair`.

    `pair` must expose `A1_ensembl` and `A2_ensembl`; both IDs are looked up
    as row labels in the notebook-level `expr_data` frame, and the two rows
    are correlated with each other.
    """
    expr_a1 = expr_data.loc[pair.A1_ensembl]
    expr_a2 = expr_data.loc[pair.A2_ensembl]
    r, _pvalue = stats.pearsonr(expr_a1, expr_a2)
    return r

# Any NaN in the result comes from a pair in which one gene has all-0 expression
# (scipy reports this as PearsonRConstantInputWarning)
pearson_by_pair = features.apply(compute_pearson_corr, axis=1)
features2 = features.assign(pearson_corr=pearson_by_pair)
features2.head(1)

Unnamed: 0,A1_ensembl,A2_ensembl,A1_mean_expr,A2_mean_expr,spearman_corr,pearson_corr
0,ENSG00000092850,ENSG00000163060,4.524118,1.181367,0.457234,0.965852


In [25]:
# Bring all expression features together: every pair keeps its means, and the
# pairs that were filtered out before the correlation step get NaN correlations
corr_columns = features2[['A1_ensembl', 'A2_ensembl', 'spearman_corr', 'pearson_corr']]
all_expr_features = pairs_expr.merge(corr_columns, how='left', on=['A1_ensembl', 'A2_ensembl'])
# The left join must not add or lose any pair
assert len(all_expr_features) == len(pairs_expr)
all_expr_features.head(1)

Unnamed: 0,A1_ensembl,A2_ensembl,A1_mean_expr,A2_mean_expr,spearman_corr,pearson_corr
0,ENSG00000092850,ENSG00000163060,4.524118,1.181367,0.457234,0.965852


In [26]:
# Write the feature table without the positional row index
all_expr_features.to_csv(file_gtex_expr_features, index=False)