## Data import

Please see Preprocessing > PreprocessingMain.ipynb for more information regarding the data import.

In [70]:
import pandas as pd
from Preprocessing.Funcs.ReorderUROMOL import UROMOLreorder

# Read the gene-level and TE-level transcriptomic expression tables; the CSV
# column named 'Unnamed: 0' holds the row labels and becomes the index.
Gene = pd.read_csv('~/Desktop/UROMOL-TE/UROMOL-Gene.csv', index_col='Unnamed: 0')
TE = pd.read_csv('~/Desktop/UROMOL-TE/UROMOL-TE.csv', index_col='Unnamed: 0')

# Numeric UROMOL IDs: the integer that follows the first 'U' in each TE column name.
# NOTE(review): the same ID list is also passed for Gene, which presumably has the
# same sample columns in the same order as TE — confirm against UROMOLreorder.
IDs = [int(sample.split('U')[1]) for sample in TE.columns.tolist()]

# Order both dataframes by UROMOL ID.
TE = UROMOLreorder(IDs, TE)
Gene = UROMOLreorder(IDs, Gene)

# Simple repeat elements are identified by a '(' in the row label; collect those
# labels (they are needed again when the class table is loaded) and drop them.
nonTE = [element for element in TE.index.values.tolist() if '(' in element]
TE = TE.drop(index=nonTE)

# Discard every TE / gene whose expression is zero in all columns.
te_all_zero = (TE == 0).all(axis=1)
gene_all_zero = (Gene == 0).all(axis=1)
TE = TE.loc[~te_all_zero]
Gene = Gene.loc[~gene_all_zero]

## Sum TE expression

In order to perform GSEA analysis on TE expression, we need to sum TE expression by class. There are 5 classes: LINE,
SINE, LTR, DNA, and Retroposon.

In [71]:
# Read the class lookup table (only CSV columns 0 and 3 are loaded), remove the
# simple-repeat rows that were dropped from TE, and attach the class information
# to the TE expression table column-wise.
RepName = pd.read_csv('~/Desktop/UROMOL-TE/TE-RepName.csv', index_col= 'Unnamed: 0', usecols=[0, 3])
RepName = RepName.drop(index=nonTE)
TEclass = pd.concat([TE, RepName], axis=1)


def _class_total(te_class):
    """Return the column-wise sum of all TEclass rows whose 'repClass' equals te_class.

    The 'repClass' column itself is removed before summing, so the result is a
    1D vector with one summed expression value per remaining column.
    """
    members = TEclass.loc[TEclass['repClass'] == te_class]
    return members.drop(columns='repClass').sum(axis=0)


# One summed-expression vector per TE class.
LINE = _class_total('LINE')
SINE = _class_total('SINE')
LTR = _class_total('LTR')
DNA = _class_total('DNA')
Retroposon = _class_total('Retroposon')

In [72]:
from scipy.stats import pearsonr
from tqdm import tqdm

# Column label prefix -> summed expression vector for that TE class.
# Insertion order fixes the column order of the resulting table.
te_vectors = {
    'LINE': LINE,
    'SINE': SINE,
    'LTR': LTR,
    'DNA': DNA,
    'RETRO': Retroposon,
}

# 'Gene' followed by a '<class>-corr' / '<class>-pval' pair for every TE class.
titleCol = ['Gene'] + [f'{label}-{stat}' for label in te_vectors for stat in ('corr', 'pval')]

Correlation = []

for gene in tqdm(Gene.index.values.tolist()):
    # Expression of this gene across all columns of Gene.
    geneComparitor = Gene.loc[gene, :]

    # Row layout matches titleCol: gene label, then (corr, pval) per TE class.
    out = [gene]
    for vector in te_vectors.values():
        corr, pval = pearsonr(geneComparitor, vector)
        out += [corr, pval]

    Correlation.append(out)

# Build the table once from the collected rows and index it by gene.
Corrdf = pd.DataFrame(Correlation, columns=titleCol).set_index('Gene')

100%|██████████| 55588/55588 [00:14<00:00, 3940.50it/s]


In [80]:
import pyensembl
import gseapy
# Gene annotation from Ensembl release 77, used to look up gene names.
data = pyensembl.EnsemblRelease(77)

# Map every Corrdf index label to a gene name. A label that contains a comma is
# looked up by the part before the first comma; any other label is looked up as-is
# (splitting a comma-free label returns the label unchanged).
# NOTE(review): the previous comment here said that 2 Ensembl IDs without a gene
# name are removed, but no rows are dropped in this cell — confirm the intent.
# NOTE(review): Corrdf is overwritten below, so re-running this cell would pass the
# already-renamed labels to gene_by_id — presumably that lookup fails; verify.
rename = {}

for item in Corrdf.index.values.tolist():
    ensembl_id = item.split(',')[0]
    rename[item] = data.gene_by_id(gene_id=ensembl_id).gene_name

Corrdf = Corrdf.rename(index=rename)

In [89]:
def GeneratePreRank(df, ElementType, pval_threshold=0.05):
    """Build a ranked, single-column correlation table for one element type.

    Evaluates the output of the pearsonr analysis and ranks the gene set for
    GSEA analysis.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns '<ElementType>-corr' and
        '<ElementType>-pval'. It is not modified.
    ElementType : str
        Column prefix selecting which pair of columns to use (e.g. 'LINE').
    pval_threshold : float, optional
        Rows are kept only when their p-value is strictly below this value.
        Defaults to 0.05, the cutoff that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A one-column frame ('<ElementType>-corr') holding the rows that passed
        the p-value filter, sorted by correlation in descending order.

    Raises
    ------
    KeyError
        If either expected column is missing from df.
    """
    corr_col = f'{ElementType}-corr'
    pval_col = f'{ElementType}-pval'

    # Rows whose p-value is NaN compare as False and are therefore excluded.
    significant = df[pval_col] < pval_threshold

    # Selecting with a one-element list keeps the result a DataFrame.
    ranked = df.loc[significant, [corr_col]]
    return ranked.sort_values(corr_col, ascending=False)

# Ranked correlation table for each TE class, in the order
# LINE, SINE, LTR, DNA, RETRO (the column prefixes used in Corrdf).
linecorr, sinecorr, ltrcorr, dnacorr, retrocorr = (
    GeneratePreRank(Corrdf, label)
    for label in ('LINE', 'SINE', 'LTR', 'DNA', 'RETRO')
)

               RETRO-corr
Gene                     
YBX1P4           0.972023
RP11-638I8.1     0.962612
TNRC18           0.959586
GIGYF1           0.950677
CLASRP           0.950032
...                   ...
CTB-79E8.3      -0.091106
RPL41           -0.095600
RP11-572P18.1   -0.097974
RP11-20G6.3     -0.110345
RPL7P1          -0.118278

[43275 rows x 1 columns]


In [32]:
import gseapy

