In [12]:
# Goal: take in normalized RNA-seq datasets and return, for each one, a dataset that
# contains only the genes that are most important for the tissues we are looking at.

import pandas as pd
import gzip
import requests

In [32]:
# Input files, given as paths relative to the notebook's working directory.
# The two expression tables are gzip-compressed TSVs (opened with gzip below);
# the two gene tables are plain tab-separated text files.
sequencing = "TCGA-COAD.htseq_fpkm.tsv.gz"  # expression table loaded into `df`
lung_sequencing = "TCGA-LUAD.htseq_fpkm.tsv.gz"  # expression table loaded into `lung`
colon_enhanced_genes = "tissue_specificity_rna_colon_Group.tab"  # gene table with a 'Gene' column
lung_enhanced_genes = "tissue_category_rna_lung_Tissue.tsv"  # gene table with a 'Gene' column

In [33]:
# Read the lung expression matrix: the gzip archive is decompressed as text
# and parsed as a tab-separated table into the `lung` DataFrame.
with gzip.open(lung_sequencing, mode='rt') as lung_handle:
    lung = pd.read_table(lung_handle)

In [15]:
# Load the expression table named by `sequencing` plus the two gene tables,
# then derive the gene-symbol Series and plain lists that later cells use.

# Expression matrix: gzip-compressed, decompressed as text while parsing.
with gzip.open(sequencing, mode='rt') as expression_handle:
    df = pd.read_table(expression_handle)

# Colon gene table: a plain (uncompressed) tab-separated text file.
with open(colon_enhanced_genes, mode='rt') as colon_handle:
    colon_genes = pd.read_table(colon_handle)
# Keep the symbols both as a Series and as a Python list.
colon_gene_names = colon_genes['Gene']
colon_gene_list = colon_gene_names.tolist()

# Lung gene table: same layout, same treatment.
with open(lung_enhanced_genes, mode='rt') as lung_genes_handle:
    lung_genes = pd.read_table(lung_genes_handle)
lung_gene_names = lung_genes['Gene']
lung_gene_list = lung_gene_names.tolist()

In [26]:
# Function to convert gene names in enriched genes to ensembl
def convert_gene_names_to_ensembl(gene_names, timeout=30):
    """Map gene symbols to Ensembl IDs using the Ensembl REST API.

    One GET request per symbol is sent to the ``xrefs/symbol/homo_sapiens``
    endpoint. Lookups are best-effort: any symbol that cannot be resolved
    yields ``None`` instead of aborting the whole batch.

    Args:
        gene_names: Iterable of gene symbols (e.g. a list or pandas Series).
        timeout: Seconds to wait for each HTTP request before giving up on
            that symbol. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        A list with one entry per input symbol, in input order. Each entry is
        the matched ID string, or ``None`` when the request failed, returned
        a non-200 status, could not be decoded, or produced no hits.
    """
    # Ensembl REST API endpoint for mapping gene names to Ensembl IDs
    api_endpoint = "https://rest.ensembl.org/xrefs/symbol/homo_sapiens/"
    ensembl_ids = []
    for gene_name in gene_names:
        ensembl_id = None
        try:
            response = requests.get(
                f"{api_endpoint}{gene_name}?content-type=application/json",
                timeout=timeout,
            )
            if response.status_code == 200:
                data = response.json()
                if data:
                    # The endpoint can return several linked records for one
                    # symbol, so prefer a record typed "gene" whose ID has the
                    # ENSG prefix. Only fall back to the first record (the
                    # previous behaviour) when no such record exists.
                    gene_hits = [
                        entry["id"]
                        for entry in data
                        if entry.get("type") == "gene"
                        and str(entry.get("id", "")).startswith("ENSG")
                    ]
                    ensembl_id = gene_hits[0] if gene_hits else data[0]["id"]
        except (requests.RequestException, ValueError):
            # Network failure, timeout, or an undecodable body: treat it like
            # any other failed lookup so one bad symbol does not lose the rest.
            ensembl_id = None
        ensembl_ids.append(ensembl_id)
    return ensembl_ids
# Build the combined colon + lung symbol Series right here, so this cell does
# not depend on a variable that is only assigned further down the notebook
# (which raises NameError on a fresh kernel run from top to bottom).
gene_names = pd.concat([colon_gene_names, lung_gene_names], ignore_index=True)
# One Ensembl lookup per symbol; entries are None where the lookup failed.
ensembl_ids = convert_gene_names_to_ensembl(gene_names)

In [24]:
# Combine the colon and lung gene-symbol Series into one Series. This cell only
# concatenates; the conversion to Ensembl IDs is done by
# convert_gene_names_to_ensembl. ignore_index=True renumbers the result 0..n-1.
# Per-tissue conversion, kept for reference (currently disabled):
# colon_ensembl_ids = convert_gene_names_to_ensembl(colon_gene_names)
# lung_ensembl_ids = convert_gene_names_to_ensembl(lung_gene_names)
gene_names = pd.concat([colon_gene_names, lung_gene_names], ignore_index=True)
# Display the combined Series as the cell output.
gene_names

0      ADH1C
1        AMN
2       AQP8
3      ARL14
4      ATOH1
       ...  
298    TRPC6
299    VEGFD
300    VEPH1
301    VIPR1
302    VSIG4
Name: Gene, Length: 303, dtype: object

In [27]:
#TM function that returns the dataframe 
def TM(df, ensembl_ids):
    """Return the rows of ``df`` whose gene appears in ``ensembl_ids``.

    The ``Ensembl_ID`` column is expected to hold IDs with a version suffix
    (``ENSG00000166391.13``). Everything from the first ``.`` onward is
    stripped and the result is stored in a ``genes`` column, which is then
    matched against ``ensembl_ids``.

    The input frame is left untouched: the ``genes`` column is added to a
    copy, so re-running a cell never sees a half-modified table.

    Args:
        df: DataFrame with an ``Ensembl_ID`` column of strings.
        ensembl_ids: Iterable of unversioned IDs to keep. ``None``/NaN
            placeholders (failed lookups) are ignored.

    Returns:
        A new DataFrame with the original columns plus ``genes``, restricted
        to the matching rows and keeping their original index labels.
    """
    # Strip the version suffix: "ENSG00000166391.13" -> "ENSG00000166391".
    version_free_ids = df["Ensembl_ID"].str.split(".", n=1).str[0]
    # Drop missing placeholders so they can never match a row.
    wanted_ids = [gene_id for gene_id in ensembl_ids if pd.notna(gene_id)]
    annotated = df.assign(genes=version_free_ids)
    return annotated[annotated["genes"].isin(wanted_ids)]

In [29]:
# Keep only the rows of the colon expression table whose gene is in ensembl_ids.
# NOTE: `df` is rebound to the filtered result, so the full table is no longer
# reachable under this name after the cell runs.
df = TM(df, ensembl_ids)
# Display the filtered table as the cell output.
df

Unnamed: 0,Ensembl_ID,TCGA-DM-A288-01A,TCGA-QL-A97D-01A,TCGA-CM-6164-01A,TCGA-G4-6299-01A,TCGA-AA-3511-11A,TCGA-F4-6463-01A,TCGA-AZ-4615-01A,TCGA-AA-3549-01A,TCGA-AY-4071-01A,...,TCGA-G4-6588-01A,TCGA-A6-2680-01A,TCGA-QG-A5YV-01A,TCGA-AA-3561-01A,TCGA-AA-3525-01A,TCGA-AA-3531-01A,TCGA-AA-3973-01A,TCGA-AA-3527-01A,TCGA-AA-3555-01A,genes
33,ENSG00000166391.13,0.928559,3.371069,1.894823,0.308125,5.001098,2.850026,1.232057,1.770905,4.371192,...,0.992612,3.805808,2.578133,1.854186,2.554196,2.456132,0.636021,2.753313,1.844561,ENSG00000166391
676,ENSG00000132031.11,0.104972,0.225836,0.928804,1.883706,0.185614,1.080540,1.266185,0.192798,0.859388,...,0.139693,0.479996,0.324114,0.577108,0.428708,0.464904,1.954739,0.638530,0.759541,ENSG00000132031
924,ENSG00000112818.8,5.636364,6.147465,5.646107,0.157510,7.371349,3.952473,0.821866,4.144076,6.753656,...,2.194073,5.822070,5.765638,7.109456,5.353628,6.741643,6.404397,4.745519,2.806230,ENSG00000112818
938,ENSG00000016602.9,0.649766,0.146173,0.554440,0.019058,7.083650,0.077079,0.124316,2.682350,2.082553,...,1.003282,4.019835,0.199254,0.450376,4.573389,0.875992,0.021172,0.018510,0.291441,ENSG00000016602
1270,ENSG00000160867.13,4.191618,4.276965,4.175837,4.166122,3.516989,5.004253,3.571448,4.963783,4.445346,...,4.259562,5.499977,4.495079,5.278741,4.826512,6.777268,5.278763,5.308861,4.883400,ENSG00000160867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59046,ENSG00000168878.15,0.009149,0.023991,0.006286,0.031826,0.007494,0.104061,0.104973,0.030963,0.015827,...,0.017019,0.015913,0.006876,0.000000,0.032562,0.039099,0.052689,0.046127,0.000000,ENSG00000168878
59062,ENSG00000070985.12,1.270408,1.091283,0.154773,0.958196,1.347751,0.642491,0.281321,0.182724,0.048695,...,0.266766,2.477086,1.222269,0.411595,0.206540,1.028693,0.523368,3.187651,0.303010,ENSG00000070985
59599,ENSG00000073861.2,0.014359,0.425177,0.278391,0.650663,0.600162,0.206373,0.812712,0.162703,0.391980,...,0.177091,0.049460,0.124512,0.072815,0.193536,0.147938,0.159588,0.095062,0.141851,ENSG00000073861
59890,ENSG00000070526.13,5.013639,3.379537,2.378663,4.471846,7.592188,5.090115,4.465763,3.515265,4.839243,...,5.853818,4.593923,4.482895,5.095441,4.952228,3.298873,2.235607,3.844083,6.357216,ENSG00000070526


In [34]:
# Keep only the rows of the lung expression table whose gene is in ensembl_ids.
# NOTE: `lung` is rebound to the filtered result, so the full table is no longer
# reachable under this name after the cell runs.
lung = TM(lung, ensembl_ids)
# Display the filtered table as the cell output.
lung

Unnamed: 0,Ensembl_ID,TCGA-MP-A4T4-01A,TCGA-05-4250-01A,TCGA-64-5774-01A,TCGA-97-7937-01A,TCGA-97-A4M6-01A,TCGA-44-A4SU-01A,TCGA-78-7155-01A,TCGA-78-7163-01A,TCGA-55-7574-01A,...,TCGA-86-7714-01A,TCGA-99-8025-01A,TCGA-55-6969-11A,TCGA-86-8076-01A,TCGA-86-8359-01A,TCGA-05-4398-01A,TCGA-86-8075-01A,TCGA-44-3396-01A,TCGA-50-5944-01A,genes
33,ENSG00000166391.13,0.000000,0.010538,0.016229,0.019396,0.104790,0.044079,0.000000,0.192850,0.086638,...,0.014708,0.000000,0.011991,0.091552,0.029313,0.003392,0.028364,0.018558,0.009255,ENSG00000166391
676,ENSG00000132031.11,1.125931,3.216603,1.004427,3.818638,3.974103,2.342071,1.775676,0.506655,1.440218,...,1.654985,2.065889,1.931345,0.802958,1.512382,1.483700,2.990048,1.522675,5.395443,ENSG00000132031
924,ENSG00000112818.8,0.097060,0.043409,0.033608,0.894130,0.133780,0.306844,0.091546,0.007725,0.213687,...,0.079880,0.064728,0.033066,0.453833,0.854198,0.069069,0.039247,0.124205,0.087557,ENSG00000112818
938,ENSG00000016602.9,0.611737,0.068208,0.019465,0.015547,0.000000,0.052767,0.000000,0.000000,0.089093,...,0.000000,0.049997,0.014386,0.055660,0.091834,0.000000,0.050692,0.008943,0.000000,ENSG00000016602
1270,ENSG00000160867.13,1.182227,2.175899,0.373773,0.128040,1.620848,0.881322,2.622321,0.493143,1.603407,...,0.949340,1.088417,3.243346,0.908748,0.738581,0.774133,1.450490,0.140239,2.021963,ENSG00000160867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59046,ENSG00000168878.15,9.244820,8.666906,11.863690,9.079101,8.857434,8.397702,0.250961,9.021170,12.106502,...,12.233467,10.350601,12.006209,11.680269,11.710530,7.163064,6.331294,8.901995,11.155341,ENSG00000168878
59062,ENSG00000070985.12,0.041354,0.000000,0.366861,0.053397,0.016130,0.140953,0.010120,0.017372,0.077819,...,0.000000,0.016503,0.012524,0.163209,0.020478,0.010604,0.068187,0.011660,0.028808,ENSG00000070985
59599,ENSG00000073861.2,4.141725,0.747640,0.193204,0.156137,1.453237,0.951107,0.435184,0.569019,1.344836,...,0.295315,0.239016,1.095260,1.259408,1.221893,1.793818,0.317615,0.706087,0.327399,ENSG00000073861
59890,ENSG00000070526.13,2.031211,1.521047,5.491814,5.207084,4.294497,5.333468,1.167122,5.428651,3.148090,...,4.922425,2.794951,3.283387,4.658627,4.848892,2.329532,5.063775,5.535020,5.089521,ENSG00000070526


In [30]:
# Write the filtered colon table to CSV; index=False leaves the row index out of the file.
# NOTE(review): absolute, machine-specific path - consider a configurable output directory.
csv_file_path = "/home/ani/ML_Project_2024/tm_function/colon_tm.csv"
df.to_csv(csv_file_path, index=False)

In [35]:
# Write the filtered lung table to CSV; index=False leaves the row index out of the file.
# NOTE(review): absolute, machine-specific path - consider a configurable output directory.
# This reuses (overwrites) the csv_file_path variable from the colon export.
csv_file_path = "/home/ani/ML_Project_2024/tm_function/lung_tm.csv"
lung.to_csv(csv_file_path, index=False)