In [None]:
#!/usr/bin/env python3
#Author: Deepali L. Kundnani
#Institution: Georgia Institute of Technology (Georgia Tech)

Import the required packages and specify the input counts file to be processed.

In [2]:
import pandas as pd
import argparse


# Input configuration.
# The notebook reads a fixed counts file (described in the original help text
# as the output of featureCounts). The argparse lines below are the
# command-line equivalent and are intentionally left disabled.
#parser = argparse.ArgumentParser(description='Getting TPM values from counts')
#parser.add_argument('--filename', default='exons_pc.counts', help="counts file, output from feature counts")
counts='exons_pc.counts'
# NOTE(review): the --referencefasta option below is not used by any cell visible
# in this notebook and its default is an absolute, user-specific path - confirm
# whether it can be dropped.
#parser.add_argument("-r", "--referencefasta" ,required=True, default='/storage/home/hcoda1/5/dkundnani3/p-fstorici3-0/rich_project_bio-storici/reference/sacCer2/sacCer2.fa', help="Specify the fasta file for reference of the sequenced Libraries. Make sure you have Ribosemap environment activated or have bedtools available to be used") 
#args= parser.parse_args()



Functions required to convert raw counts to TPM (transcripts per million)

In [3]:
import pandas as pd

def filterchr(df, col, pattern='chrX|chrY|chrM'):
    """
    Drop rows that lie on unwanted chromosomes (by default chrX, chrY and chrM).

    Parameters:
    df (pd.DataFrame): A DataFrame containing the necessary details. Rows represent genomic regions.
    col (int): Positional index of the column containing chromosome names.
    pattern (str): Regular expression; every row whose chromosome entry contains a match
        is removed. The default reproduces the previous hard-coded filter.

    Returns:
    pd.DataFrame: The rows of df whose chromosome entry does not match pattern.
        Original index labels are retained (the index is not reset).
    """
    # Coerce to str so that non-string chromosome labels do not break the
    # .str accessor.
    chrom = df.iloc[:, col].astype(str)

    # na=False: a missing chromosome name counts as "no match" and the row is
    # kept. Without it the mask contains NaN, is no longer boolean, and the
    # negation below fails.
    excluded = chrom.str.contains(pattern, na=False)

    return df[~excluded]

def convert_counts_to_tpm(df, gene_lengths):
    """
    Convert raw RNA-seq counts to TPM.

    Parameters:
    df (pd.DataFrame): A DataFrame containing RNA-seq counts. Rows represent genes, columns represent samples.
    gene_lengths: Gene lengths in base pairs, one per row of df. Accepted forms:
        - pd.Series: aligned to df by index label (the index should match df's index);
        - single-column pd.DataFrame: treated like the Series of its only column;
        - list / 1-D array / (n, 1) array: taken positionally, row i of df is
          divided by element i (n must equal the number of rows of df).
        Any other input is handed to DataFrame.div unchanged.

    Returns:
    pd.DataFrame: A DataFrame with TPM values, same index and columns as df.

    Raises:
    ValueError: If a positional (list / array) gene_lengths does not have one entry per row of df.

    Note: lengths are not validated; a length of zero produces non-finite values
    in the affected sample columns.
    """
    # Local import: numpy is not imported at notebook level.
    import numpy as np

    # Step 0: Convert counts to float before any calculations
    counts = df.astype(float)

    # Step 1: Bring gene lengths into a form that divides row-wise, in kilobases.
    # A one-column frame (e.g. counts_df[['Length']]) would otherwise be aligned
    # on column labels and yield an all-NaN result.
    if isinstance(gene_lengths, pd.DataFrame) and gene_lengths.shape[1] == 1:
        gene_lengths = gene_lengths.iloc[:, 0]

    if isinstance(gene_lengths, (pd.Series, pd.DataFrame)):
        # Label-based alignment is left to pandas.
        gene_lengths_kb = gene_lengths.astype(float) / 1000
    else:
        lengths = np.asarray(gene_lengths, dtype=float)
        if lengths.ndim == 2 and lengths.shape[1] == 1:
            # Column vector such as counts_df[['Length']].values
            lengths = lengths[:, 0]
        if lengths.ndim == 1:
            if lengths.shape[0] != counts.shape[0]:
                raise ValueError(
                    "gene_lengths has %d entries but df has %d rows"
                    % (lengths.shape[0], counts.shape[0])
                )
            # Attach df's own index so the division is strictly row-by-row.
            gene_lengths_kb = pd.Series(lengths / 1000, index=counts.index)
        else:
            gene_lengths_kb = lengths / 1000

    # Step 2: Compute Reads Per Kilobase (RPK)
    rpk = counts.div(gene_lengths_kb, axis=0)

    # Step 3: Compute scaling factor (sum of RPKs per sample)
    scaling_factors = rpk.sum(axis=0)

    # Step 4: Calculate TPM
    tpm = rpk.div(scaling_factors, axis=1) * 1e6

    return tpm

In [183]:
# Load the counts table (tab-separated); the first line of the file is skipped
counts_df = pd.read_csv(counts, sep='\t', skiprows=1)

# Every column from position 6 onwards is treated as a sample column
samples = counts_df.columns[6:].values

# Remove rows whose chromosome column (position 1) matches chrX, chrY or chrM
counts_df = filterchr(counts_df, 1)

# Counts -> TPM, using the 'Length' column as gene length.
# The Series shares counts_df's index, so rows line up by label.
tpm_df = convert_counts_to_tpm(counts_df[samples], counts_df['Length'])

# Put the first six (annotation) columns back next to the TPM values
merged_df = pd.concat([counts_df.iloc[:, :6], tpm_df], axis=1)

# Write the TPM table so it can be merged with other files later
merged_df.to_csv('HEK_tpm.csv', index=False)


In [196]:
# Taking averages for each cell type
#
# A replicate column is any column whose name contains the group label.
# The column named exactly like the label is the average itself (created
# below) and is excluded, so re-running this cell does not feed a previously
# computed average back into its own calculation.
for group in ('HEK293T-KO-T3-17', 'HEK293T-KO-T3-8', 'HEK293T-WT'):
    replicate_cols = [c for c in merged_df.columns if group in str(c) and c != group]
    merged_df[group] = merged_df[replicate_cols].mean(axis=1)

# Saving average TPM file
merged_df[['Geneid','HEK293T-WT','HEK293T-KO-T3-8','HEK293T-KO-T3-17']].to_csv('HEK_tpm_avg.csv', index=False)


In [21]:
# AGS counts analysis
# NOTE(review): this cell reuses the names counts_df / samples / tpm_df /
# merged_df from the HEK analysis and overwrites them.
counts_df = pd.read_csv('AGS_rnaseq_counts.tsv', sep='\t')

# The first seven columns are treated as sample count columns
samples = counts_df.columns[:7].values

# Counts -> TPM, using the 'Length' column as gene length (aligned by index)
tpm_df = convert_counts_to_tpm(counts_df[samples], counts_df['Length'])

# Keep the remaining columns (position 7 onwards) alongside the TPM values
merged_df = pd.concat([counts_df.iloc[:, 7:], tpm_df], axis=1)
merged_df.to_csv('AGS_tpm.csv', index=False)
