In [6]:
import pandas as pd

# Read the tab-separated counts table; lines starting with '#' are skipped.
counts_df = pd.read_csv("SL4_counts.txt", sep='\t', comment='#')

# Map every column whose label mentions 'alignment_results' to a short name:
# the last '/'-separated component of the label, cut at its first '_'.
# Labels without that marker are absent from the mapping and stay unchanged.
short_names = {
    col: col.split('/')[-1].split('_')[0]
    for col in counts_df.columns
    if 'alignment_results' in col
}
counts_df = counts_df.rename(columns=short_names)

# Keep a copy of the renamed table on disk, then preview the first rows.
counts_df.to_csv("SL4_counts_renamed.txt", sep='\t', index=False)
counts_df.head()




Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,ERR14096487,ERR14096488,ERR14096489,SRR19735893,...,SRR26436269,SRR26436270,SRR26436271,SRR26436275,SRR26436276,SRR26436277,SRR32302926,SRR32302927,SRR32302928,SRR32302929
0,gene:Solyc00g500001.1,SL4.0ch00,93750,94430,+,681,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,gene:Solyc00g500002.1,SL4.0ch00,305442,306257,-,816,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,gene:Solyc00g500003.1,SL4.0ch00;SL4.0ch00;SL4.0ch00;SL4.0ch00;SL4.0c...,311496;330270;344080;347298;351799;381867,311570;330628;344133;347428;352644;382066,-;-;-;-;-;-,1665,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,gene:Solyc00g500004.1,SL4.0ch00;SL4.0ch00,417592;417794,417723;418482,+;+,821,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,gene:Solyc00g500005.1,SL4.0ch00,478389,478640,+,252,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Se van a normalizar los datos con el método de TPM (Transcripts Per Million). La normalización por TPM es una técnica que transforma los conteos brutos de expresión génica para hacerlos comparables dentro de cada muestra. Primero, los conteos se dividen entre la longitud del gen (en kilobases), lo que corrige el sesgo introducido por el hecho de que los genes más largos tienden a generar más lecturas. Luego, estos valores normalizados por longitud se escalan dividiendo entre la suma total de todos los genes normalizados en la muestra y multiplicando por un millón. Así, el total de TPM en cada muestra es siempre 1 millón, lo que permite comparar la abundancia relativa de diferentes genes dentro de la misma muestra de forma precisa, aunque no es adecuado para comparar entre muestras sin más ajustes.

In [10]:
# TPM normalisation of the per-sample columns of `counts_df`.
#
# The values are overwritten in place, so the arithmetic is guarded to run only
# once per loaded table. Applying the length/depth scaling a second time to
# already-normalised values would still give columns that sum to one million,
# but the per-gene values would be wrong and the sum check could not detect it.
count_cols = counts_df.columns[6:]  # every column after the first six is treated as a sample
assert "Length" not in count_cols, "Length must not be among the count columns"

if counts_df.attrs.get("normalization") != "TPM":
    # Divide each gene's counts (row-wise) by its length expressed in kilobases.
    counts_per_kb = counts_df[count_cols].div(counts_df["Length"] / 1000, axis=0)
    # Per-sample total of the length-normalised values, in millions.
    scaling_factors = counts_per_kb.sum(axis=0) / 1e6
    # Rescale each sample (column-wise) so that its values sum to one million.
    tpm = counts_per_kb.div(scaling_factors, axis=1)
    counts_df[count_cols] = tpm
    # Flag the table as normalised so that re-running this cell changes nothing.
    counts_df.attrs["normalization"] = "TPM"


In [11]:
# Preview the first five rows of the table after normalisation.
counts_df.head(n=5)


Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,ERR14096487,ERR14096488,ERR14096489,SRR19735893,...,SRR26436269,SRR26436270,SRR26436271,SRR26436275,SRR26436276,SRR26436277,SRR32302926,SRR32302927,SRR32302928,SRR32302929
0,gene:Solyc00g500001.1,SL4.0ch00,93750,94430,+,681,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,gene:Solyc00g500002.1,SL4.0ch00,305442,306257,-,816,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,gene:Solyc00g500003.1,SL4.0ch00;SL4.0ch00;SL4.0ch00;SL4.0ch00;SL4.0c...,311496;330270;344080;347298;351799;381867,311570;330628;344133;347428;352644;382066,-;-;-;-;-;-,1665,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,gene:Solyc00g500004.1,SL4.0ch00;SL4.0ch00,417592;417794,417723;418482,+;+,821,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,gene:Solyc00g500005.1,SL4.0ch00,478389,478640,+,252,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Vamos a comprobar que la normalización ha sido correcta: la suma de los valores de cada muestra debe ser un millón.


In [12]:
# Column-wise total of every sample column.
counts_df.loc[:, count_cols].sum(axis=0)


ERR14096487    1000000.0
ERR14096488    1000000.0
ERR14096489    1000000.0
SRR19735893    1000000.0
SRR19735894    1000000.0
SRR19735897    1000000.0
SRR19735898    1000000.0
SRR2002284     1000000.0
SRR2011289     1000000.0
SRR21847265    1000000.0
SRR21847266    1000000.0
SRR21847267    1000000.0
SRR26436257    1000000.0
SRR26436258    1000000.0
SRR26436259    1000000.0
SRR26436263    1000000.0
SRR26436264    1000000.0
SRR26436265    1000000.0
SRR26436269    1000000.0
SRR26436270    1000000.0
SRR26436271    1000000.0
SRR26436275    1000000.0
SRR26436276    1000000.0
SRR26436277    1000000.0
SRR32302926    1000000.0
SRR32302927    1000000.0
SRR32302928    1000000.0
SRR32302929    1000000.0
dtype: float64

Todas las columnas suman aproximadamente 1M, por lo que podemos decir que la normalización ha sido correcta. Guardamos el dataframe normalizado en el archivo `SL4_counts_scaled.txt`.

In [14]:
# Write the table as tab-separated text, without the row index.
counts_df.to_csv(
    path_or_buf="SL4_counts_scaled.txt",
    sep='\t',
    index=False,
)