In [1]:
import polars as pl
from polars import col

In [14]:
in_fn = "rna-genes.t2t.bo_and_trac51bp-seq.long_name_position.MT_fix.tsv"

df = pl.read_csv(in_fn, separator="\t", comment_char="#", columns=range(0,10), infer_schema_length=10_000)
#df = df.drop(['Chr', 'Start','End',  'Strand', 'Length'])


In [17]:
df.columns

['Geneid', 'D1_BH4', 'D1_NT', 'NoD1_BH4', 'NoD1_NT']

In [20]:
df = df.with_columns(col("Geneid").str.split("__").list.first().alias("RNA_class"))

In [25]:
rna_df = df.groupby("RNA_class", maintain_order=True).count().rename({"count":"total"})
rna_df

RNA_class,total
str,u32
"""snRNA""",1902
"""miRNA""",2046
"""misc_RNA""",2231
"""snoRNA""",948
"""scaRNA""",48
"""rRNA_pseudogen…",514
"""rRNA""",1007
"""Mt_rRNA""",3
"""sRNA""",5
"""scRNA""",2


In [22]:
rna_to_use = ["snRNA", "miRNA", "misc_RNA", "snoRNA", "scaRNA", "rRNA_pseudogene", "rRNA"]

In [32]:
for col_name in ['D1_BH4', 'D1_NT', 'NoD1_BH4', 'NoD1_NT']:
    tmp_df = df.filter(col(col_name) > 0).groupby("RNA_class", maintain_order=True).count()
    tmp_df = tmp_df.rename({"count" : col_name})
    print(tmp_df)
    #.rename({"count", f"{col_name}_counts"})
    rna_df = rna_df.join(tmp_df, on="RNA_class")

rna_df

shape: (10, 2)
┌───────────┬────────┐
│ RNA_class ┆ D1_BH4 │
│ ---       ┆ ---    │
│ str       ┆ u32    │
╞═══════════╪════════╡
│ miRNA     ┆ 245    │
│ snRNA     ┆ 399    │
│ scaRNA    ┆ 21     │
│ misc_RNA  ┆ 583    │
│ …         ┆ …      │
│ rRNA      ┆ 23     │
│ Mt_rRNA   ┆ 3      │
│ sRNA      ┆ 1      │
│ vault_RNA ┆ 1      │
└───────────┴────────┘
shape: (10, 2)
┌───────────┬───────┐
│ RNA_class ┆ D1_NT │
│ ---       ┆ ---   │
│ str       ┆ u32   │
╞═══════════╪═══════╡
│ miRNA     ┆ 138   │
│ snRNA     ┆ 333   │
│ misc_RNA  ┆ 433   │
│ scaRNA    ┆ 17    │
│ …         ┆ …     │
│ rRNA      ┆ 19    │
│ Mt_rRNA   ┆ 3     │
│ sRNA      ┆ 1     │
│ vault_RNA ┆ 1     │
└───────────┴───────┘
shape: (9, 2)
┌─────────────────┬──────────┐
│ RNA_class       ┆ NoD1_BH4 │
│ ---             ┆ ---      │
│ str             ┆ u32      │
╞═════════════════╪══════════╡
│ miRNA           ┆ 214      │
│ snRNA           ┆ 309      │
│ misc_RNA        ┆ 499      │
│ scaRNA          ┆ 21       │
│ 

RNA_class,total,D1_BH4,D1_NT,NoD1_BH4,NoD1_NT
str,u32,u32,u32,u32,u32
"""miRNA""",2046,245,138,214,262
"""misc_RNA""",2231,583,433,499,561
"""snRNA""",1902,399,333,309,419
"""scaRNA""",48,21,17,21,19
"""snoRNA""",948,339,309,319,356
"""rRNA_pseudogen…",514,259,238,218,248
"""rRNA""",1007,23,19,20,21
"""Mt_rRNA""",3,3,3,3,3
"""sRNA""",5,1,1,2,1


In [33]:
rna_df.write_csv("rna_types_counts.tsv", separator="\t")

In [35]:
ratios_df = df.with_columns([(col("D1_NT") + col("NoD1_NT")).alias("NT_sum"),
                 (col("D1_BH4") + col("NoD1_BH4")).alias("treated_sum")])

In [40]:
ratios_df.filter(col("NT_sum") > 10 ).filter(col("treated_sum") > 10).with_columns((col("treated_sum") / col("NT_sum")).alias("ratio")).sort("ratio").write_csv("rna_genes_ratios.tsv", separator="\t")