In [2]:
import polars as pl
import os
import re
from rich import print
from rich.console import Console
from pathlib import Path
import csv
from itertools import islice

console = Console()


In [3]:
def get_df(file_path):
  '''Carga un archivo, detecta su separador y devuelve un dataframe.

  Args:
      file_path(str): Ruta del archivo.

  Returns:
      pl.DataFrame: DataFrame con los datos.
  '''

  try:

    file_path = Path(file_path).resolve()
    print(f"[yellow]:warning:[/] Se utilizará el siguiente archivo [bold green]{file_path.name}[/]")

    with open(file_path, 'r', newline='', encoding='utf-8') as f:

        sample = ''.join(islice(f, 5)) # Lee (solo) las 5 primeras lineas y las concatena en una cadena ('' hace que no se meta nada en el medio) (Problema csvfile.readlines(): leería todo el archivo y despues se cogerian las 5 primeras)

        delimiter = csv.Sniffer().sniff(sample).delimiter # La clase sniffer detecta el formato de un csv. Sniff es un metodo que devuelve el objeto Dialect. delimiter es un atributo de dialect que contiene el separador
        print(f"[yellow]:warning:[/] Se ha detectado el separador: [bold yellow]{repr(delimiter)}[/]")

        f.seek(0) # Para devolver el puntero al principio del archivo

        df = pl.read_csv(f, separator=delimiter, infer_schema_length=10000) # Polars lee todo el archivo y devuelve el dataframe, aumentamos el infer_schema_length para que detecte bien los tipos de dato de la columna prob

    #with console.pager(styles=True): # Para que se abra la tabla como un scrolleable (esto lo puse por probar a usarlo porque polars te limita las lineas)
        #console.print(f"[bold] :bar_chart: Dataframe: [/] \n {df}")
    return df

  except FileNotFoundError:
      console.print(f":x: Archivo no encontrado en la ruta: {file_path}")

In [4]:
df_annotations=get_df(r"merge_tables\BeiRNA\BeiRNA_JoinedAnnotations.tsv")
df_counts=get_df(r"merge_tables\BeiRNA\countsfilteredBeiRNA_BagBrown_vs_countsfilteredBeiRNA_SeaWater.txt")

print(df_annotations)
print(df_counts)



## Validar la tabla de annotations 
 En este caso tengo que ver si la tabla tiene 15 o 35 columnas, si tiene solo 15 devuelve un df con estas tres X.ID", "GENENAME", "DESCRIPTION". Si la tabla tiene 35 se quedas solo con estas 12 ("ORF.ID", "Gene.name", "Gene.length", "ORF.length", "ORF.start",
  "ORF.end", "Strand", "Protein.sequence", "Pfam", "InterPro",
  "GENENAME", "DESCRIPTION").

Si tiene cualquier otro número de columnas lanzar un error e indicar que hay que usar una con 15 o 35

In [5]:
def validate_annotation_table(df, df_name = "df_annotations"):
    if df.width == 15:
        header = ["X ID", "GENENAME", "DESCRIPTION"]
        missing_columns = []

        console.print(f"El DataFrame {"df"} contiene [bold green]15[/] columnas.")

        for column in header:
            if column not in df.columns:
                missing_columns.append(column)    

        if len(missing_columns)!= 0:
            console.print(f"[bold red]:x:[/] La(s) columna(s) [italic purple]{missing_columns}[/] no está(n) presente(s) en el DataFrame {"df"}.")

        else:        
            console.print(f"[bold yellow]:warning: Se seleccionarán las siguientes columnas del DataFrame: [italic green]{header}[/]")
            return df.select(["X ID", "GENENAME", "DESCRIPTION"])

    elif df.width == 35:
        header = ["ORF ID", "Gene name", "Gene length", "ORF length", "ORF start", "ORF end", 
                  "Strand", "Protein sequence", "Pfam", "InterPro", "GENENAME", "DESCRIPTION"]
        missing_columns = []

        console.print(f"El DataFrame {df_name}contiene [bold green]35[/] columnas.")

        for column in header:
            if column not in df.columns:
                missing_columns.append(column)    

        if len(missing_columns)!= 0:
            console.print(f"[bold red]:x:[/] La(s) columna(s) [italic purple]{missing_columns}[/] no está(n) presente(s) en el DataFrame {"df"}.")

        else:        
            console.print(f"[bold yellow]:warning: Se seleccionarán las siguientes columnas del DataFrame: [italic green]{header}[/]")
            return df.select(["ORF ID", "Gene name", "Gene length", "ORF length", "ORF start", "ORF end", 
                  "Strand", "Protein sequence", "Pfam", "InterPro", "GENENAME", "DESCRIPTION"])
    
    else: 
        console.print(f"[bold red]:x:[/] DataFrame no válido. Debe contener 15 o 35 columnas.")

In [6]:
validate_annotation_table(df_annotations)

ORF ID,Gene name,Gene length,ORF length,ORF start,ORF end,Strand,Protein sequence,Pfam,InterPro,GENENAME,DESCRIPTION
str,str,i64,i64,i64,i64,str,str,str,str,str,str
"""TRINITY_DN1630_c0_g1_i4.p1""","""TRINITY_DN1630_c0_g1_i4""",5791,653,3,1964,"""+""","""STIDETRSYEGERNLNEERNSCGTKRLTSI…","""-""","""-""",,
"""TRINITY_DN27258_c0_g1_i5.p1""","""TRINITY_DN27258_c0_g1_i5""",1529,474,107,1528,"""+""","""MTEWIIFVQLFSGFLLISGKDACLIKLDPP…","""PF08205;PF07679;PF13927;PF0392…","""IPR003987 Intercellular adhesi…",,
"""TRINITY_DN210276_c0_g1_i1.p1""","""TRINITY_DN210276_c0_g1_i1""",673,129,284,673,"""-""","""VNGVSVCVNVCVCSLVLAGGIWLGWGVVVK…","""-""","""-""",,
"""TRINITY_DN23399_c0_g1_i5.p1""","""TRINITY_DN23399_c0_g1_i5""",868,289,2,868,"""+""","""TSHSQVFFVESICDDPEIIAENIKQVKFGS…","""PF00300;PF01591""","""IPR003094 Fructose-2,6-bisphos…",,"""6-phosphofructo-2-kinase"""
"""TRINITY_DN2814_c0_g1_i12.p1""","""TRINITY_DN2814_c0_g1_i12""",1625,397,1,1194,"""+""","""RKKNLLKDFVSIAGPLGVTHFIIFGKTSDS…","""PF04427""","""IPR007109 Brix domain;IPR04511…",,
…,…,…,…,…,…,…,…,…,…,…,…
"""TRINITY_DN6750_c1_g1_i6.p1""","""TRINITY_DN6750_c1_g1_i6""",2448,320,424,1386,"""+""","""MAEKMLFRFGVILTPESSDVEVLVLGSREE…","""PF00782;PF00686""","""IPR020422 Dual specificity pro…",,
"""TRINITY_DN6750_c1_g1_i5.p1""","""TRINITY_DN6750_c1_g1_i5""",2447,320,424,1386,"""+""","""MAEKMLFRFGVILTPESSDVEVLVLGSREE…","""PF00782;PF00686""","""IPR020422 Dual specificity pro…",,
"""TRINITY_DN875_c0_g1_i13.p2""","""TRINITY_DN875_c0_g1_i13""",2089,136,1029,1439,"""+""","""MKYKGYVNSSSQIKASFSHDYSFIVSGSED…","""-""","""IPR015943 WD40/YVTN repeat-lik…",,
"""TRINITY_DN88626_c0_g1_i1.p1""","""TRINITY_DN88626_c0_g1_i1""",1059,172,3,521,"""+""","""INDERLRKEFSPYGTITSAKVMTDGGQSKG…","""PF00076""","""IPR000504 RNA recognition moti…","""epab""","""Cytoplasmic poly(A)-binding pr…"


In [7]:
def validate_counts_table(df, df_name = "df_counts"):
    if df.width == 6:
        
        console.print(f"[bold yellow]:warning:[/] Se han cambiado los nombres de las columnas a: [italic green] ID, countsfiltered_ControlDMSO_mean, countsfiltered_DEHP_mean, theta, prob y log2FC[/].")
    
        return df.select([
            pl.col(df.columns[0]).alias("ID"),
            pl.col(df.columns[1]).alias("countsfiltered_ControlDMSO_mean"),
            pl.col(df.columns[2]).alias("countsfiltered_DEHP_mean"),
            pl.col(df.columns[3]).alias("theta"),
            pl.col(df.columns[4]).alias("prob"),
            pl.col(df.columns[5]).alias("log2FC"),
            ])
    
    else:
        console.print(f"[bold red]:x:[/] DataFrame no válido. Debe contener 6 columnas.")
    

In [8]:
validate_counts_table(df_counts)

ID,countsfiltered_ControlDMSO_mean,countsfiltered_DEHP_mean,theta,prob,log2FC
str,f64,f64,f64,f64,f64
"""TRINITY_DN10364_c0_g1~~TRINITY…",0.498321,48.792695,-1.972309,1.0,-6.613447
"""TRINITY_DN11499_c0_g1~~TRINITY…",3.606968,909.146477,-2.232298,1.0,-7.977582
"""TRINITY_DN12035_c60_g1~~TRINIT…",492.616367,0.510999,3.823713,1.0,9.912928
"""TRINITY_DN1212_c0_g1~~TRINITY_…",103.222961,51.28329,2.352558,1.0,1.009203
"""TRINITY_DN133801_c5_g1~~TRINIT…",143.21022,0.803017,2.169094,1.0,7.478488
…,…,…,…,…,…
"""TRINITY_DN18600_c0_g1~~TRINITY…",1.363475,2.794916,-0.452508,0.950005,-1.035517
"""TRINITY_DN2460_c0_g1~~TRINITY_…",0.30279,0.645784,-0.452295,0.950003,-1.092732
"""TRINITY_DN1743_c0_g1~~TRINITY_…",0.702961,1.526757,-0.452204,0.950002,-1.118954
"""TRINITY_DN64467_c1_g1~~TRINITY…",126.185108,179.536743,-0.452157,0.950002,-0.508737


In [9]:
def merge_tables (annotation_path, count_path, output_path, output_name):

    df_annotations = get_df(r"annotation_path")
    df_annotations = validate_annotation_table(df_annotations)

    df_counts = get_df(r"count_path")
    df_counts = validate_counts_table(df_counts)

    if df_annotations.width == 3:
        dfjdkajf

    
    elif df_annotations == 12: 

    
    

_IncompleteInputError: incomplete input (325762913.py, line 16)