In [1]:
import re 
import sys
import yaml 
import shutil
import pandas as pd
from collections import defaultdict
import logging
import os



In [2]:
import datetime 
now = datetime.datetime.now()
start_time = now.strftime("%Y-%m-%d %H:%M:%S")
import os
os.environ['start_time'] = start_time

In [3]:
from datetime import datetime

class RunTime:
    def __init__(self, start_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S")):
        self.start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')

    def elapsed(self):
        self.end_time = datetime.now()
        return f'Duration: {self.end_time - self.start_time}'

In [4]:
# cols
RED = "\033[;31;1m"
RED_ = "\033[;31;4m"
GRE = "\033[;32;1m"
YEL = "\033[;33;1m"
BLU = "\033[;34;1m"
PRP = "\033[;35;1m"
CYN = "\033[;36;1m"
BLD = "\033[;37;1m"
GRY = "\033[;30;1m"
NC = "\033[;39;0m"
NC_ = "\033[;39;4m"

start_time = os.environ['start_time']
runtime = RunTime(start_time)



# other vars
global_vars = {}




In [5]:
class CustomFormatter(logging.Formatter):
    ### code from:
    ## https://stackoverflow.com/questions/384076/how-can-i-color-python-logging-output

    format = "[%(asctime)s] %(levelname)s: %(message)s"
    datefmtstr = "%Y-%m-%d %H:%M:%S"

    FORMATS = {
        logging.DEBUG: format.replace('%(levelname)s', GRY + '%(levelname)s' + NC),
        logging.INFO: format.replace('%(levelname)s', GRY + '%(levelname)s' + NC),
        logging.WARNING: format.replace('%(levelname)s', YEL + '%(levelname)s' + NC),
        logging.ERROR: format.replace('%(levelname)s', RED + '%(levelname)s' + NC),
        logging.CRITICAL: format.replace('%(levelname)s', RED_ + '%(levelname)s' + NC),
    }

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno)
        formatter = logging.Formatter(log_fmt, datefmt=self.datefmtstr)
        return formatter.format(record)
    

class GLogger:
    def __init__(self):
        # create logger attr and set level
        self.logger = logging.getLogger('GUAP_logger')
        self.formatter = logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

    def create_console_handler(self, verbose=False):
        # create handler
        self.ch = logging.StreamHandler()

        # set level according to user verbose option
        if verbose:
            self.ch.setLevel(logging.DEBUG)
        else:
            self.ch.setLevel(logging.WARNING)

        self.ch.setFormatter(CustomFormatter())

        # add handler
        self.logger.addHandler(self.ch)

    def create_file_handler(self, file):
        # create handler
        self.fh = logging.FileHandler(file)
        self.fh.setLevel(logging.DEBUG)
        self.fh.setFormatter(self.formatter)

        self.logger.addHandler(self.fh)


    def prnt_info(self, str):
        self.logger.info(f"{str}")


    def prnt_warning(self, str):
        self.logger.warning(f"{str}")


    def prnt_error(self, str):
        self.logger.error(f"{str}")


    def prnt_fatel(self, str):
        self.logger.fatal(f"{str}")
        print(f"{PRP}{runtime.elapsed()}{NC}")
        exit(1)



In [6]:
glogger = GLogger()

In [7]:

def check_extension(df): # takes pandas df and returns string
    # checks all files have same extension from pandas df, to use in generete sample table function
    uniques = df['ext'].unique()
    if len(uniques) > 1:
        glogger.prnt_fatel(f"Your input directory has multible fastq file extensions, please check directory.")

    else:
        return uniques[0]


def check_PE(df): # takes pandas df and returns string
    """checks all files either single or paried ended from pandas df, to use in generete sample table function"""
    uniques = df['PE'].unique()
    if len(uniques) > 1:
        glogger.prnt_fatel(f"Your input directory has both Paired and single files, please check directory.")
    else:
        return uniques[0]


def check_R(df): # takes pandas df and returns string
    """checks all files have same naming patterns from pandas df, to use in generete sample table function"""
    uniques = df['read_num'].unique()
    if len(uniques) > 1:
        glogger.prnt_fatel(f"Your input directory has multible fastq file naming patterns, please check directory.")
    else:
        return uniques[0].replace("1","")


def check_pattern(df): # takes pandas df and returns a string 
    """checks all files have same naming patterns from pandas df, to use in generete sample table function"""
    uniques = df['matched_pattern'].unique()
    if len(uniques) > 1:
        glogger.prnt_fatel(f"Your input directory has multible fastq file naming patterns, please check directory.")
    else:
        return uniques[0]
    

In [8]:
def recogize_pattern(file_name): # takes string of fastq file name and returns dict with read info and id
    """ using re to recognize the naming pattern of samples (illumina, srr and general naming patten)"""
    # naming pattern for re 
    patterns = { # ! fix (_|\.) group for R pattern in dict config !
        "Novagen1": "(((.+)_(.+)-(.+))_(L\d+)((_)([1|2]))\.(fastq\.gz|fastq|fq\.gz|fq))",
        "Novagen2": "(((.+)_(.+)-(.+))_(L\d+)((-)(r[1|2]))\.(fastq\.gz|fastq|fq\.gz|fq))",
        "illumina": "(((.+)_(S\d+)_(L00\d))_(R1|R2|r1|r2|read1|read2)_(00\d)\.(fastq\.gz|fastq|fq\.gz|fq))",
        "SRR": "(((SRR)(\d+))(_|\.)(1|2|R1|R2|r1|r2|read1|read2)\.(fastq\.gz|fastq|fq\.gz|fq))",
        "general": "(((.+))(_|\.)(1|2|R1|R2|r1|r2|read1|read2)\.(fastq\.gz|fastq|fq\.gz|fq))"
    }

    matched_pattern = None
    ## loop on pattern to and checks whichs one matches 
    ## starting with illumina because general would match any ways
    ## breaks once successful 
    for ptrn_name, pattern in patterns.items():
        try:
            matched = re.match(pattern, file_name) 
        except:
            continue
        
        if bool(matched) :
            matched_pattern = ptrn_name
            break
            
        else:
            continue

    if matched_pattern == "Novagen1":
        file_name, sample_name, sample_id, acc1, acc2, lane, R_pattern, R_sep, read_num, ext, tail, sample_number =  matched.groups()[0], matched.groups()[1], matched.groups()[2], matched.groups()[3], matched.groups()[4], matched.groups()[5], matched.groups()[6], matched.groups()[7] , matched.groups()[8] , matched.groups()[9], "", ""

    elif matched_pattern == "Novagen2":
        file_name, sample_name, sample_id, acc1, acc2, lane, R_pattern, R_sep, read_num, ext, tail, sample_number =  matched.groups()[0], matched.groups()[1], matched.groups()[2], matched.groups()[3], matched.groups()[4], matched.groups()[5], matched.groups()[6], matched.groups()[7] , matched.groups()[8] , matched.groups()[9], "", ""

    elif matched_pattern == "illumina":
        file_name, sample_name, sample_id, sample_number, read_num, lane, tail, ext = matched.groups()[0], matched.groups()[1], matched.groups()[2], matched.groups()[3], matched.groups()[5], matched.groups()[4], matched.groups()[6], matched.groups()[7]

    elif matched_pattern == "SRR":
        glogger.prnt_fatel(f"{RED}{matched_pattern}{NC} is currntly not supported sample naming pattern\nonly {GRE}'Illumina'{NC} naming pattern is supported at the moment")
        file_name, sample_name, sample_id, sample_number, read_num, lane, tail, ext = matched.groups()[0], matched.groups()[1], matched.groups()[3], "", matched.groups()[5], "", "", matched.groups()[6]

    elif matched_pattern == "general":
        glogger.prnt_fatel(f"{RED}{matched_pattern}{NC} is currntly not supported sample naming pattern\nonly {GRE}'Illumina'{NC} naming pattern is supported at the moment")
        file_name, sample_name, sample_id, sample_number, read_num, lane, tail, ext = matched.groups()[0], matched.groups()[1], matched.groups()[1], "", matched.groups()[4], "", "", matched.groups()[5]

    else:
        glogger.prnt_fatel(f"{RED}Your Samples Pattern is an unfamiler pattern.{NC}\nPlease contact my Developpers and they will look into it :D")
        file_name = sample_name = sample_id = sample_number = read_num = lane = tail = ext = None


    if matched_pattern == "Novagen1" or matched_pattern == "Novagen2":
        return {
            "file_name": file_name,
            "sample_name": sample_name,
            "sample_id": sample_id,
            "acc1": acc1,
            "acc2": acc2,
            "lane": lane,
            "R_pattern" : R_pattern, 
            "R_sep" : R_sep, 
            "read_num" : read_num, 
            "ext" : ext,
            "tail": tail,
            "sample_number": sample_number,
            "matched_pattern": ptrn_name
        }
    
    else:
        # Returns a dictionary of sample information
        return {
            "file_name": file_name,
            "sample_name": sample_name,
            "sample_id": sample_id,
            "sample_number": sample_number,
            "read_num": read_num,
            "lane": lane,
            "tail": tail,
            "ext": ext,
            "matched_pattern": ptrn_name
        }


In [9]:
def parse_samples(inpath): # takes path return contains fastq files, returns df contains sample information
    ## takes input path
    ## gets the file names containg fastq and fq
    ## performs the recogize_pattern function to 
    ## capture sample information and stores it in 
    ## pandas df
    # input path to absolute path
    path = os.path.abspath(inpath)
    # list all files
    all_files = os.listdir(path)
    samples = defaultdict(dict)
    # takes fastq files only
    for file_name in all_files:
        if os.path.isfile(path + "/" + file_name) and ("fastq" in file_name or "fq" in file_name):
            # Captures the file path and name
            filename, file_extension = os.path.splitext(file_name)
            if "fastq" in filename or "fq" in filename:
                filename, new_ext = os.path.splitext(filename)
                file_extension = new_ext + file_extension
            # recogize_pattern function returns a dictitionary with sample names, id, and read information
            sample_info = recogize_pattern(file_name)

            # get only forward reads and replace the read number to get R2
            # appends sample information to a dict of dicts
            
            if "1" in sample_info["read_num"]:
                read_2 = sample_info["read_num"].replace("1","2")
                if sample_info["matched_pattern"] == "illumina":
                    read_1 = f"{sample_info['read_num']}_{sample_info['tail']}.f"
                    read_2 = f"{read_2}_{sample_info['tail']}.f"
                else:
                    read_1 = f"{sample_info['read_num']}.f"
                    read_2 = f"{read_2}.f"

                f2 = file_name.replace(read_1, read_2)
                if f2 in all_files:
                    sample_info["file2"] = f2
                    sample_info["PE"] = True
                    samples[sample_info["sample_name"]] = sample_info

                else:
                    sample_info["file2"] = ""
                    sample_info["PE"] = False

    # converts the dict to pandas df and returns the df
    m_samples = samples
    samples= pd.DataFrame(samples).T
    samples = samples.sort_values(by=['sample_id'])

    return samples

In [10]:
samples = parse_samples("empty_sample_names")

In [30]:
sample_table_file=('samples.tsv')
SampleTable = pd.read_table(sample_table_file,index_col=0)
samples = list(SampleTable.index) # sample full name
files_R1s = list(SampleTable.iloc[:, 0])
files_R2s = list(SampleTable.iloc[:, 13])

SampleTable['library_index'] = SampleTable['acc1'].str.cat(SampleTable['acc2'], sep='-')


library_index = list(SampleTable.iloc[:, 3])
samples_IDs = list(SampleTable.iloc[:, 2])
samples_names = list(SampleTable.iloc[:, 1])

In [40]:
SampleTable.to_csv("samples.tsv",sep='\t',index=False)

In [41]:

units = (
    pd.read_csv('samples.tsv', sep="\t", dtype={"sample_id": str, "library_index": str, "lane": str})
    .set_index(["sample_id", "library_index", "lane"], drop=False)
    .sort_index()
)


In [43]:
units.to_csv("units.tsv",sep='\t',index=False)

In [46]:
unit = units.loc["RZ100"]

In [47]:
unit

Unnamed: 0_level_0,Unnamed: 1_level_0,file_name,sample_name,sample_id,acc1,acc2,lane,R_pattern,R_sep,read_num,ext,tail,sample_number,matched_pattern,file2,PE,library_index
library_index,lane,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
CKDN220028927-1A_H3FYWDSX5,L1,RZ100_CKDN220028927-1A_H3FYWDSX5_L1_1.fq.gz,RZ100_CKDN220028927-1A_H3FYWDSX5,RZ100,CKDN220028927,1A_H3FYWDSX5,L1,_1,_,1,fq.gz,,,Novagen1,RZ100_CKDN220028927-1A_H3FYWDSX5_L1_2.fq.gz,True,CKDN220028927-1A_H3FYWDSX5
CKDN220028927-1A_H3G37DSX5,L3,RZ100_CKDN220028927-1A_H3G37DSX5_L3_1.fq.gz,RZ100_CKDN220028927-1A_H3G37DSX5,RZ100,CKDN220028927,1A_H3G37DSX5,L3,_1,_,1,fq.gz,,,Novagen1,RZ100_CKDN220028927-1A_H3G37DSX5_L3_2.fq.gz,True,CKDN220028927-1A_H3G37DSX5


In [31]:
SampleTable['library_index']

RZ100_CKDN220028927-1A_H3FYWDSX5            CKDN220028927-1A_H3FYWDSX5
RZ100_CKDN220028927-1A_H3G37DSX5            CKDN220028927-1A_H3G37DSX5
RZ100M_CKDN22H000023-1A_HT3H7DSX5           CKDN22H000023-1A_HT3H7DSX5
RZ100M_CKDN22H000023-1A_HT3F3DSX5           CKDN22H000023-1A_HT3F3DSX5
RZ101_CKDN220028929-1A_H3FYWDSX5            CKDN220028929-1A_H3FYWDSX5
                                                       ...            
S_ZIKA_23B_CKDN210020708-1A_HFW5VDSX3       CKDN210020708-1A_HFW5VDSX3
S_ZIKA_23Mom_CKDN210020709-1A_HFW5VDSX3     CKDN210020709-1A_HFW5VDSX3
S_ZIKA_24Mom_CKDN210020710-1A_HFW5VDSX3     CKDN210020710-1A_HFW5VDSX3
S_ZIKA_25Baby_CKDN210020712-1A_HFW5VDSX3    CKDN210020712-1A_HFW5VDSX3
S_ZIKA_25Mom_CKDN210020711-1A_HFW5VDSX3     CKDN210020711-1A_HFW5VDSX3
Name: library_index, Length: 1193, dtype: object

In [26]:
library_index

['CKDN220028927',
 'CKDN220028927',
 'CKDN22H000023',
 'CKDN22H000023',
 'CKDN220028929',
 'CKDN220028929',
 'CKDN220028930',
 'CKDN220028930',
 'CKDN220028931',
 'CKDN220028931',
 'CKDN220028932',
 'CKDN220028932',
 'CKDN220028933',
 'CKDN220028933',
 'CKDN22H000024',
 'CKDN22H000024',
 'CKDN220028935',
 'CKDN220028936',
 'CKDN220028936',
 'CKDN220028937',
 'CKDN220028937',
 'CKDN220028938',
 'CKDN220028939',
 'CKDN220028939',
 'CKDN220028940',
 'CKDN220028940',
 'CKDN220028941',
 'CKDN220028942',
 'CKDN220028942',
 'CKDN220028943',
 'CKDN220028944',
 'CKDN220028944',
 'CKDN220028945',
 'CKDN220028945',
 'CKDN22H000025',
 'CKDN22H000025',
 'CKDN220028947',
 'CKDN220028947',
 'CKDN220028948',
 'CKDN220028948',
 'CKDN220028949',
 'CKDN220028949',
 'CKDN22H000026',
 'CKDN22H000026',
 'CKDN220028951',
 'CKDN220028952',
 'CKDN22H000027',
 'CKDN22H000027',
 'CKDN220028954',
 'CKDN220028954',
 'CKDN22H000028',
 'CKDN22H000028',
 'CKDN220028956',
 'CKDN220028956',
 'CKDN220028957',
 'CKDN2200

In [11]:
samples.to_csv("samples.tsv",sep='\t')