In [None]:
import os
import re
import csv
import shlex
import pandas as pd

def preprocess_token(token):
    token = token.strip("-:")
    return token

def find_operators_in_script(directory, pattern_list, output_file_name):
    result_list = []

    for pipeline in os.listdir(directory):
        subdirectory_path = os.path.join(directory, pipeline)
        folder_name = os.path.basename(subdirectory_path)
        #print(pipeline)

        for root, dirs, files in os.walk(subdirectory_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                
                if filename.endswith('.nf'):
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                            lines = file.readlines()
                        
                        inside_block = False
                        process_name = None
                        continuation_line = ""

                        process_pattern = r'process\s+(\w+)\s*{'
                        
                        for line_num, line in enumerate(lines):
                            line = continuation_line + line
                            continuation_line = ""

                            stripped_line = line.strip()
                            
                            if line.strip().startswith(('#', '//', '*')):
                                continue

                            if re.search(process_pattern, stripped_line):
                                process_flag = True
                                process_match = re.search(process_pattern, stripped_line)
                                if process_match:
                                    process_name = process_match.group(1)

                            if '"""' in stripped_line:
                                inside_block = not inside_block
                                continue

                            if inside_block:
                                if stripped_line.endswith("\\"):
                                    continuation_line = stripped_line[:-2]
                                    continue

                                lexer = shlex.shlex(stripped_line, posix=True)
                                lexer.whitespace_split = True
                                tokens = list(lexer)
                                #print(tokens)
                                
                                for token in tokens:
                                    processed_token = preprocess_token(token)
                                    if processed_token in pattern_list:
                                        result_list.append([folder_name, process_name, processed_token, file_path, line_num+1, stripped_line])
                    except Exception as e:
                        #print(f"Error processing file '{file_path}' in line {line_num}: {e}")
                        continue 

    write_to_csv(result_list, output_file_name)

def write_to_csv(result_list, output_file_name):
    with open(f'./results/{output_file_name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['pipeline', 'process_name', 'operator', 'file_path', 'line_number', 'line_content'])
        writer.writerows(result_list)
        
directory_path = './git_repos/'

df_operators = pd.read_excel('./data/all_operator_dataset.xlsx')
operator_list = [str(operator) for operator in df_operators['operator']]

output_file_name = 'all_operators_in_scripts_shlex'

find_operators_in_script(directory_path, operator_list, output_file_name)


Change Assigned Group of Bioinformatic Tools

In [None]:
'''import pandas as pd

bio_tools = pd.read_excel("./data/bioinformatics_tools.xlsx")
all_operators = pd.read_csv("./data/operator_dataset_clustered.csv")

operators_set = set(bio_tools['operator'])
all_operators.loc[all_operators['Operator'].isin(operators_set), 'Assigned Group'] = 0


#print(all_operators[all_operators['Assigned Group'] == 0])

all_operators.to_csv("./data/operator_dataset_clustered_edited.csv", index=False)'''

# Analysis of the processes

In [18]:
#operator_script = pd.read_csv('./results/all_operators_in_processes_shlex.csv')

operator_script = pd.read_csv('./results/all_operators_in_scripts_shlex.csv')
df_operator_clustered = pd.read_csv('./data/operator_dataset_clustered_edited.csv')

In [19]:
operator_script

Unnamed: 0,pipeline,process_name,operator,file_path,line_number,line_content
0,airrflow,FASTQC_POSTASSEMBLY,[,./git_repos/airrflow\modules\local\fastqc_post...,23,[ ! -f ${prefix}.fastq ] && ln -s $reads ${pr...
1,airrflow,FASTQC_POSTASSEMBLY,!,./git_repos/airrflow\modules\local\fastqc_post...,23,[ ! -f ${prefix}.fastq ] && ln -s $reads ${pr...
2,airrflow,FASTQC_POSTASSEMBLY,f,./git_repos/airrflow\modules\local\fastqc_post...,23,[ ! -f ${prefix}.fastq ] && ln -s $reads ${pr...
3,airrflow,FASTQC_POSTASSEMBLY,],./git_repos/airrflow\modules\local\fastqc_post...,23,[ ! -f ${prefix}.fastq ] && ln -s $reads ${pr...
4,airrflow,FASTQC_POSTASSEMBLY,&&,./git_repos/airrflow\modules\local\fastqc_post...,23,[ ! -f ${prefix}.fastq ] && ln -s $reads ${pr...
...,...,...,...,...,...,...
26717,viralrecon,UNTAR,touch,./git_repos/viralrecon\modules\nf-core\untar\m...,56,touch ${prefix}/file.txt
26718,viralrecon,UNTAR,cat,./git_repos/viralrecon\modules\nf-core\untar\m...,58,cat <<-END_VERSIONS > versions.yml
26719,viralrecon,UNTAR,sed,./git_repos/viralrecon\modules\nf-core\untar\m...,60,untar: \$(echo \$(tar --version 2>&1) | sed 's...
26720,viralrecon,VCFLIB_VCFUNIQ,bgzip,./git_repos/viralrecon\modules\nf-core\vcflib\...,28,vcfuniq $vcf | bgzip -c $args ...


In [20]:
value_counts = operator_script[operator_script['operator'] == 'if'].value_counts()

len(value_counts)

335

In [21]:
operator_script = operator_script[operator_script['operator'].str.len() >= 2]

In [22]:
all_process = operator_script.groupby(['pipeline', 'process_name']).size().reset_index(name='operator_count')

In [23]:
operators_in_all_process = operator_script.groupby(['pipeline', 'process_name']).size().reset_index(name='operator_count')
operators_in_all_process

Unnamed: 0,pipeline,process_name,operator_count
0,airrflow,ADD_META_TO_TAB,2
1,airrflow,AIRRFLOW_REPORT,4
2,airrflow,CHANGEO_ASSIGNGENES,4
3,airrflow,CHANGEO_CONVERTDB_FASTA,5
4,airrflow,CHANGEO_CREATEGERMLINES,5
...,...,...,...
2579,viralrecon,TABIX_BGZIP,8
2580,viralrecon,TABIX_TABIX,8
2581,viralrecon,UNICYCLER,8
2582,viralrecon,UNTAR,17


In [24]:
df_operator_clustered = df_operator_clustered.rename(columns={'Operator': 'operator'})

merged_df = pd.merge(operator_script, df_operator_clustered, on='operator')

In [25]:
merged_df

Unnamed: 0,pipeline,process_name,operator,file_path,line_number,line_content,Assigned Group
0,airrflow,FASTQC_POSTASSEMBLY,&&,./git_repos/airrflow\modules\local\fastqc_post...,23,[ ! -f ${prefix}.fastq ] && ln -s $reads ${pr...,6
1,airrflow,COLLAPSE_DUPLICATES,&&,./git_repos/airrflow\modules\local\enchantr\co...,34,cp -r enchantr ${meta.id}_collapse_report && r...,6
2,airrflow,DETECT_CONTAMINATION,&&,./git_repos/airrflow\modules\local\enchantr\de...,33,cp -r enchantr all_reps_cont_report && rm -rf ...,6
3,airrflow,DOWSER_LINEAGES,&&,./git_repos/airrflow\modules\local\enchantr\do...,51,cp -r enchantr ${id_name}_dowser_report && rm ...,6
4,airrflow,FIND_THRESHOLD,&&,./git_repos/airrflow\modules\local\enchantr\fi...,59,cp -r enchantr all_reps_dist_report && rm -rf ...,6
...,...,...,...,...,...,...,...
21640,viralrecon,NEXTCLADE_DATASETGET,nextclade,./git_repos/viralrecon\modules\nf-core\nextcla...,39,nextclade: \$(echo \$(nextclade --version 2>&1...,6
21641,viralrecon,NEXTCLADE_RUN,nextclade,./git_repos/viralrecon\modules\nf-core\nextcla...,40,nextclade run $args --...,6
21642,viralrecon,NEXTCLADE_RUN,nextclade,./git_repos/viralrecon\modules\nf-core\nextcla...,44,nextclade: \$(echo \$(nextclade --version 2>&1...,6
21643,viralrecon,PANGOLIN,pangolin,./git_repos/viralrecon\modules\nf-core\pangoli...,28,pangolin $fasta --outfile ${pre...,6


In [26]:
count_operators = merged_df[merged_df['Assigned Group'] == 0].groupby('operator')['operator'].count()
nunique_pipeline = merged_df[merged_df['Assigned Group'] == 0].groupby('operator')['pipeline'].nunique()
nunique_file_path = merged_df[merged_df['Assigned Group'] == 0].groupby('operator')['file_path'].nunique()

bio_tools = pd.DataFrame({
    'pipeline': nunique_pipeline,
    'file_path': nunique_file_path,
    'sum_operators': count_operators
})

bio_tools = bio_tools.sort_values(by='pipeline', ascending=False)

In [27]:
bio_tools

Unnamed: 0_level_0,pipeline,file_path,sum_operators
operator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
multiqc,75,84,354
fastqc,60,61,304
samtools,53,342,988
fasta,23,31,42
bwa,22,48,176
fastp,20,20,118
cutadapt,20,20,43
bedtools,19,56,149
bowtie2-build,18,19,19
bowtie2,17,34,94


In [28]:
merged_df['pipeline_process_name'] = merged_df['pipeline'] + "." + merged_df['process_name']

In [29]:
grouped_df = merged_df.groupby('Assigned Group').agg({'pipeline': 'nunique', 'pipeline_process_name': 'nunique', 'operator': 'count'}).reset_index()

In [30]:
sum_processes = 3081

In [31]:
grouped_df['Ratio to total processes'] = grouped_df['pipeline_process_name']/sum_processes

# insert to position 2
position_to_insert = 1
grouped_df.insert(position_to_insert, 'Ratio to total processes', grouped_df.pop('Ratio to total processes'))

In [32]:
grouped_df = grouped_df.rename(columns={'file_path': 'Files', 'operator' : 'Total operator occurrences'})

In [33]:
grouped_df

Unnamed: 0,Assigned Group,Ratio to total processes,pipeline,pipeline_process_name,Total operator occurrences
0,0,0.35508,85,1094,3217
1,1,0.101266,84,312,1288
2,2,0.283999,84,875,2001
3,3,0.741318,81,2284,4052
4,4,0.31678,91,976,1897
5,5,0.07887,64,243,469
6,6,0.714054,94,2200,8721


In [None]:
count_operators = merged_df[merged_df['Assigned Group'] == 0].groupby('operator')['operator'].count()
nunique_pipeline = merged_df[merged_df['Assigned Group'] == 0].groupby('operator')['pipeline'].nunique()
nunique_file_path = merged_df[merged_df['Assigned Group'] == 0].groupby('operator')['file_path'].nunique()

bio_tools = pd.DataFrame({
    'pipeline': nunique_pipeline,
    'file_path': nunique_file_path,
    'sum_operators': count_operators
})

bio_tools = bio_tools.sort_values(by='pipeline', ascending=False)

In [None]:
bio_tools

In [None]:
import os
import csv
import re

def find_operators_in_config(directory, operator_list, output_file_name):
    result_list = []

    for pipeline in os.listdir(directory):
        subdirectory_path = os.path.join(directory, pipeline)
        folder_name = os.path.basename(subdirectory_path)
        print(pipeline)

        for root, _, files in os.walk(subdirectory_path):
            for file in files:
                file_path = os.path.join(root, file)
                if file.endswith('.config'):
                    with open(file_path, 'r') as config_file:
                        for line_number, line in enumerate(config_file, start=1):
                            if line.strip().startswith(('#', '//', '*')):
                                continue
                            stripped_line = line.strip()
                            for pattern in operator_list:
                                pattern = r'\s' + re.escape(pattern) + r'\s'
                                
                                pattern = re.escape(pattern)
                                if re.search(pattern, stripped_line):
                                    result_list.append([folder_name, pattern, file_path, line_number, stripped_line])
    
    write_to_csv(result_list, output_file_name)

def write_to_csv(result_list, output_file_name):
    with open(f'./results/{output_file_name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['pipeline', 'operator', 'file_path', 'line_number', 'line_content'])
        writer.writerows(result_list)


In [None]:
directory_path = './git_repos/'

df_operators = pd.read_csv('./data/operator_dataset_clustered_edited.csv')
operator_list = [str(operator) for operator in df_operators[df_operators['Assigned Group'] == 0]['Operator']]

output_file_name = 'all_operators_in_configs'

find_operators_in_config(directory_path, operator_list, output_file_name)

In [None]:
import pandas as pd

bio_in_config = pd.read_csv('./results/all_operators_in_configs.csv')

bio_in_config

In [None]:
exact_entry = 'BATCH_PROC'
column_name = 'process_name'

if df[df['process_name'] == exact_entry].empty:
    print(f"No exact entry '{exact_entry}' found in column '{column_name}'.")
else:
    print(f"Exact entry '{exact_entry}' found in column '{column_name}'.")

In [None]:
import os
import re
import csv
import shlex
import pandas as pd

def preprocess_token(token):
    token = token.strip("-:")
    return token

def find_operators_in_script(directory, pattern_list, output_file_name):
    result_list = []

    for pipeline in os.listdir(directory):
        subdirectory_path = os.path.join(directory, pipeline)
        folder_name = os.path.basename(subdirectory_path)
        #print(pipeline)

        for root, dirs, files in os.walk(subdirectory_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                
                if filename.endswith('.nf'):
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                            lines = file.readlines()
                        
                        inside_block = False
                        process_name = None
                        continuation_line = ""

                        process_pattern = r'process\s+(\w+)\s*{'
                        
                        for line_num, line in enumerate(lines):
                            line = continuation_line + line
                            continuation_line = ""

                            stripped_line = line.strip()
                            
                            if line.strip().startswith(('#', '//', '*')):
                                continue

                            if re.search(process_pattern, stripped_line):
                                process_flag = True
                                process_match = re.search(process_pattern, stripped_line)
                                if process_match:
                                    process_name = process_match.group(1)

                            if '"""' in stripped_line:
                                inside_block = not inside_block
                                continue

                            if inside_block:
                                if stripped_line.endswith("\\"):
                                    continuation_line = stripped_line[:-2]
                                    continue

                                lexer = shlex.shlex(stripped_line, posix=True)
                                lexer.whitespace_split = True
                                tokens = list(lexer)
                                #print(tokens)
                                
                                for token in tokens:
                                    processed_token = preprocess_token(token)
                                    if processed_token in pattern_list:
                                        result_list.append([folder_name, process_name, processed_token, file_path, line_num+1, stripped_line])
                    except Exception as e:
                        #print(f"Error processing file '{file_path}' in line {line_num}: {e}")
                        continue 

    write_to_csv(result_list, output_file_name)

def write_to_csv(result_list, output_file_name):
    with open(f'./results/{output_file_name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['pipeline', 'process_name', 'operator', 'file_path', 'line_number', 'line_content'])
        writer.writerows(result_list)
        
directory_path = './git_repos/'

df_operators = pd.read_excel('./data/all_operator_dataset.xlsx')
operator_list = [str(operator) for operator in df_operators['operator']]

output_file_name = 'all_operators_in_scripts_shlex'

find_operators_in_script(directory_path, operator_list, output_file_name)


In [None]:
def find_operators_in_script(directory, operator_list, result_dir):
    result_list = []

    for pipeline in os.listdir(directory):
        subdirectory_path = os.path.join(directory, pipeline)
        folder_name = os.path.basename(subdirectory_path)
        print(pipeline)

        for root, dirs, files in os.walk(subdirectory_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                
                if filename.endswith('.nf'):
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                        content = file.readlines()

                        brace_stack = []
                        process_flag = False  
                        process_name = None

                        for line_num, line in enumerate(content):  # Enumerate lines to get line number
                            stripped_line = line.strip()
                            
                            if line.strip().startswith(('#', '//', '*')):
                                continue
                                
                            process_pattern = r'process\s+(\w+)\s*{'

                            if re.search(process_pattern, stripped_line):
                                process_flag = True
                                process_match = re.search(process_pattern, stripped_line)
                                if process_match:
                                    process_name = process_match.group(1)
                                    print(f'Process {process_name} in filepath {file_path}')
                                    brace_stack.clear()

                            if '{' in stripped_line:
                                brace_stack.append('{')

                            if '}' in stripped_line:
                                if brace_stack:
                                    brace_stack.pop()
                                    if not brace_stack and process_flag:
                                        process_flag = False
                                        

                            if process_flag and len(brace_stack) > 0:
                                if stripped_line.endswith("\\"):
                                    continuation_line = stripped_line[:-2]
                                    continue

                                try:
                                    lexer = shlex.shlex(stripped_line, posix=True)
                                    lexer.whitespace_split = True
                                    tokens = list(lexer)
                                except ValueError:
                                    continue
                                
                                for token in tokens:
                                    processed_token = preprocess_token(token)
                                    if processed_token in operator_list:
                                        result_list.append([folder_name, process_name, processed_token, file_path, line_num+1, stripped_line])
                                
    write_to_csv(result_list, result_dir)


def write_to_csv(result_list, output_file_name):
    with open(output_file_name, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['pipeline', 'process_name', 'operator', 'file_path', 'line_number', 'line_content'])
        writer.writerows(result_list)
        
directory_path = './git_repos/'
output_file_name = './results/all_operators_in_scripts_shlex.csv'

# Read the Excel file to get the list of operators
df_operators = pd.read_excel('./data/all_operator_dataset.xlsx')
operator_list = [str(operator) for operator in df_operators['operator']]

find_operators_in_script(directory_path, operator_list, output_file_name)

In [None]:
import os
import re
import csv
import shlex
import pandas as pd

def preprocess_token(token):
    token = token.strip("-:")
    return token

def find_operators_in_script(directory, pattern_list, output_file_name):
    result_list = []

    for pipeline in os.listdir(directory):
        subdirectory_path = os.path.join(directory, pipeline)
        folder_name = os.path.basename(subdirectory_path)
        #print(pipeline)

        for root, dirs, files in os.walk(subdirectory_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                
                if filename.endswith('.nf'):
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                            lines = file.readlines()
                        
                        process_name = None
                        process_flag = False
                        brace_stack = [] 

                        process_pattern = r'process\s+(\w+)\s*{'
                        
                        try:
                            for line_num, line in enumerate(lines):

                                # so there wont be issues with double quotation
                                line = line.replace('"', '')

                                stripped_line = line.strip()
                                if line.strip().startswith(('#', '//', '*', "'")):
                                    continue

                                if re.search(process_pattern, stripped_line):
                                    process_flag = True
                                    process_match = re.search(process_pattern, stripped_line)
                                    if process_match:
                                        process_name = process_match.group(1)
                                    brace_stack.clear()

                                if '{' in stripped_line:
                                    brace_stack.append('{')

                                if '}' in stripped_line:
                                    if brace_stack:
                                        brace_stack.pop()
                                        if not brace_stack and process_flag:
                                            process_flag = False

                                if process_flag and len(brace_stack) > 0:
                                    if stripped_line.endswith("\\"):
                                        continuation_line = stripped_line[:-2]
                                        continue

                                    lexer = shlex.shlex(stripped_line, posix=True)
                                    lexer.whitespace_split = True
                                    tokens = list(lexer)

                                    for token in tokens:
                                        processed_token = preprocess_token(token)
                                        if processed_token in pattern_list:
                                            result_list.append([folder_name, process_name, processed_token, file_path, line_num+1, stripped_line])
                        except Exception as e:
                            print(f"Error, processing file '{file_path}' in line {line_num}: {e}")
                            continue 
                        
                    except Exception as e:
                        print(f"File not readable")
                        continue 

    write_to_csv(result_list, output_file_name)

def write_to_csv(result_list, output_file_name):
    with open(f'./results/{output_file_name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['pipeline', 'process_name', 'operator', 'file_path', 'line_number', 'line_content'])
        writer.writerows(result_list)
        
directory_path = './git_repos/'

df_operators = pd.read_excel('./data/all_operator_dataset.xlsx')
operator_list = [str(operator) for operator in df_operators['operator']]

output_file_name = 'all_operators_in_processes_shlex'

find_operators_in_script(directory_path, operator_list, output_file_name)