# Program type analysis in Git repos

In [3]:
import os

def get_file_endings(directory):
    """Return the set of distinct file extensions found below ``directory``.

    The tree is walked recursively. Extensions are taken from
    ``os.path.splitext`` and therefore include the leading dot (e.g. ``'.nf'``);
    files for which ``splitext`` reports no extension contribute nothing.
    """
    extensions = {
        os.path.splitext(name)[1]
        for _root, _dirs, names in os.walk(directory)
        for name in names
    }
    # Names without an extension yield '' above; those are not reported.
    extensions.discard('')
    return extensions


# Root folder that is scanned; the path is relative to the notebook's working directory.
repository_path = './git_repos'

# Collect every distinct file extension below the root and show the resulting set.
file_endings = get_file_endings(repository_path)
print('File endings in the repository:', file_endings)

File endings in the repository: {'.yml', '.psd', '.odp', '.qmd', '.rst', '.ttf', '.mustache', '.db', '.cpp', '.tex', '.awk', '.jar', '.sample', '.pack', '.png', '.ico', '.nf', '.jpg', '.embl', '.diff', '.ipynb', '.gff3', '.svg', '.bib', '.ai', '.json', '.ini', '.fna', '.gtf', '.Rmd', '.fai', '.R', '.config', '.RData', '.woff', '.pyc', '.csv', '.file', '.html', '.idXML', '.fas', '.snap', '.toml', '.bed', '.fasta', '.md', '.pl', '.drawio', '.xml', '.js', '.sqlite', '.tsv', '.iml', '.scss', '.cfg', '.groovy', '.pptx', '.eot', '.pdf', '.test', '.dia', '.conf', '.h5', '.r', '.eps', '.yaml', '.gz', '.fa', '.css', '.theme', '.sh', '.treefile', '.log', '.pm', '.idx', '.interval_list', '.rev', '.xlsx', '.cff', '.txt', '.py', '.bz2'}


In [35]:
import os
import re
import csv

def find_pattern_types(directory, pattern_list, output_file_name, ignore_case=False):
    """Search the Nextflow (``.nf``) files of every pipeline for literal keywords.

    Each entry directly inside ``directory`` is treated as one pipeline and is
    walked recursively. Every line of every ``.nf`` file is stripped and tested
    against each pattern; one CSV row is recorded per (line, pattern) match, so
    a line that contains several patterns produces several rows.

    Lines whose stripped text starts with ``#``, ``//`` or ``*`` are skipped as
    comments. Lines that open a block comment with ``/*`` and comments that
    trail code on the same line are not filtered out.

    Args:
        directory: Folder whose immediate entries are the pipelines to scan.
        pattern_list: Iterable of strings. Each one is matched as a literal
            substring (it is passed through ``re.escape``), not as a regex.
        output_file_name: Base name (without ``.csv``) of the result file that
            is written to ``./results``.
        ignore_case: When true, matching is case-insensitive, so that e.g.
            ``'rscript'`` also matches ``Rscript``. Defaults to ``False``,
            which is case-sensitive matching.

    Returns:
        None. The matches are written to ``./results/<output_file_name>.csv``
        with the columns ``pipeline``, ``pattern``, ``file_path``,
        ``line_number`` (1-based) and ``line_content`` (the stripped line).
    """
    flags = re.IGNORECASE if ignore_case else 0
    # Compile each pattern once up front instead of once per scanned line.
    compiled_patterns = [
        (pattern, re.compile(re.escape(pattern), flags)) for pattern in pattern_list
    ]
    result_list = []

    # Sorted traversal keeps the CSV row order reproducible across file systems.
    for pipeline in sorted(os.listdir(directory)):
        subdirectory_path = os.path.join(directory, pipeline)

        for root, dirs, files in os.walk(subdirectory_path):
            dirs.sort()  # in-place sort steers the order in which os.walk descends
            for filename in sorted(files):
                if not filename.endswith('.nf'):
                    continue
                file_path = os.path.join(root, filename)

                # Best effort: an unreadable file is reported and skipped.
                # Undecodable bytes are dropped by errors='ignore'.
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                        lines = file.readlines()
                except OSError as e:
                    print(f"Error processing {file_path}: {e}")
                    continue

                for line_num, line in enumerate(lines, start=1):
                    stripped_line = line.strip()

                    if stripped_line.startswith(('#', '//', '*')):
                        continue

                    for pattern, regex_pattern in compiled_patterns:
                        if regex_pattern.search(stripped_line):
                            result_list.append(
                                [pipeline, pattern, file_path, line_num, stripped_line]
                            )

    # exist_ok avoids the separate existence check and the race it leaves open.
    os.makedirs('./results', exist_ok=True)

    with open(f'./results/{output_file_name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['pipeline', 'pattern', 'file_path', 'line_number', 'line_content'])
        writer.writerows(result_list)

In [36]:
# Disabled cell: the code is wrapped in a triple-quoted string, so nothing runs and the
# cell only evaluates to (and echoes) that string.
# NOTE(review): the disabled code assigns `directory_path` but passes `repository_path`
# to get_file_endings — reconcile the two names before re-enabling.
'''directory_path = './git_repos'

file_endings = get_file_endings(repository_path)
output_file_name = 'file_ending_occurences'

find_pattern_types(directory_path, file_endings, output_file_name)'''

"directory_path = './git_repos'\n\nfile_endings = get_file_endings(repository_path)\noutput_file_name = 'file_ending_occurences'\n\nfind_pattern_types(directory_path, file_endings, output_file_name)"

In [37]:
# Disabled cell: the code is kept as a string literal and is not executed.
# NOTE(review): it relies on `pd`, which is only imported in a later cell.
'''df = pd.read_csv('./results/file_ending_occurences.csv')
df'''

"df = pd.read_csv('./results/file_ending_occurences.csv')\ndf"

In [38]:
# Disabled cell: the code is kept as a string literal and is not executed.
# It would count the rows per pattern and sort the counts in descending order.
'''grouped_df = df.groupby('pattern').size().reset_index(name='total_rows').sort_values(by='total_rows', ascending=False)
grouped_df'''

"grouped_df = df.groupby('pattern').size().reset_index(name='total_rows').sort_values(by='total_rows', ascending=False)\ngrouped_df"

In [55]:
# Scan the .nf files of every pipeline below ./git_repos for keywords that hint at the
# kind of program being invoked.
directory_path = './git_repos'
# Keywords are handed to find_pattern_types, which matches them as literal substrings.
keywords = ['python', 'run', 'rscript', 'perl', 'script', 'groovy', 'shell', 'exec']
output_file_name = 'program_types_occurrences'

# Writes ./results/program_types_occurrences.csv.
find_pattern_types(directory_path, keywords, output_file_name)

In [58]:
import pandas as pd
# Load the keyword matches from the CSV that the program-type search writes.
types_df = pd.read_csv('./results/program_types_occurrences.csv')
types_df

Unnamed: 0,pipeline,pattern,file_path,line_number,line_content
0,airrflow,run,./git_repos\airrflow\main.nf,26,"def String command = ""nextflow run ${workflow...."
1,airrflow,script,./git_repos\airrflow\modules\local\fastqc_post...,19,script:
2,airrflow,script,./git_repos\airrflow\modules\local\fetch_datab...,22,script:
3,airrflow,script,./git_repos\airrflow\modules\local\gunzip.nf,17,script:
4,airrflow,python,./git_repos\airrflow\modules\local\merge_UMI.nf,6,"conda ""conda-forge::python=3.8.0 conda-forge::..."
...,...,...,...,...,...
6604,viralrecon,script,./git_repos\viralrecon\modules\nf-core\vcflib\...,21,script:
6605,viralrecon,script,./git_repos\viralrecon\subworkflows\local\asse...,105,transcripts = SPADES.out.transcripts ...
6606,viralrecon,run,./git_repos\viralrecon\subworkflows\local\cons...,7,include { NEXTCLADE_RUN } from '../../modu...
6607,viralrecon,groovy,./git_repos\viralrecon\subworkflows\local\fast...,12,import groovy.json.JsonSlurper


In [69]:
# Per keyword: number of distinct pipelines and distinct files with at least one match,
# ordered by the number of files (largest first).
grouped_types_df = (
    types_df
    .groupby('pattern')[['pipeline', 'file_path']]
    .nunique()
    .reset_index()
    .sort_values(by='file_path', ascending=False)
)

In [70]:
# Display the per-keyword summary (distinct pipelines and distinct files per pattern).
grouped_types_df

Unnamed: 0,pattern,pipeline,file_path
6,script,94,2676
3,python,70,292
5,run,82,274
0,exec,38,60
2,perl,17,38
1,groovy,26,27
4,rscript,3,13
7,shell,6,7


# Analysis of modularization concepts in Nextflow files

In [42]:
# Scan the same repositories for keywords related to modularization.
directory_path = './git_repos'
# Keywords are handed to find_pattern_types, which matches them as literal substrings,
# so one line can be reported once per keyword it contains.
modularization_keywords = ['template', 'include', 'wrapper', 'module', 'subworkflow']
output_file_name = 'modularization_concepts_occurrences'

# Writes ./results/modularization_concepts_occurrences.csv.
find_pattern_types(directory_path, modularization_keywords, output_file_name)

In [43]:
# Load the modularization keyword matches from the CSV written by the search.
mod_df = pd.read_csv('./results/modularization_concepts_occurrences.csv')
mod_df

Unnamed: 0,pipeline,pattern,file_path,line_number,line_content
0,airrflow,include,./git_repos\airrflow\main.nf,20,include { validateParameters; paramsHelp } fro...
1,airrflow,include,./git_repos\airrflow\main.nf,44,include { AIRRFLOW } from './workflows/airrflow'
2,airrflow,template,./git_repos\airrflow\modules\nf-core\custom\du...,23,template 'dumpsoftwareversions.py'
3,airrflow,include,./git_repos\airrflow\subworkflows\local\assemb...,5,include { VALIDATE_INPUT } from '../../modules...
4,airrflow,module,./git_repos\airrflow\subworkflows\local\assemb...,5,include { VALIDATE_INPUT } from '../../modules...
...,...,...,...,...,...
8721,viralrecon,module,./git_repos\viralrecon\workflows\nanopore.nf,86,include { CUSTOM_DUMPSOFTWAREVERSIONS } from...
8722,viralrecon,include,./git_repos\viralrecon\workflows\nanopore.nf,87,include { MOSDEPTH as MOSDEPTH_GENOME } from...
8723,viralrecon,module,./git_repos\viralrecon\workflows\nanopore.nf,87,include { MOSDEPTH as MOSDEPTH_GENOME } from...
8724,viralrecon,include,./git_repos\viralrecon\workflows\nanopore.nf,88,include { MOSDEPTH as MOSDEPTH_AMPLICON } from...


Most subworkflow files are in the module folder, so the `module` keyword might be redundant here?

In [44]:
# Number of matching rows per keyword, most frequent keyword first.
grouped_conc_df = (
    mod_df
    .groupby('pattern')
    .size()
    .reset_index(name='total_occurences')
    .sort_values(by='total_occurences', ascending=False)
)
grouped_conc_df

Unnamed: 0,pattern,total_occurences
0,include,4105
1,module,3691
2,subworkflow,498
3,template,418
4,wrapper,14


The `module` and `include` matches are duplicates: most of the subworkflow files are in a folder named 'module'. The previous parser only looked for exact matches of the pattern 'include {' and found 4094 matches.