# Benchmarking Transformer-based NER 

# Preprocess 

### Filter annotation files
* create a copy of annotation files by excluding: 
        * keywords_to_remove = {"Out-of-scope","LSF_out_of_context", "Geographical_Feature", "Occupations"}

In [13]:
import os

# Keywords that disqualify an annotation line: any line containing one of
# these strings is left out of the filtered copies.
keywords_to_remove = {
    "Out-of-scope",
    "LSF_out_of_context",
    "Geographical_Feature",
    "Occupations",
}

# Folder the categorized .ann files are read from
input_folder = '../../data/NER-Benchmarking/annotations/categorized/'
# Folder the filtered copies are written to
output_folder = '../../data/NER-Benchmarking/annotations/filtered/'

def remove_keywords(input_filepath, output_filepath):
    """Write a copy of a file without the lines that contain an excluded keyword.

    Every line of ``input_filepath`` is stripped of surrounding whitespace.
    A line is dropped when any entry of the module-level
    ``keywords_to_remove`` occurs anywhere in it (plain substring match);
    all other lines are written to ``output_filepath`` followed by a newline.

    Args:
        input_filepath: Path of the file to read.
        output_filepath: Path of the filtered copy (overwritten if present).
    """
    with open(input_filepath, 'r') as source, open(output_filepath, 'w') as target:
        for raw_line in source:
            stripped = raw_line.strip()

            # Look for the first excluded keyword that occurs in the line
            mentions_excluded = False
            for keyword in keywords_to_remove:
                if keyword in stripped:
                    mentions_excluded = True
                    break

            if mentions_excluded:
                continue
            target.write(stripped + '\n')

# Create the output folder if it doesn't exist. exist_ok=True replaces the
# separate existence check, so there is no window between checking and creating.
os.makedirs(output_folder, exist_ok=True)

# Write a filtered copy of every .ann file found in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith(".ann"):
        input_filepath = os.path.join(input_folder, filename)
        output_filepath = os.path.join(output_folder, filename)
        remove_keywords(input_filepath, output_filepath)

print("Files with keywords removed have been created in the output folder.")


Files with keywords removed have been created in the output folder.


### Prepare dataset
* 20% hold out test data
* 80% divided into 5 folds for cross-validation

* Split while considering the distribution of LSF types between train and test (the cell below reports the per-type counts for both sets)

In [3]:
import os
import shutil
import random
import random

# Set the random seed for reproducibility
# NOTE(review): os.listdir() returns entries in arbitrary order, so the seed
# reproduces the same split only where the directory listing order is the same.
random.seed(1361)
# Source directory containing your pairs of .txt and .ann files
source_directory_txt = '../../data/NER-Benchmarking/annotations/raw/'
source_directory_ann = '../../data/NER-Benchmarking/annotations/filtered/'

# Root destination directory for folds and test data
root_directory = '../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/'

# Create the root directory if it doesn't exist
os.makedirs(root_directory, exist_ok=True)

# Create a directory for the test set
test_directory = os.path.join(root_directory, 'test_data')
os.makedirs(test_directory, exist_ok=True)

# List all .txt files in the source directory
txt_files = [filename for filename in os.listdir(source_directory_txt) if filename.endswith(".txt")]

# Shuffle the list of .txt files randomly
# NOTE(review): the shuffled order is discarded by the re-listing further down.
# The call is kept on purpose: it advances the random generator, so removing it
# would change which files random.sample() picks for the test set.
random.shuffle(txt_files)

# Define the 9 keywords
keywords = {
    'Environmental_exposures',
    'Physical_activity',
    'Socioeconomic_factors',
    'Drugs',
    'Mental_health_practices',
    'Non_physical_leisure_time_activities',
    'Beauty_and_Cleaning',
    'Nutrition',
    'Sleep'
}


# List all .txt files in the source directory
txt_files = [filename for filename in os.listdir(source_directory_txt) if filename.endswith(".txt")]

# Draw a random 20% of the files as the held-out test set
test_size = int(0.20 * len(txt_files))
test_files = random.sample(txt_files, test_size)

# Everything that was not sampled is training data. Membership is tested against
# a set so the list of test files is not scanned once per file.
test_file_set = set(test_files)
train_files = [file for file in txt_files if file not in test_file_set]



# Create a dictionary to track keyword distribution in each fold
# NOTE(review): this dictionary is never updated in this cell; it is kept only
# in case another cell reads it.
keyword_counts = {keyword: [0] * 5 for keyword in keywords}


def count_keyword_annotations(txt_file_names):
    """Count annotations per keyword in the .ann files paired with the given .txt files.

    For each name in ``txt_file_names`` the matching ``.ann`` file is read from
    ``source_directory_ann``. A line is counted when its first
    whitespace-separated field starts with 'T' and its second field is one of
    ``keywords``.

    Args:
        txt_file_names: Iterable of .txt file names.

    Returns:
        dict mapping every keyword to the number of matching annotation lines.
    """
    counts = {keyword: 0 for keyword in keywords}
    for txt_file in txt_file_names:
        ann_file = txt_file.replace(".txt", ".ann")
        source_ann_path = os.path.join(source_directory_ann, ann_file)

        # Read the .ann file to identify the keywords
        with open(source_ann_path, 'r') as ann_file_content:
            for line in ann_file_content:
                fields = line.strip().split()
                # The length check protects against a 'T…' line without a second field
                if len(fields) > 1 and fields[0].startswith('T'):
                    keyword = fields[1]  # Assumes the keyword appears as the second field
                    if keyword in keywords:
                        counts[keyword] += 1
    return counts


# Calculate the distribution of keywords in the test and train sets
keyword_counts_test = count_keyword_annotations(test_files)
keyword_counts_train = count_keyword_annotations(train_files)

# Check keyword distribution in test and train sets
# NOTE(review): the numbers printed below are annotation counts (one per
# matching 'T' line), although the labels say "files".
print("Distribution of Keywords in Test Set:")
for keyword, count in keyword_counts_test.items():
    print(f"  {keyword}: {count} files")

print("\nDistribution of Keywords in Train Set:")
for keyword, count in keyword_counts_train.items():
    print(f"  {keyword}: {count} files")

# Copy test files to the test folder
for txt_file in test_files:
    ann_file = txt_file.replace(".txt", ".ann")
    dest_txt_path = os.path.join(test_directory, txt_file)
    dest_ann_path = os.path.join(test_directory, ann_file)
    shutil.copy(os.path.join(source_directory_txt, txt_file), dest_txt_path)
    shutil.copy(os.path.join(source_directory_ann, ann_file), dest_ann_path)

# Create directories for each fold in the train set
num_folds = 5
# NOTE(review): fold_size is not used by the assignment below
fold_size = len(train_files) // num_folds

for fold in range(num_folds):
    fold_directory = os.path.join(root_directory, f'fold_{fold + 1}')
    os.makedirs(fold_directory, exist_ok=True)

# Position of every file in the full listing, computed once instead of calling
# txt_files.index() (a linear scan) for every training file.
position_in_listing = {name: position for position, name in enumerate(txt_files)}

# Iterate over the .txt files in the train set and distribute them to folds
# NOTE(review): the fold is derived from the position in the full listing
# (test files included), so the folds are not guaranteed to be equally sized.
for txt_file in train_files:
    ann_file = txt_file.replace(".txt", ".ann")
    source_ann_path = os.path.join(source_directory_ann, ann_file)

    fold_idx = position_in_listing[txt_file] % num_folds
    fold_directory = os.path.join(root_directory, f'fold_{fold_idx + 1}')
    dest_txt_path = os.path.join(fold_directory, txt_file)
    dest_ann_path = os.path.join(fold_directory, ann_file)

    shutil.copy(os.path.join(source_directory_txt, txt_file), dest_txt_path)
    shutil.copy(source_ann_path, dest_ann_path)


# Totals over all keywords for each side of the split
total_keywords_test = sum(keyword_counts_test.values())
total_keywords_train = sum(keyword_counts_train.values())

for heading, total in (
    ("\nNumber of Keywords in Test Set:", total_keywords_test),
    ("\nNumber of Keywords in Train Set:", total_keywords_train),
):
    print(heading)
    print(f"Total: {total} files")

print("Folding complete.")


Distribution of Keywords in Test Set:
  Mental_health_practices: 11 files
  Non_physical_leisure_time_activities: 8 files
  Beauty_and_Cleaning: 26 files
  Drugs: 34 files
  Environmental_exposures: 25 files
  Socioeconomic_factors: 52 files
  Physical_activity: 21 files
  Nutrition: 66 files
  Sleep: 20 files

Distribution of Keywords in Train Set:
  Mental_health_practices: 63 files
  Non_physical_leisure_time_activities: 53 files
  Beauty_and_Cleaning: 30 files
  Drugs: 153 files
  Environmental_exposures: 174 files
  Socioeconomic_factors: 223 files
  Physical_activity: 201 files
  Nutrition: 379 files
  Sleep: 90 files

Number of Keywords in Test Set:
Total: 263 files

Number of Keywords in Train Set:
Total: 1366 files
Folding complete.


### Convert to CONLL format
* Note: this can be done even before splitting

In [None]:
import os
import subprocess

def convert_to_CONLL(root_folder, script_path, output_suffix, singletype=None):
    """Run the conversion script on every sub-directory of ``root_folder``.

    For each sub-directory ``<subdir>`` the script is invoked as
    ``python <script_path> [-1 <singletype>] <subdir path>`` and its standard
    output is written to ``<root_folder>/<subdir>_<output_suffix>.tsv``.
    Entries of ``root_folder`` that are not directories are ignored.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled correctly. A failing run does
    not stop the loop; a warning is printed instead.

    Args:
        root_folder: Directory whose sub-directories are converted.
        script_path: Path of the conversion script.
        output_suffix: Suffix used in the name of each output file.
        singletype: Optional value passed to the script's ``-1`` option;
            the option is omitted when this is empty or None.
    """
    for subdir in os.listdir(root_folder):
        subdir_path = os.path.join(root_folder, subdir)
        if not os.path.isdir(subdir_path):
            continue

        output_file = os.path.join(root_folder, f'{subdir}_{output_suffix}.tsv')

        command = ['python', str(script_path)]
        if singletype:
            command += ['-1', str(singletype)]
        command.append(subdir_path)

        # The file is opened first, so it exists even when the run fails
        # (the same outcome a shell redirection produces).
        with open(output_file, 'wb') as converted:
            try:
                return_code = subprocess.call(command, stdout=converted)
            except OSError as error:
                print(f"Warning: could not start {command[0]!r} for '{subdir_path}': {error}")
                continue

        if return_code != 0:
            print(f"Warning: conversion of '{subdir_path}' exited with status {return_code}")

# Inputs shared by both conversions
script_path = '../../standoff2conll/standoff2conll.py'
root_folder = '../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/'

# Conversion 1: pass "-1 LSF" to the script (single LSF type)
singletype = 'LSF'  # Replace with the desired singletype
output_suffix = 'merged_only_LSF'  # Customize the output file suffix
convert_to_CONLL(root_folder, script_path, output_suffix, singletype)

# Conversion 2: no "-1 LSF", so the LSF branches are mentioned explicitly
# instead of a single LSF type
output_suffix = 'merged_with_LSF_branches'
convert_to_CONLL(root_folder, script_path, output_suffix)


In [21]:
# Copy the converted test files to the Transformer-NER data folder
! cp ../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/test_data_merged_only_LSF.tsv  ../../S1000_Transformer_NER/data/
! cp ../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/test_data_merged_with_LSF_branches.tsv  ../../S1000_Transformer_NER/data/

#### Merge folds to create single train file
* to be used after grid search for training the final model 

In [16]:
import pandas as pd

def concatenate_files(file_names, output_file):
    """Concatenate files verbatim, in the given order, into ``output_file``.

    The files are copied byte for byte rather than parsed as tables. Parsing
    with ``pandas.read_csv`` drops blank lines by default (they separate
    sentences in CoNLL-style files), treats ``"`` tokens as quote characters
    and turns tokens such as ``NA`` into missing values, so a parse/write
    round trip does not preserve the data.

    A newline is appended after any file that does not end with one, so the
    last line of one file never merges with the first line of the next.

    Args:
        file_names: Paths of the files to concatenate, in output order.
        output_file: Path of the merged file (overwritten if it exists).
    """
    with open(output_file, 'wb') as merged:
        for file_name in file_names:
            with open(file_name, 'rb') as part:
                content = part.read()
            merged.write(content)
            if content and not content.endswith(b'\n'):
                merged.write(b'\n')



# Folder holding the per-fold TSV files
folds_root = '../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/'

# Merge the five "only LSF" fold files into one training file
file_names = [f'{folds_root}fold_{number}_merged_only_LSF.tsv' for number in range(1, 6)]
output_file = '../../S1000_Transformer_NER/data/train_data_merged_only_LSF.tsv'
concatenate_files(file_names, output_file)

# Merge the five "with LSF branches" fold files into one training file
file_names = [f'{folds_root}fold_{number}_merged_with_LSF_branches.tsv' for number in range(1, 6)]
output_file = '../../S1000_Transformer_NER/data/train_data_merged_with_LSF_branches.tsv'
concatenate_files(file_names, output_file)




### Create single tsv test file from txt files
* there will be 6 tab-separated columns: the first is the abstract id and the last is the content; the others are only there to be compatible with the requested format

In [18]:
import os

def create_single_tsv_from_txt_files(input_dir, output_tsv_file):
    """Collect every .txt file of a directory into one 6-column TSV file.

    Each .txt file yields one row: the file name, four fixed placeholder
    columns, and the file content wrapped in double quotes. The content is
    stripped of surrounding whitespace and its newline characters are removed
    (not replaced). Rows are joined with newlines; no newline is written
    after the last row.

    Args:
        input_dir: Directory containing the .txt files.
        output_tsv_file: Path of the TSV file to write.
    """
    # Fixed values for the four middle columns
    placeholder_columns = ["other_ids", "authors", "forum", "year"]

    def build_row(txt_name):
        # One TSV row for a single .txt file
        with open(os.path.join(input_dir, txt_name), 'r') as handle:
            text = handle.read().strip()
        text = text.replace('\n', '')
        return "\t".join([txt_name, *placeholder_columns, '"' + text + '"'])

    rows = [build_row(name) for name in os.listdir(input_dir) if name.endswith(".txt")]

    with open(output_tsv_file, 'w') as output_file:
        output_file.write("\n".join(rows))

    print(f"TSV file '{output_tsv_file}' created successfully.")

# Test-set abstracts in, single Tagger-format TSV out
output_tsv_file = '../../S1000_Transformer_NER/data/test_data_merged_Tagger_format.tsv'
input_dir = '../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/test_data/'
create_single_tsv_from_txt_files(input_dir=input_dir, output_tsv_file=output_tsv_file)


TSV file '../../S1000_Transformer_NER/data/test_data_merged_Tagger_format.tsv' created successfully.


# Train NER

### 1. Grid Search 

* To find best hyperparameters we use the second approach from the following options:

    *  In cross-validation, calculating precision and recall by summing TP (True Positives), FP (False Positives), and FN (False Negatives) across all folds can yield different results compared to averaging precision and recall across folds. Here's how they differ:

1. Averaging Precision and Recall Across Folds:
   - For each fold, you calculate precision and recall separately.
   - Then, you average the precision and recall values obtained from all folds.
   - This method gives equal weight to each fold, ensuring that the contribution of each fold to the final result is the same.

2. Summing TP, FP, and FN Across All Folds:
   - In this approach, you sum TP, FP, and FN across all folds to calculate the overall values for the entire dataset.
   - Afterward, you calculate precision and recall based on these summed values.
   - This method treats the entire dataset as a single unit and computes precision and recall as if it were a single large dataset.

   Here's how you can calculate precision and recall in the summing approach for each fold:

    Sum TP, FP, and FN across all folds:
        Calculate the total TP, FP, and FN across all folds by summing the corresponding counts from each fold.

    Calculate Precision and Recall:

        Once you have the total TP, FP, and FN counts, you can calculate precision and recall as follows:

        Precision = Total TP / (Total TP + Total FP)

        Recall = Total TP / (Total TP + Total FN)

The key difference lies in how they handle the division of the dataset. Here are some considerations for each approach:

- Averaging precision and recall across folds is useful when you want to evaluate the model's performance on each fold independently and then obtain an average performance metric. It provides insights into how well the model generalizes across different subsets of the data.

- Summing TP, FP, and FN across all folds treats the entire dataset as a whole, which can be beneficial when you want an overall assessment of the model's performance across the entire dataset. However, it may not provide information about how consistent the model's performance is across different subsets of data.



### 2. Train with final hyperparameters from grid search
* Data:
    * merge all 5 folds to create a merged file 
        * train file: '../../S1000_Transformer_NER/data/train_data_merged_with_LSF_branches.tsv'
        * test data: '../../S1000_Transformer_NER/data/test_data_merged_with_LSF_branches.tsv'

* Hyperparameters:
    * Best set from Grid Search
    * epoch:60, lr:1e-5, batch:16, seq:256

* Models:
    * RoBERTa large path:
        * running ./setup.sh in root path of S1000-transformer-ner repo downloads RoBERTa-large model in the root directory


* Run script:
    * update run-ner.sh for hyperparameters and paths
    * run '../../S1000_Transformer_NER/scripts/run-ner.sh'




### 3. Output files:


* 1. NER output which shows what entities are detected in the test set, in BIO format
    * '../../S1000_Transformer_NER/output/test_with_LSF_branches.tsv'

* 2. Performance results along with TP,FP,FN counts
    * '../../S1000_Transformer_NER/results/results_test_with_LSF_branches.csv'
    * results.columns=['experiment_ID','max_seq_length','model_name','num_train_epochs','learning_rate','batch_size','predict_position','train_data','test_data','method','prec','rec','F','TP','FP','FN']
    


# Get predictions using S1000 Transformer Tagger

* Note: The trained Transformer NER can be evaluated in 2 different ways:
    * 1. Evaluation script which is available in the NER repo and is run by default (the approach used for the grid search)
    * 2. The Transformer Tagger, which uses the model fine-tuned in the first step to tag (detect) entities in the text
            * This approach receives the input in a string-db format (a 6-column tsv file where each row is an abstract, the first column is the pmid and the last column is the abstract text)
            * This creates predictions in tagger format, which is a tsv file with the mentions, their offsets in the abstract, etc.
            * This file can be converted to the corresponding *.ann files, similar to what we did for the tagger matches, and then we can compare the manual annotations with the predicted *.ann files, similar to what we did for benchmarking
            * This approach produces improved results because we use the -o argument, which counts overlapping matches as matches; without this argument the results decrease significantly (the same can happen to the tagger results)

#### 1. Prepare files
* Modify script to point the required input files and trained model in the previous step:
    * '../S1000_Transformer_Tagger/scripts/run-bio-tagger.sh'
        1. Input files:
            * Test data file:
                * '../S1000_Transformer_NER/data/test_data_merged_Tagger_format.tsv' 
        
        2. Trained Model:
            * '../S1000_Transformer_NER/ner-models/s1000'
        

#### 2. Run script
* cd S1000-transformer-tagger/
    * ./scripts/run-bio-tagger.sh
        * this will create outputs in 
            * S1000-transformer-tagger/output/output-spans.tsv

#### 3. Convert predictions to *.ann files

In [22]:
import os
import pandas as pd

def create_ann_files_from_NER_output(input_file, output_ann_folder):
    """Convert a tab-separated span file into one .ann file per document.

    Steps:
      1. Read ``input_file``, drop blank lines and exact duplicate rows
         (first occurrence wins) and write the de-duplicated rows back to
         ``input_file``.
      2. Group the rows by the first column, with a trailing '.txt' removed.
      3. Write ``<output_ann_folder>/<document>.ann`` with one line per row:
         ``T<n>\\t<column 7> <column 4 - 1> <column 5>\\t<column 6>``, where
         ``n`` counts from 1 within each document.

    The .ann files are rewritten from scratch on every call, so running the
    function again does not append duplicate annotations. Rows with fewer
    than 7 columns are skipped with a warning.

    Args:
        input_file: Path of the span TSV file (rewritten without duplicates).
        output_ann_folder: Folder for the .ann files (created if missing).
    """
    # Rows are handled as raw text lines, so quote characters and values such
    # as "NA" in the mention text are preserved exactly.
    with open(input_file, 'r', encoding='utf-8-sig') as spans_file:
        unique_rows = list(dict.fromkeys(
            line.rstrip('\r\n') for line in spans_file if line.strip()
        ))

    with open(input_file, 'w', encoding='utf-8') as spans_file:
        spans_file.writelines(row + '\n' for row in unique_rows)

    os.makedirs(output_ann_folder, exist_ok=True)

    # Group per document, so the numbering is right even when the rows of one
    # document are not contiguous.
    annotations_by_document = {}
    for row in unique_rows:
        fields = row.strip().split('\t')
        if len(fields) < 7:
            print(f"Warning: skipping row with {len(fields)} column(s): {row!r}")
            continue

        # Remove the .txt extension from the file name (only when present)
        document = fields[0].rstrip()
        if document.endswith('.txt'):
            document = document[:-4]

        span_text = f'{fields[6]} {int(fields[3])-1} {int(fields[4])}'
        span_text = span_text.replace('\xa0', ' ')  # Replace non-breaking space with a normal space

        document_lines = annotations_by_document.setdefault(document, [])
        document_lines.append(f'T{len(document_lines) + 1}\t{span_text}\t{fields[5]}\n')

    for document, document_lines in annotations_by_document.items():
        ann_path = os.path.join(output_ann_folder, document + '.ann')
        with open(ann_path, 'w', encoding='utf-8') as ann_file:
            ann_file.writelines(document_lines)


# Tagger span output in, predicted .ann files out
output_ann_folder = '../../S1000_Transformer_Tagger/output/transformer_predicted_ann_files/'
input_file = '../../S1000_Transformer_Tagger/output/output-spans.tsv'
create_ann_files_from_NER_output(input_file=input_file, output_ann_folder=output_ann_folder)



#### 4. Create empty ann files for those abstracts that don't have them because NER has not found any matches


In [23]:


import os

def create_missing_ann_files(ann_dir,txt_dir):
    """Create an empty .ann file for every .txt file that has none.

    For each ``<name>.txt`` in ``txt_dir`` an empty ``<name>.ann`` is created
    in ``ann_dir`` unless a file with that name already exists there.
    Existing .ann files are left untouched.

    Args:
        ann_dir: Directory holding the .ann files.
        txt_dir: Directory holding the .txt files.
    """
    for entry in os.listdir(txt_dir):
        if not entry.endswith('.txt'):
            continue

        # Same base name as the .txt file, but inside ann_dir and with .ann
        stem, _extension = os.path.splitext(os.path.join(ann_dir, entry))
        target = stem + '.ann'
        if os.path.exists(target):
            continue

        # Opening for writing and closing right away leaves an empty file
        open(target, 'w').close()

# Test-set abstracts and the folder with the predicted annotations
txt_dir ='../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/test_data/'
ann_dir = '../../S1000_Transformer_Tagger/output/transformer_predicted_ann_files/'

create_missing_ann_files(ann_dir=ann_dir, txt_dir=txt_dir)


#### 5.calculate performance in comparison with manual annotations:


In [26]:
#with different LSF branches 
!python2 ../../utils/IAA.py -o -d -v     '../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/test_data/'    '../../S1000_Transformer_Tagger/output/transformer_predicted_ann_files/'  --allowmissing > ../../S1000_Transformer_NER/results/Transformer_Tagger_Performance_test_data_with_LSF_types.txt 2>&1

#using -i to ignore the different LSF branches and consider a single LSF entity type
!python2 ../../utils/IAA.py -o -d -v  -i  '../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/test_data/'    '../../S1000_Transformer_Tagger/output/transformer_predicted_ann_files/'  --allowmissing > ../../S1000_Transformer_NER/results/Transformer_Tagger_Performance_test_data_single_LSF_type.stv 2>&1


#### 6.produce tagger matches using dictionary based-NER-matches for test data

In [29]:
#!python2 ../../../utils/IAA.py -o -d -v   '../../data/s1000_ner/balanced_LSF_types_splits/test_data/'   "../../data/200_abstracts/taggers_matches/brat_files/" --allowmissing > ../../data/s1000_ner/results/performance_Tagger_for_Test_Data.txt 2>&1
!python2 ../../utils/IAA.py -o -d -v   '../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/test_data/'   '../../data/NER-Benchmarking/tagger/tagger_output/brat_files/'  >../../S1000_Transformer_NER/results/Dictionary_based_Tagger_Performance_test_data_with_LSF_types.txt 2>&1



# Plot

* extract metrics from text files

In [None]:
import pandas as pd
import re

def extract_data_and_save(file_path):
    """Parse per-branch precision, recall and F from a text report.

    Lines of the form
    ``TYPE: <name> precision <p>% (a/b) recall <r>% (c/d) F <f>``
    are extracted; only the nine lifestyle-factor branch names listed below
    are kept. Despite its name, this function does not save anything: the
    caller writes the returned DataFrame to disk.

    Rows are collected in a list and the DataFrame is built once, because
    ``DataFrame.append`` is not available in pandas 2.0 and later.

    Args:
        file_path: Path of the text report.

    Returns:
        pandas.DataFrame with the columns 'Lifestyle-factor branch',
        'Precision', 'Recall' and 'F', sorted by 'F' in ascending order.
    """
    columns = ['Lifestyle-factor branch', 'Precision', 'Recall', 'F']
    lsf_branches = {'Beauty_and_Cleaning', 'Drugs', 'Environmental_exposures',
                    'Mental_health_practices', 'Non_physical_leisure_time_activities',
                    'Nutrition', 'Physical_activity', 'Sleep', 'Socioeconomic_factors'}
    # Compiled once instead of once per line
    line_pattern = re.compile(
        r'TYPE:\s+(.*?)\s+precision\s+([\d.]+%)\s+\((\d+)/(\d+)\)\s+recall\s+([\d.]+%)\s+\((\d+)/(\d+)\)\s+F\s+([\d.]+)'
    )

    # Read the text from the file
    with open(file_path, 'r') as file:
        text = file.read()

    rows = []
    for line in text.strip().split('\n'):
        match = line_pattern.match(line)
        if not match:
            continue

        branch, precision, _, _, recall, _, _, f_score = match.groups()
        if branch not in lsf_branches:
            continue

        rows.append({
            'Lifestyle-factor branch': branch.strip(),
            # Remove the percentage symbols and convert to floats
            'Precision': float(precision.rstrip('%')),
            'Recall': float(recall.rstrip('%')),
            'F': float(f_score),
        })

    # Passing the columns explicitly keeps the layout when no line matched
    pr_rec_per_lsf_branch = pd.DataFrame(rows, columns=columns)

    # Sort the DataFrame by the 'F' column in ascending order
    return pr_rec_per_lsf_branch.sort_values(by='F')

# Dictionary-based NER on the test data: parse the full report, then store the
# per-branch scores as TSV
Dict_based_NER_file_path = '../../S1000_Transformer_NER/results/Dictionary_based_Tagger_Performance_test_data_with_LSF_types_prec_rec.tsv'
file_path = '../../S1000_Transformer_NER/results/Dictionary_based_Tagger_Performance_test_data_with_LSF_types.txt'
resulting_df = extract_data_and_save(file_path)
resulting_df.to_csv(Dict_based_NER_file_path, sep='\t', index=None)

# S1000 Transformer NER on the test data: same two steps
Transformer_based_NER_file_path = '../../S1000_Transformer_NER/results/Transformer_Tagger_Performance_test_data_with_LSF_types_prec_rec.tsv'
file_path = '../../S1000_Transformer_NER/results/Transformer_Tagger_Performance_test_data_with_LSF_types.txt'
resulting_df = extract_data_and_save(file_path)
resulting_df.to_csv(Transformer_based_NER_file_path, sep='\t', index=None)




* Modify labels to be appropriate for plotting


In [36]:

import pandas as pd

def replace_phrases_in_dataframe(input_file_path, phrase_replacements):
    """Rename values of the 'Lifestyle-factor branch' column of a TSV file in place.

    The file is read, every value of the column that equals a key of
    ``phrase_replacements`` is replaced by the mapped value, and the result
    is written back to the same path without an index column.

    Args:
        input_file_path: Path of the TSV file (read and overwritten).
        phrase_replacements: dict mapping current labels to new labels.
    """
    branch_column = 'Lifestyle-factor branch'

    table = pd.read_csv(input_file_path, sep='\t')
    relabelled = table[branch_column].replace(phrase_replacements)
    table[branch_column] = relabelled

    # Overwrite the input file with the relabelled table
    table.to_csv(input_file_path, sep='\t', index=False)


# Display labels for the branch names that differ from their raw form
phrase_replacements = {
    'Environmental_exposures': 'Environmental exposures',
    'Physical_activity': 'Physical activities',
    'Socioeconomic_factors': 'Socioeconomic factors',
    'Drugs': 'Substance use',
    'Mental_health_practices': 'Mental health practices',
    'Non_physical_leisure_time_activities': 'Non physical leisure time activities',
    'Beauty_and_Cleaning': 'Beauty and cleaning'
}

# Relabel both score files in place
for scores_path in (Dict_based_NER_file_path, Transformer_based_NER_file_path):
    replace_phrases_in_dataframe(scores_path, phrase_replacements)

# Load both score tables; the transformer columns get a suffix so the two
# sets of scores can sit side by side after the merge
df_tagger = pd.read_csv(Dict_based_NER_file_path, sep='\t')
df2_transformer = pd.read_csv(Transformer_based_NER_file_path, sep='\t')
df2_transformer.columns = ['Lifestyle-factor branch'] + [
    f'{col}_transformer'
    for col in df2_transformer.columns
    if col != 'Lifestyle-factor branch'
]

merged_df = df_tagger.merge(df2_transformer, on='Lifestyle-factor branch')

# File that receives the merged, ordered table
merged_file ='../../S1000_Transformer_NER/results/plot_input_compare_DIC_VS_Transformer.tsv'

# Row order for the plot; branch names are compared case-insensitively
desired_order = ['nutrition', 'socioeconomic factors', 'environmental exposures', 'substance use','physical activities', 'beauty and cleaning',  'non physical leisure time activities',  'sleep' , 'mental health practices']
order_lookup = {value.lower(): index for index, value in enumerate(desired_order)}
merged_df['Order'] = merged_df['Lifestyle-factor branch'].str.lower().map(order_lookup)

# Sort on the helper column, then drop it before saving
merged_df_sorted = merged_df.sort_values(by='Order').drop('Order', axis=1)
merged_df_sorted.to_csv(merged_file, sep='\t', index=False)



* plot by comparing Tagger NER with Transformer NER
    


In [38]:

#!python3 ../plot_prec_rec_Deep_vs_Tagger.py --input_file=../../data/s1000_ner/results/plot_input_compare_DIC_VS_Transformer.tsv --task="Performance of Transformer-based and Dictonary-based NER (Test-Data)" --output_file=../../data/s1000_ner/results/plot_Tagger_VS_Transformer_NER.png
!python3 ../NER-Benchmarking/scripts/plot_prec_rec_Deep_vs_Tagger.py  --input_file=../../S1000_Transformer_NER/results/plot_input_compare_DIC_VS_Transformer.tsv --task="" --output_file=../../plots/plot_Tagger_VS_Transformer_NER.png


Figure(1200x1200)
