# Benchmarking Transformer-based NER 

In [9]:
%%bash
cd ../../
pwd
git submodule add https://github.com/EsmaeilNourani/S1000-transformer-ner.git   S1000_Transformer_NER
git add .
git commit -m "S1000-transformer-ner as a submodule in S1000_Transformer_NER directory."
git push


In [12]:
%%bash
cd ../../
pwd
git submodule add https://github.com/EsmaeilNourani/Lifestyle-factors-classification.git   LSFC
git add .
git commit -m "LSFC as a submodule in LSFC directory."
git push



In [None]:
%%bash
cd ../../
pwd
git submodule add https://github.com/jouniluoma/S1000-transformer-tagger.git   S1000_Transformer_Tagger
git add .
git commit -m "S1000-transformer-tagger as a submodule in S1000_Transformer_Tagger directory."
git push


In [None]:
%%bash
cd ../../
pwd
git submodule add https://github.com/spyysalo/standoff2conll.git   standoff2conll
git add .
git commit -m "standoff2conll as a submodule in standoff2conll directory."
git push


# Preprocess 

### Filter annotation files
* create a copy of annotation files by excluding: 
        * keywords_to_remove = {"Out-of-scope","LSF_out_of_context", "Geographical_Feature", "Occupations"}

In [13]:
import os

# Input folder containing .ann files
input_folder = '../../data/NER-Benchmarking/annotations/categorized/'
# Output folder for modified files
output_folder = '../../data/NER-Benchmarking/annotations/filtered/'

# Keywords to remove
keywords_to_remove = {"Out-of-scope","LSF_out_of_context", "Geographical_Feature", "Occupations"}

def remove_keywords(input_filepath, output_filepath):
    with open(input_filepath, 'r') as input_file, open(output_filepath, 'w') as output_file:
        for line in input_file:
            line = line.strip()
            if not any(keyword in line for keyword in keywords_to_remove):
                output_file.write(line + '\n')

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Iterate through .ann files in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith(".ann"):
        input_filepath = os.path.join(input_folder, filename)
        output_filepath = os.path.join(output_folder, filename)
        remove_keywords(input_filepath, output_filepath)

print("Files with keywords removed have been created in the output folder.")


Files with keywords removed have been created in the output folder.


### Prepare dataset
* 20% hold out test data
* 80% divided into 5 fold for cross validation 

* Split with considering the LSF types distribution between train and test

In [3]:
import os
import shutil
import random
import random

# Set the random seed for reproducibility
random.seed(1361)
# Source directory containing your pairs of .txt and .ann files
source_directory_txt = '../../data/NER-Benchmarking/annotations/raw/'
source_directory_ann = '../../data/NER-Benchmarking/annotations/filtered/'

# Root destination directory for folds and test data
root_directory = '../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/'

# Create the root directory if it doesn't exist
os.makedirs(root_directory, exist_ok=True)

# Create a directory for the test set
test_directory = os.path.join(root_directory, 'test_data')
os.makedirs(test_directory, exist_ok=True)

# List all .txt files in the source directory
txt_files = [filename for filename in os.listdir(source_directory_txt) if filename.endswith(".txt")]

# Shuffle the list of .txt files randomly
random.shuffle(txt_files)

# Define the 9 keywords
keywords = {
    'Environmental_exposures',
    'Physical_activity',
    'Socioeconomic_factors',
    'Drugs',
    'Mental_health_practices',
    'Non_physical_leisure_time_activities',
    'Beauty_and_Cleaning',
    'Nutrition',
    'Sleep'
}


# List all .txt files in the source directory
txt_files = [filename for filename in os.listdir(source_directory_txt) if filename.endswith(".txt")]

# Shuffle the list of .txt files randomly while keeping the test size at 20%
test_size = int(0.20 * len(txt_files))
test_files = random.sample(txt_files, test_size)
train_files = [file for file in txt_files if file not in test_files]



# Create a dictionary to track keyword distribution in each fold
keyword_counts = {keyword: [0] * 5 for keyword in keywords}

# Calculate the distribution of keywords in the test set
keyword_counts_test = {keyword: 0 for keyword in keywords}

for txt_file in test_files:
    ann_file = txt_file.replace(".txt", ".ann")
    source_ann_path = os.path.join(source_directory_ann, ann_file)

    # Read the .ann file to identify the keywords
    with open(source_ann_path, 'r') as ann_file_content:
        for line in ann_file_content:
            fields = line.strip().split()
            if fields and fields[0].startswith('T'):
                keyword = fields[1]  # Assumes the keyword appears as the second field
                if keyword in keywords:
                    keyword_counts_test[keyword] += 1

# Calculate the distribution of keywords in the train set
keyword_counts_train = {keyword: 0 for keyword in keywords}

for txt_file in train_files:
    ann_file = txt_file.replace(".txt", ".ann")
    source_ann_path = os.path.join(source_directory_ann, ann_file)

    # Read the .ann file to identify the keywords
    with open(source_ann_path, 'r') as ann_file_content:
        for line in ann_file_content:
            fields = line.strip().split()
            if fields and fields[0].startswith('T'):
                keyword = fields[1]  # Assumes the keyword appears as the second field
                if keyword in keywords:
                    keyword_counts_train[keyword] += 1

# Check keyword distribution in test and train sets
print("Distribution of Keywords in Test Set:")
for keyword, count in keyword_counts_test.items():
    print(f"  {keyword}: {count} files")

print("\nDistribution of Keywords in Train Set:")
for keyword, count in keyword_counts_train.items():
    print(f"  {keyword}: {count} files")

# Copy test files to the test folder
for txt_file in test_files:
    ann_file = txt_file.replace(".txt", ".ann")
    dest_txt_path = os.path.join(test_directory, txt_file)
    dest_ann_path = os.path.join(test_directory, ann_file)
    shutil.copy(os.path.join(source_directory_txt, txt_file), dest_txt_path)
    shutil.copy(os.path.join(source_directory_ann, ann_file), dest_ann_path)

# Create directories for each fold in the train set
num_folds = 5
fold_size = len(train_files) // num_folds

for fold in range(num_folds):
    fold_directory = os.path.join(root_directory, f'fold_{fold + 1}')
    os.makedirs(fold_directory, exist_ok=True)

# Iterate over the .txt files in the train set and distribute them to folds
for txt_file in train_files:
    ann_file = txt_file.replace(".txt", ".ann")
    source_ann_path = os.path.join(source_directory_ann, ann_file)

    fold_idx = txt_files.index(txt_file) % num_folds
    fold_directory = os.path.join(root_directory, f'fold_{fold_idx + 1}')
    dest_txt_path = os.path.join(fold_directory, txt_file)
    dest_ann_path = os.path.join(fold_directory, ann_file)

    shutil.copy(os.path.join(source_directory_txt, txt_file), dest_txt_path)
    shutil.copy(source_ann_path, dest_ann_path)


print("\nNumber of Keywords in Test Set:")
total_keywords_test = sum(keyword_counts_test.values())

print(f"Total: {total_keywords_test} files")

print("\nNumber of Keywords in Train Set:")
total_keywords_train = sum(keyword_counts_train.values())
print(f"Total: {total_keywords_train} files")

print("Folding complete.")


Distribution of Keywords in Test Set:
  Mental_health_practices: 11 files
  Non_physical_leisure_time_activities: 8 files
  Beauty_and_Cleaning: 26 files
  Drugs: 34 files
  Environmental_exposures: 25 files
  Socioeconomic_factors: 52 files
  Physical_activity: 21 files
  Nutrition: 66 files
  Sleep: 20 files

Distribution of Keywords in Train Set:
  Mental_health_practices: 63 files
  Non_physical_leisure_time_activities: 53 files
  Beauty_and_Cleaning: 30 files
  Drugs: 153 files
  Environmental_exposures: 174 files
  Socioeconomic_factors: 223 files
  Physical_activity: 201 files
  Nutrition: 379 files
  Sleep: 90 files

Number of Keywords in Test Set:
Total: 263 files

Number of Keywords in Train Set:
Total: 1366 files
Folding complete.


### Convert to CONLL format
* Note: this can be done even before spliiting

In [None]:
import os
import subprocess

def convert_to_CONLL(root_folder, script_path, output_suffix, singletype=None):
    for subdir in os.listdir(root_folder):
        subdir_path = os.path.join(root_folder, subdir)
        if os.path.isdir(subdir_path):
            output_file = os.path.join(root_folder, f'{subdir}_{output_suffix}.tsv')
            if singletype:
                command = f"python  {script_path}  -1 {singletype} {subdir_path} > {output_file}"
                subprocess.call(command, shell=True)
            else:
                command = f"python  {script_path}  {subdir_path} > {output_file}"
                subprocess.call(command, shell=True)

# Example usage:
root_folder = '../../data/NER-Benchmarking/annotations/balanced_LSF_types_splits/'
script_path = './standoff2conll/standoff2conll.py'
output_suffix = 'merged_only_LSF'  # Customize the output file suffix

# Call the function with -1 LSF in arguments
singletype = 'LSF'  # Replace with the desired singletype
convert_to_CONLL(root_folder, script_path, output_suffix, singletype)

# Call the function without -1 LSF in arguments
# removed -1 LSF to explicitly mention LSF branch instead of single LSF type
output_suffix='merged_with_LSF_branches'
convert_to_CONLL(root_folder, script_path, output_suffix)


# Merge folds to create single train file
    * to be used after grid search for training the final model 

In [33]:
import pandas as pd

def concatenate_files(file_names, output_file):
    # Initialize an empty DataFrame to store the concatenated data
    concatenated_df = pd.DataFrame()

    # Loop through each file and concatenate its contents to the DataFrame
    for file_name in file_names:
        # Assuming that the first row contains column headers
        df = pd.read_csv(file_name, sep='\t', header=None)
        concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

    # Save the concatenated DataFrame to a new TSV file
    concatenated_df.to_csv(output_file, sep='\t', index=False, header=None)



# List of file names to concatenate
file_names = ["../../data/s1000_ner/balanced_LSF_types_splits/fold_1_merged_only_LSF.tsv",
              "../../data/s1000_ner/balanced_LSF_types_splits/fold_2_merged_only_LSF.tsv",
              "../../data/s1000_ner/balanced_LSF_types_splits/fold_3_merged_only_LSF.tsv",
              "../../data/s1000_ner/balanced_LSF_types_splits/fold_4_merged_only_LSF.tsv",
              "../../data/s1000_ner/balanced_LSF_types_splits/fold_5_merged_only_LSF.tsv"]
              

# Output file name
output_file = "../../data/s1000_ner/balanced_LSF_types_splits/train_data_merged_only_LSF.tsv"

# Call the function to concatenate files
concatenate_files(file_names, output_file)



# List of file names to concatenate
file_names = ['../../data/s1000_ner/balanced_LSF_types_splits/fold_1_merged_with_LSF_branches.tsv',
              '../../data/s1000_ner/balanced_LSF_types_splits/fold_2_merged_with_LSF_branches.tsv',
              '../../data/s1000_ner/balanced_LSF_types_splits/fold_3_merged_with_LSF_branches.tsv',
              '../../data/s1000_ner/balanced_LSF_types_splits/fold_4_merged_with_LSF_branches.tsv',
              '../../data/s1000_ner/balanced_LSF_types_splits/fold_5_merged_with_LSF_branches.tsv']

# Output file name
output_file = "../../data/s1000_ner/balanced_LSF_types_splits/train_data_merged_with_LSF_branches.tsv"

# Call the function to concatenate files
concatenate_files(file_names, output_file)




# Transfer data to computerome

* data dir on computerome:
    * /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/data

* script to run NER on computerome:
    * /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/scripts/run-ner.sh

# Documentation of Grid Search 

* To find best hyperparameters we use the second approach from the following options:

In cross-validation, calculating precision and recall by summing TP (True Positives), FP (False Positives), and FN (False Negatives) across all folds can yield different results compared to averaging precision and recall across folds. Here's how they differ:

1. Averaging Precision and Recall Across Folds:
   - For each fold, you calculate precision and recall separately.
   - Then, you average the precision and recall values obtained from all folds.
   - This method gives equal weight to each fold, ensuring that the contribution of each fold to the final result is the same.

2. Summing TP, FP, and FN Across All Folds:
   - In this approach, you sum TP, FP, and FN across all folds to calculate the overall values for the entire dataset.
   - Afterward, you calculate precision and recall based on these summed values.
   - This method treats the entire dataset as a single unit and computes precision and recall as if it were a single large dataset.

   Here's how you can calculate precision and recall in the summing approach for each fold:

    Sum TP, FP, and FN across all folds:
        Calculate the total TP, FP, and FN across all folds by summing the corresponding counts from each fold.

    Calculate Precision and Recall:

        Once you have the total TP, FP, and FN counts, you can calculate precision and recall as follows:

        Precision = Total TP / (Total TP + Total FP)

        Recall = Total TP / (Total TP + Total FN)

The key difference lies in how they handle the division of the dataset. Here are some considerations for each approach:

- Averaging precision and recall across folds is useful when you want to evaluate the model's performance on each fold independently and then obtain an average performance metric. It provides insights into how well the model generalizes across different subsets of the data.

- Summing TP, FP, and FN across all folds treats the entire dataset as a whole, which can be beneficial when you want an overall assessment of the model's performance across the entire dataset. However, it may not provide information about how consistent the model's performance is across different subsets of data.



## Update scripts
* update ner_hf_trainer.py  (/home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner)
    * we replaced this 
    * results.append([conlleval.metrics(c)[0].prec, conlleval.metrics(c)[0].rec, conlleval.metrics(c)[0].fscore]) 
    * with 
    * results.append([conlleval.metrics(c)[0].prec, conlleval.metrics(c)[0].rec, conlleval.metrics(c)[0].fscore,conlleval.metrics(c)[0].tp,conlleval.metrics(c)[0].fp,conlleval.metrics(c)[0].fn])
    * to also report tp,fp,fn to be able to calculate Total metrics
    * Note: [0] in these lines shows the metric calculate for overal types and not specific entity type (here we have only one type: LSF)
        * overal metric is calculated by Report funstion in  conlleval.py where it returns   overall, by_type = metrics(counts)
        * so [0] refers to overal
 
    * Note: also in ner_hf_trainer.py 3 different evaluation type is introduced and reported in the result file:
        * method_names = ['CMV','CMVP','F']
        * for each type a line as appended in the result file, so for each fold (in cross validation) there will be three lines in the result file



# Computerome

### Summary:
1. Grid Search
    * qsub  qsub_ner_grid.sh to do grid search
    * raw results will be saved to /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/results/results-new-grid.csv
    * then we summ TP,FP, Fn and calculate overall results and save it in local dir: '../../data/s1000_ner/results/results-new-grid-summed.csv'
2. Run with final hyperparametrs from grid search
    * merge all 5 folds to create a merged train file: /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/data/train_data_merged.tsv
    * test data: /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/data/test_data_merged.tsv
    * qsub qsub_ner_train_test.sh
    * results will in /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/output/output_final_test

LLM Models:
* RoBERTa large path on computerome
    * /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Voc-hf
* Finetuned model path:
    * /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/S1000-transformer-ner

* iqsub with gpu

* Activate env
    * module load tools
    * cd /home/projects/ku_10024/scratch/esmaeil/s1000
    * conda  activate /home/projects/ku_10024/scratch/esmaeil/s1000/s1000
        * or source activate /home/projects/ku_10024/scratch/esmaeil/s1000/s1000
    * cd S1000-transformer-ner/
    * 


## Modified Files and Scripts:


### result file:
* I chnaged this line in ner_hf_trainer.py

    * with open(result_file, 'w+') as f:
    * to 
    * with open(result_file, 'a') as f:
    * to append results for different folds and also different set of hyperparameters

* /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/results/results-output.csv
* each row contains:
    * args.output_file,
    * args.max_seq_length, 
    * args.model_name, 
    * args.num_train_epochs, 
    * args.learning_rate,
    * args.batch_size,
    * args.predict_position,
    * args.train_data,
    * args.test_data,
    * method_name (one of ['CMV','CMVP','F']) so there will three lines
    * prec
    * rec
    * fscore
    * tp
    * fp
    * fn
    * example: output	128	RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Voc-hf	10.0	3E-05	4	0	data/train.tsv	data/dev.tsv	F	0.5689655172413793	0.5689655172413793	0.5689655172413793	66	50	50


### Modified /scripts/run-ner.sh

* There are three different versions of this script in this path:
    * /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/scripts

1. For single run: run-ner-single.sh
    * ./scripts/run-ner-single.sh


2. For folds: run-ner-folds.sh
    * ./scripts/run-ner-folds.sh


3. For grid search: run-ner-grid.sh
    * normally it will be executed using qsub: qsub run-ner-grid.sh
    * checkjob -v  46045344 to see the dedicated resources
    


## Output files:


* 1. /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/output/ 
    * contain three files for actuall NER output whcih shows what entities are detected in th BIO format

* 2. /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/results
    * contains results-output.csv file which has all performance results along with TP,FP,FN counts for different experiments


# Process results from computerome

* Possible explanations for different metrics provided in the result files:
    * method_names = ['CMV','CMVP','F']

    * 'F':  'F' likely stands for "First" or "First Prediction." This means that the 'F' method represents a strategy where the NER label for each token is determined based on the label from the first prediction (the initial prediction) for that token. In contrast to the other methods that involve voting or accumulating probabilities, this strategy simply takes the label from the first prediction as the final label for each token.
    * 'CMV': This likely stands for "Consensus Most Voted" or a similar term. It corresponds to the strategy where the NER label for each token is determined by selecting the label that occurs most frequently among all the model's predictions for that token. This method is represented by the pr_ensemble variable and is based on the "First tag then vote" strategy mentioned in your comments.
    * 'CMVP': This likely stands for "Consensus Most Voted with Probabilities" or something similar. It corresponds to the strategy where the NER label for each token is determined by selecting the label with the highest accumulated probability among all the model's predicted probabilities for that token. This method is represented by the prob_ensemble variable and is based on the "Accumulate probabilities, then vote" strategy mentioned in your comments.

* check job status:
    * checkjob -v 46215868
    * qstat -f 46215868
* kill job 
    * mjobctl -c <jobid> or canceljob <jobid>
    * Force cancel job, try this if regular cancel fails
    * mjobctl -F <jobid>
    mjobctl -F  46221248
        

###### ______________________________________

# Final Grid Search

###### ______________________________________

* 1. ssh C2
* 2. cd /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/scripts
* 3. qsub qsub_ner_grid_with_LSF_branches.sh
* 4. output file is copied from computerome(/home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/results) to '../../data/s1000_ner/results/results-new-grid.csv'

In [125]:
#results=pd.read_csv('../../data/s1000_ner/results/results-output.csv',sep='\t')
results=pd.read_csv('../../data/s1000_ner/results/results-new-grid.csv')
results.columns=['experiment_ID','max_seq_length','model_name','num_train_epochs','learning_rate','batch_size','predict_position','train_data','test_data','method','prec','rec','F','TP','FP','FN']
# # remove old results from the file
# # Drop the first n rows
# results = results.drop(results.index[:182])
# # Reset the index if needed
# results = results.reset_index(drop=True)

# results.to_csv('../../data/s1000_ner/results/results-output.csv', index=False)

In [126]:
results.shape

(326, 16)

In [127]:
results.head()

Unnamed: 0,experiment_ID,max_seq_length,model_name,num_train_epochs,learning_rate,batch_size,predict_position,train_data,test_data,method,prec,rec,F,TP,FP,FN
0,fold_1_epoch_20_lr1e-5_batch16_seq128,128,RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Vo...,20.0,1e-05,16,0,/home/projects/ku_10024/scratch/esmaeil/s1000/...,/home/projects/ku_10024/scratch/esmaeil/s1000/...,CMVP,0.61039,0.69802,0.65127,141,90,61
1,fold_1_epoch_20_lr1e-5_batch16_seq128,128,RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Vo...,20.0,1e-05,16,0,/home/projects/ku_10024/scratch/esmaeil/s1000/...,/home/projects/ku_10024/scratch/esmaeil/s1000/...,F,0.601732,0.688119,0.642032,139,92,63
2,fold_2_epoch_20_lr1e-5_batch16_seq128,128,RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Vo...,20.0,1e-05,16,0,/home/projects/ku_10024/scratch/esmaeil/s1000/...,/home/projects/ku_10024/scratch/esmaeil/s1000/...,CMV,0.663185,0.709497,0.68556,254,129,104
3,fold_2_epoch_20_lr1e-5_batch16_seq128,128,RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Vo...,20.0,1e-05,16,0,/home/projects/ku_10024/scratch/esmaeil/s1000/...,/home/projects/ku_10024/scratch/esmaeil/s1000/...,CMVP,0.664062,0.712291,0.687332,255,129,103
4,fold_2_epoch_20_lr1e-5_batch16_seq128,128,RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Vo...,20.0,1e-05,16,0,/home/projects/ku_10024/scratch/esmaeil/s1000/...,/home/projects/ku_10024/scratch/esmaeil/s1000/...,F,0.650131,0.695531,0.672065,249,134,109


In [128]:
# remove fold# number from experimentd ID
results['experiment_ID'] = results['experiment_ID'].str.replace(r'fold_[0-5]_', '')

  results['experiment_ID'] = results['experiment_ID'].str.replace(r'fold_[0-5]_', '')


In [129]:
# calculate total TP,FP,FN for 5 folds for different experiments and evaluation methods
grouped_results = results.groupby(['experiment_ID', 'method'])[[ 'TP', 'FP', 'FN']].sum().reset_index()

In [130]:

# Calculate precision, recall, and F-score
grouped_results['Precision'] = grouped_results['TP'] / (grouped_results['TP'] + grouped_results['FP'])
grouped_results['Recall'] = grouped_results['TP'] / (grouped_results['TP'] + grouped_results['FN'])
grouped_results['F_score'] = 2 * (grouped_results['Precision'] * grouped_results['Recall']) / (grouped_results['Precision'] + grouped_results['Recall'])


In [131]:
results_sorted = grouped_results.sort_values(by='F_score', ascending=False)

In [132]:
results_sorted.head(30)

Unnamed: 0,experiment_ID,method,TP,FP,FN,Precision,Recall,F_score
51,epoch_60_lr1e-5_batch16_seq256,CMV,572,338,306,0.628571,0.651481,0.639821
52,epoch_60_lr1e-5_batch16_seq256,CMVP,572,340,306,0.627193,0.651481,0.639106
6,epoch_20_lr3e-5_batch16_seq128,CMV,696,402,396,0.63388,0.637363,0.635616
50,epoch_60_lr1e-5_batch16_seq128,F,581,370,297,0.610936,0.661731,0.63532
54,epoch_60_lr3e-5_batch16_seq128,CMV,553,311,325,0.640046,0.629841,0.634902
55,epoch_60_lr3e-5_batch16_seq128,CMVP,558,323,320,0.633371,0.635535,0.634451
7,epoch_20_lr3e-5_batch16_seq128,CMVP,699,413,393,0.628597,0.64011,0.634301
8,epoch_20_lr3e-5_batch16_seq128,F,702,428,390,0.621239,0.642857,0.631863
46,epoch_50_lr3e-5_batch16_seq256,CMVP,687,408,393,0.627397,0.636111,0.631724
25,epoch_40_lr1e-5_batch16_seq128,CMVP,1289,826,681,0.609456,0.654315,0.631089


In [102]:
results_sorted.to_csv('../../data/s1000_ner/results/results-new-grid-summed.csv',sep='\t',index=None)

* Best hyperparameter set:
    * epoch_60_lr1e-5_batch16_seq256	


# Final train 

* Note: I combined this part with Transformer Tagger in the next section, which trains at the same time and predicts for test data using tagger format input
* So we can skipp this standalone training
* Train model using best hyperparameter set found in the previous section
    *     epoch_60_lr1e-5_batch16_seq256	
* Train is done by the the whole 80% of dataset which was initially splitted to folds for grid search and now combined
    * /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/data/balanced/train_data_merged_with_LSF_branches.tsv
* Test is 20% hold out
    * /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/data/balanced/test_data_merged_with_LSF_branches.tsv
* 1. ssh C2
* 2. cd /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/scripts
* 3. qsub qsub_ner_train_test_with_LSF_branches.sh
* 4. trained model is located in /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-ner/S1000-transformer-ner-with-LSF-branches that will be used in next step by transformer tagger to produce final NER prediction

# S1000 Transformer Tagger

* Note: Results of Transformer NER can be evaulated in 2 different ways:
    * 1. Evaluation script which is available in the repo of NER and by default is calculated (the approach used to do grid search)
    * 2. There is Transformet Taggger that uses the fine tuned model in the first step to tag or detect entities in the text
            * This approach recieves the input in a string db format (6 columns tsv file where each row is an abstract and first column is pmid abd the last column is the asbtract text)
            * This creates predictions in tagger format which is a tsv file for mentions and their offset in abstract and etc. 
            * This file can be converted to corresponding *.ann files similar to what we did for tagger macthes and then we can compare manual annotations with predicted *.ann similar to what we did for benchmarking
            * This approach produces improved results because we use -o argument which considers overlaping matches as match, without this arguument the result will be decreased significantly(similar can happen to tagger results) 


* clone repo:
    * cd /home/projects/ku_10024/scratch/esmaeil/s1000
    * git clone https://github.com/jouniluoma/S1000-transformer-tagger.git
* download sample data:
    * wget https://a3s.fi/s1000/database_sample.tsv.gz
    * mkdir data
    * mv database_sample.tsv.gz data/
    * gunzip data/database_sample.tsv.gz
* modify scripts/run-bio-tagger.sh to point to our finetuned model in previous step
    * ner_model="../S1000-transformer-ner/S1000-transformer-ner/"
    * I did this to sole the spacy issue:
        * pip uninstall spacy
        * pip uninstall pydantic
        * pip install spacy pydantic

## Create single tsv file using test txt files
    * there will be 6 tab separated columns, first is the abstract id and last is the contnt, other are just be compatible with requested format

In [103]:
import os

def create_single_tsv_from_txt_files(input_dir, output_tsv_file):
    # Filler dummy words for the middle columns
    dummy_words = ["other_ids", "authors", "forum", "year"]

    # Initialize an empty list to store the TSV lines
    tsv_lines = []

    # Loop through the TXT files in the input directory
    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            # Read the content of the TXT file without modifications
            with open(os.path.join(input_dir, filename), 'r') as txt_file:
                file_content = txt_file.read().strip()

                # Remove newline characters from the file content
                #file_content = file_content.replace('\n', ' ').replace('\r', '')
                file_content = file_content.replace('\n', '')

            # Create a TSV line for the current file
            tsv_line = [filename] + dummy_words + ['"' + file_content + '"']
            tsv_lines.append("\t".join(tsv_line))

    # Write the TSV lines to the output file
    with open(output_tsv_file, 'w') as output_file:
        output_file.write("\n".join(tsv_lines))

    print(f"TSV file '{output_tsv_file}' created successfully.")

# # Example usage:
# # Replace input_dir and output_tsv_file with your desired directory and file paths
# input_dir = '../../data/s1000_ner/test_data/'
# output_tsv_file = '../../data/s1000_ner/test_data_merged_Tagger_format.tsv'
# create_single_tsv_from_txt_files(input_dir, output_tsv_file)


In [36]:
input_dir = '../../data/s1000_ner/balanced_LSF_types_splits/test_data/'
output_tsv_file = '../../data/s1000_ner/balanced_LSF_types_splits/test_data_merged_Tagger_format.tsv'
create_single_tsv_from_txt_files(input_dir, output_tsv_file)


TSV file '../../data/s1000_ner/balanced_LSF_types_splits/test_data_merged_Tagger_format.tsv' created successfully.


## Get predictions using S1000 Transformer Tagger

* Transfer the file to  computerome
    * /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-tagger/data/test_data_merged_Tagger_format.tsv
    * 


* Run qsub:
    * cd /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-tagger/scripts
    * qsub  qsub-final.sh
    * qsub-final.sh contains both training the model and applying the trained model for getting predictions for test data in tagger format
    * out put will be in /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-tagger/output/output-with-LSF-branches-balanced-spans.tsv
        * this is the tagged entities in tagger format we can convert it ti correspoinding *.ann files to make it easier to compare with manual annotations
    * trained model will be /S1000-transformer-ner/S1000-transformer-ner-LSF-branches-balanced-final/
* Alternative using interactive session (Obselete)
    * run script
        * create iqsub session 
        * module load tools
        * cd /home/projects/ku_10024/scratch/esmaeil/s1000
        * conda  activate /home/projects/ku_10024/scratch/esmaeil/s1000/s1000
        * cd S1000-transformer-tagger/
        * ./scripts/run-bio-tagger.sh
        * this will create ouputs in 
            * /home/projects/ku_10024/scratch/esmaeil/s1000/S1000-transformer-tagger/output/output-LSF-spans.tsv
            


## Convert predictions to *.ann files

In [104]:
import os
import pandas as pd

def create_ann_files_from_NER_output(input_file, output_ann_folder):
    # Read the input file, remove duplicates, and write to the output file
    df = pd.read_csv(input_file, sep='\t', header=None)
    df.drop_duplicates(inplace=True)
    df.to_csv(input_file, index=False, header=None, sep='\t')

    os.makedirs(output_ann_folder, exist_ok=True)

    previous_value = None
    counter = 1

    with open(input_file, 'r', encoding='utf-8-sig') as input_file:
        for line in input_file:
            fields = line.strip().split('\t')
            current_value = fields[0]
            # Remove .txt extension from filename
            current_value = current_value.rstrip()[:-4]

            if current_value != previous_value:
                counter = 1  # Reset the counter for a new value in the first column
                previous_value = current_value

            filename = os.path.join(output_ann_folder, current_value + '.ann')
            with open(filename, 'a', encoding='utf-8') as output_file:
                span_text = f'{fields[6]} {int(fields[3])-1} {int(fields[4])}'
                span_text = span_text.replace('\xa0', ' ')  # Replace non-breaking space with a normal space
                output_file.write(f'T{counter}\t{span_text}\t{fields[5]}\n')

            counter += 1



### Fine tuning the model using single LSF type ignoring LSF types

In [105]:
# # Example usage:
# input_file = '../../data/s1000_ner/output/output-ignore-types-balanced-spans.tsv'
# output_ann_folder = '../../data/s1000_ner/output/transformer_predicted_ann_files_single_LSF_type/'
# create_ann_files_from_NER_output(input_file, output_ann_folder)
#!python2 ../../../utils/IAA.py -o -d -v  -i  '../../data/s1000_ner/balanced_LSF_types_splits/test_data'   '../../data/s1000_ner/output/transformer_predicted_ann_files_single_LSF_type/' --allowmissing > ../../data/s1000_ner/results/performance_single_LSF_type.txt 2>&1


### Fine tuning the model using different LSF types

In [142]:
import os


#input_file='../../data/s1000_ner/output/output-with-LSF-branches-spans.tsv'
input_file='../../data/s1000_ner/output/output-with-LSF-branches-balanced-spans.tsv'

df=pd.read_csv(input_file,sep='\t',header=None)
df.drop_duplicates(inplace=True)
df.to_csv(input_file,index=False,header=None,sep='\t')


#output_ann_folder = '../../data/s1000_ner/output/transformer_predicted_ann_files_with_LSF_types/'
output_ann_folder = '../../data/s1000_ner/output/transformer_predicted_ann_files_with_LSF_types_balanced/'
os.makedirs(output_ann_folder, exist_ok=True)


previous_value = None
counter = 1

with open(input_file, 'r', encoding='utf-8-sig') as input_file:  # Use utf-8-sig to handle BOM
    for line in input_file:
        fields = line.strip().split('\t')
        current_value = fields[0]
        # remove .txt extension from filename
        current_value = current_value.rstrip()[:-4] 
        
        if current_value != previous_value:
            counter = 1  # Reset the counter for a new value in the first column
            previous_value = current_value
        
        filename = os.path.join(output_ann_folder, current_value + '.ann')
        with open(filename, 'a', encoding='utf-8') as output_file:  # Ensure utf-8 encoding without BOM
                #span_text = f'{fields[6]} {int(fields[3])-1} {int(fields[4])}'
                span_text = f'{fields[6]} {int(fields[3])-1} {int(fields[4])}'
                span_text = span_text.replace('\xa0', ' ')  # Replace non-breaking space with normal space
                output_file.write(f'T{counter}\t{span_text}\t{fields[5]}\n')
        
        counter += 1


### Create empty ann files for those abstracts that don't have them because NER has not found any matches


In [143]:


import os

def create_missing_ann_files(ann_dir,txt_dir):
    for filename in os.listdir(txt_dir):
        if filename.endswith('.txt'):
            txt_file_path = os.path.join(ann_dir, filename)
            ann_file_path = os.path.splitext(txt_file_path)[0] + '.ann'
            
            if not os.path.exists(ann_file_path):
                with open(ann_file_path, 'w') as ann_file:
                    pass  # Creates an empty .ann file

ann_dir = '../../data/s1000_ner/output/transformer_predicted_ann_files_with_LSF_types_balanced/'
txt_dir ='../../data/s1000_ner/balanced_LSF_types_splits/test_data/'

create_missing_ann_files(ann_dir,txt_dir)


### calculate performance in comparison with manual annotations:
    * This is based on filtered annotation files
    * keywords_removed = {"LSF_out_of_context", "Geographical_Feature", "Occupations"}

In [144]:

!python2 ../../../utils/IAA.py -o -d -v  '../../data/s1000_ner/balanced_LSF_types_splits/test_data/'   '../../data/s1000_ner/output/transformer_predicted_ann_files_with_LSF_types_balanced/' --allowmissing > ../../data/s1000_ner/results/performance_with_LSF_types_balanced.txt 2>&1
!grep 'FP:' ../../data/s1000_ner/results/performance_with_LSF_types_balanced.txt > ../../data/s1000_ner/results/performance_with_LSF_types_balanced_FP_lines.txt
!grep 'FN:' ../../data/s1000_ner/results/performance_with_LSF_types_balanced.txt > ../../data/s1000_ner/results/performance_with_LSF_types_balanced_FN_lines.txt


In [145]:
!python2 ../../../utils/IAA.py -o -d -v  -i  '../../data/s1000_ner/balanced_LSF_types_splits/test_data/'   '../../data/s1000_ner/output/transformer_predicted_ann_files_with_LSF_types_balanced/' --allowmissing > ../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced.txt 2>&1
!grep 'FP:' ../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced.txt > ../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced_FP_lines.txt
!grep 'FN:' ../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced.txt > ../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced_FN_lines.txt


In [146]:
import re

def add_links(input_file, output_file,correction):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            parts = line.strip().split('\t')
            if len(parts) >= 1:
                first_column = parts[0]
                elements = first_column.split()
                if len(elements) >= 3:
                    identifier = elements[0]
                    code = elements[-1]

                    # Extract the offsets from the second tab-separated column
                    offset_parts = parts[1].split()[1:]
                    if len(offset_parts) >= 2:
                        start_offset, end_offset = map(int, offset_parts)
                        start_offset += correction
                        end_offset += correction
                        second_column = f"http://localhost:5555/index.xhtml#/NER/Annotated_200_abstracts/{identifier}?focus={start_offset}~{end_offset}"
                        outfile.write(first_column + '\t' + second_column + '\t' + '\t'.join(parts[2:]) + '\n')
                    else:
                        outfile.write(line)
                else:
                    outfile.write(line)
            else:
                outfile.write(line)

    print(f"Modified file '{output_file}' created.")

# Example usage for FP
input_file_fp = r"../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced_FP_lines.txt"
output_file_fp = r"../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced_FP_lines_with_links.txt"
add_links(input_file_fp, output_file_fp,correction=1)

# Example usage for FN
input_file_fn = r"../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced_FN_lines.txt"
output_file_fn = r"../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced_FN_lines_with_links.txt"
add_links(input_file_fn, output_file_fn,correction=0)


Modified file '../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced_FP_lines_with_links.txt' created.
Modified file '../../data/s1000_ner/results/performance_with_LSF_types_ignored_balanced_FN_lines_with_links.txt' created.


### produce tagger matches using dictionary based-NER-matches for test data

In [147]:
#!python2 ../../../utils/IAA.py -o -d -v   '../../data/s1000_ner/balanced_LSF_types_splits/test_data/'   "../../data/200_abstracts/taggers_matches/brat_files/" --allowmissing > ../../data/s1000_ner/results/performance_Tagger_for_Test_Data.txt 2>&1
!python2 ../../../utils/IAA.py -o -d -v   '../../data/s1000_ner/balanced_LSF_types_splits/test_data/'   "../../data/200_abstracts/taggers_matches/brat_files/"  > ../../data/s1000_ner/results/performance_Tagger_for_Test_Data.txt 2>&1



# Plot

* extract metrics from text files

In [None]:
import pandas as pd
import re

def extract_data_and_save(file_path):
    # Initialize an empty DataFrame
    pr_rec_per_lsf_branch = pd.DataFrame(columns=['Lifestyle-factor branch', 'Precision', 'Recall', 'F'])

    # Read the text from the file
    with open(file_path, 'r') as file:
        text = file.read()

    # Split the text into lines
    lines = text.strip().split('\n')

    # Iterate over each line and extract data
    for line in lines:
        match = re.match(r'TYPE:\s+(.*?)\s+precision\s+([\d.]+%)\s+\((\d+)/(\d+)\)\s+recall\s+([\d.]+%)\s+\((\d+)/(\d+)\)\s+F\s+([\d.]+)', line)
        if match:
            revision_step, precision, _, _, recall, _, _, f_score = match.groups()
            if revision_step not in ['Beauty_and_Cleaning', 'Drugs', 'Environmental_exposures',
                                     'Mental_health_practices', 'Non_physical_leisure_time_activities',
                                     'Nutrition', 'Physical_activity', 'Sleep', 'Socioeconomic_factors']:
                continue

            # Remove the percentage symbols and convert to floats
            precision = float(precision.rstrip('%'))
            recall = float(recall.rstrip('%'))
            f_score = float(f_score)
            # Append the data to the DataFrame
            pr_rec_per_lsf_branch = pr_rec_per_lsf_branch.append({
                'Lifestyle-factor branch': revision_step.strip(),
                'Precision': precision,
                'Recall': recall,
                'F': f_score
            }, ignore_index=True)

    # Sort the DataFrame by the 'F' column in ascending order
    pr_rec_per_lsf_branch = pr_rec_per_lsf_branch.sort_values(by='F')
    
    # Return the resulting DataFrame
    return pr_rec_per_lsf_branch

# full_output Dictionary based NER for test data:
file_path = "../../data/s1000_ner/results/performance_Tagger_for_Test_Data.txt"

resulting_df = extract_data_and_save(file_path)
# Save the DataFrame to a TSV file
tsv_file_path = "../../data/s1000_ner/results/performance_Tagger_for_Test_Data_prec_rec.tsv"
resulting_df.to_csv(tsv_file_path, sep='\t', index=None)



# full_output S1000 NER for test data:
file_path = "../../data/s1000_ner/results/performance_with_LSF_types_balanced.txt"
resulting_df = extract_data_and_save(file_path)
# Save the DataFrame to a TSV file
tsv_file_path = "../../data/s1000_ner/results/performance_Deep_for_Test_Data_prec_rec.tsv"
resulting_df.to_csv(tsv_file_path, sep='\t', index=None)




* Modify lables to be appropriate for plotting


In [1]:

import pandas as pd

def replace_phrases_in_dataframe(input_file_path, phrase_replacements):
    # Read the input TSV file into a DataFrame
    df = pd.read_csv(input_file_path, sep='\t')

    # Replace the phrases in the DataFrame
    df['Lifestyle-factor branch'] = df['Lifestyle-factor branch'].replace(phrase_replacements)

    # Write the modified DataFrame to the output TSV file
    df.to_csv(input_file_path, sep='\t', index=False)


phrase_replacements = {
    'Environmental_exposures': 'Environmental exposures',
    'Physical_activity': 'Physical activities',
    'Socioeconomic_factors': 'Socioeconomic factors',
    'Drugs': 'Substance use',
    'Mental_health_practices': 'Mental health practices',
    'Non_physical_leisure_time_activities': 'Non physical leisure time activities',
    'Beauty_and_Cleaning': 'Beauty and cleaning'
}


# replace names in both files
Tagger_file = "../../data/s1000_ner/results/performance_Tagger_for_Test_Data_prec_rec.tsv"
replace_phrases_in_dataframe(Tagger_file, phrase_replacements)

Transformer_NER_file = "../../data/s1000_ner/results/performance_Deep_for_Test_Data_prec_rec.tsv"
replace_phrases_in_dataframe(Transformer_NER_file, phrase_replacements)


# merge two files

df_tagger = pd.read_csv(Tagger_file, sep='\t')
df2_transformer = pd.read_csv(Transformer_NER_file, sep='\t')
df2_transformer.columns = ['Lifestyle-factor branch']+[col + '_transformer' for col in df2_transformer.columns if col!='Lifestyle-factor branch']

merged_df = df_tagger.merge(df2_transformer, on='Lifestyle-factor branch')

# Save the merged DataFrame to a new TSV file
merged_file = "../../data/s1000_ner/results/plot_input_compare_DIC_VS_Transformer.tsv"

#merged_df=merged_df.sort_values(by=['Lifestyle-factor branch'])

desired_order = ['nutrition', 'socioeconomic factors', 'environmental exposures', 'substance use','physical activities', 'beauty and cleaning',  'non physical leisure time activities',  'sleep' , 'mental health practices']
# Create a new column with the order of 'Lifestyle-factor branch' (ignoring case)
merged_df['Order'] = merged_df['Lifestyle-factor branch'].str.lower().map({value.lower(): index for index, value in enumerate(desired_order)})
# Sort the DataFrame based on the new 'Order' column
merged_df_sorted = merged_df.sort_values(by='Order').drop('Order', axis=1)
merged_df_sorted.to_csv(merged_file, sep='\t', index=False)


    * plot by comparing Tagger Ner with Deep NER
    


In [2]:

#!python3 ../plot_prec_rec_Deep_vs_Tagger.py --input_file=../../data/s1000_ner/results/plot_input_compare_DIC_VS_Transformer.tsv --task="Performance of Transformer-based and Dictonary-based NER (Test-Data)" --output_file=../../data/s1000_ner/results/plot_Tagger_VS_Transformer_NER.png
!python3 ../plot_prec_rec_Deep_vs_Tagger.py --input_file=../../data/s1000_ner/results/plot_input_compare_DIC_VS_Transformer.tsv --task="" --output_file=../../data/s1000_ner/results/plot_Tagger_VS_Transformer_NER.png


Figure(1200x1200)
