### Cell 1. Mount Google Drive
The cell below mounts Google Drive so that the files stored in your Drive are accessible from the notebook.

In [4]:
from google.colab import drive
import os

# Let's start by mounting Google Drive at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Cell 2. Define Project Directory

The cell below sets the path to the directory where the project files (such as the CSV files) are located and creates that directory if it does not already exist.

In [5]:
# Path to the directory containing the project files.
project_dir = '/content/drive/MyDrive/NLP_Assignment/'
# exist_ok=True: do not raise if the directory is already there.
os.makedirs(project_dir, exist_ok=True)

### Cell 3. A function to Preview CSV Files
This function shows the first five rows of each CSV file in our directory. It helps you check that the files are loaded correctly.



In [None]:
# A function to display the first few rows of each CSV file
def preview_csv_files(csv_files, n_rows=5):
    """Print a short preview of every CSV file in ``csv_files``.

    Args:
        csv_files: Iterable of paths to CSV files.
        n_rows: Number of leading rows to show for each file (default 5).

    Returns:
        None. The preview, or an error line for a file that cannot be read,
        is printed to stdout.
    """
    # Imported locally because the notebook only imports pandas in a later cell;
    # without this, a fresh kernel raises NameError inside the try block and
    # every file is reported as unreadable.
    import pandas as pd

    for file in csv_files:
        try:
            # Parse only the rows that are displayed instead of the whole file.
            df = pd.read_csv(file, nrows=n_rows)
            print(f"\nPreview of {file}:")
            print(df.head(n_rows))  # Show the first rows
            print("\n" + "-"*50 + "\n")
        except Exception as e:
            # Best effort: report the problem and continue with the next file.
            print(f"Error reading {file}: {e}")

# Previewing the content of the input CSV files.
# The file list is built here (same paths as in the extraction cell) so that this
# cell does not fail with a NameError when the notebook is run top to bottom.
csv_files = [
    project_dir + 'CSV1.csv',
    project_dir + 'CSV2.csv',
    project_dir + 'CSV3.csv',
    project_dir + 'CSV4.csv'
]
preview_csv_files(csv_files)



Preview of /content/drive/MyDrive/NLP_Assignment/CSV1.csv:
   Unnamed: 0  HADM_ID                                         SHORT-TEXT  \
0           1   100003  history of present illness mr known lastname i...   
1           3   100009  history of present illness yo man with known c...   
2           5   100011  history of present illness y o male helmeted m...   
3           9   100021  history of present illness year old spanish sp...   
4          11   100028  history of present illness this is a year old ...   

                                           ICD9_CODE  \
0  53100, 2851, 07054, 5715, 45621, 53789, 4019, ...   
1  41401, 99604, 4142, 25000, 27800, V8535, 4148,...   
2  85206, 82111, 86403, 48242, 8600, 2851, 86121,...   
3  E8788, E8497, 4019, 04104, 0413, 5728, V1581, ...   
4  5761, 0389, 5184, 57481, 99591, 42731, 2875, 8...   

                                                ICD9    Label  
0        531, 285, 070, 571, 456, 537, 401, 535, 782      285  
1  414, 996,

### Cell 4. Extract Text from CSV Files
This function extracts the text data from the desired column of each CSV file and writes it all into a single .txt file named **combined_texts.txt**.

In [None]:
import pandas as pd

# A function to extract all the 'text' column data from our CSV files and then save them into a single .txt file
def extract_text_from_csvs(csv_files, output_txt):
    """Concatenate the text column of several CSV files into one text file.

    For each CSV the first column found among 'text', 'SHORT-TEXT' and 'TEXT'
    (checked in that order) is used. Missing values are dropped and every
    remaining value is written to ``output_txt`` followed by a newline.

    Args:
        csv_files: Iterable of CSV file paths.
        output_txt: Path of the text file to create (overwritten if present).

    Returns:
        None. Progress and a short preview of each column are printed.
    """
    text_columns = ['text', 'SHORT-TEXT', 'TEXT']

    with open(output_txt, 'w', encoding='utf-8') as output_file:
        for file in csv_files:
            df = pd.read_csv(file)

            # First candidate column present in this file, or None if there is none.
            col = next((name for name in text_columns if name in df.columns), None)
            if col is None:
                print(f"No valid text column found in {file}.")
                continue

            extracted_text = df[col].dropna()
            print(f"Extracting text from {file} (Column: {col}):\n")
            print(extracted_text.head())  # Preview first few extracted lines
            # Plain iteration instead of Series.apply, which was only being used
            # for its side effect and built a throw-away result Series.
            output_file.writelines(f"{value}\n" for value in extracted_text)

# Paths of the four input CSV files inside the project directory.
csv_files = [
    project_dir + name
    for name in ('CSV1.csv', 'CSV2.csv', 'CSV3.csv', 'CSV4.csv')
]

# Destination of the combined text file.
output_txt = project_dir + 'combined_texts.txt'

# Run the text extraction over all input files.
extract_text_from_csvs(csv_files, output_txt)

print(f"The text extraction has been completed and the file has been saved to: {output_txt}")


Extracting text from /content/drive/MyDrive/NLP_Assignment/CSV1.csv (Column: SHORT-TEXT):

0    history of present illness mr known lastname i...
1    history of present illness yo man with known c...
2    history of present illness y o male helmeted m...
3    history of present illness year old spanish sp...
4    history of present illness this is a year old ...
Name: SHORT-TEXT, dtype: object
Extracting text from /content/drive/MyDrive/NLP_Assignment/CSV2.csv (Column: TEXT):

0    Admission Date:  [**2185-11-11**]       Discha...
1    Admission Date:  [**2160-12-7**]              ...
2    Admission Date:  [**2187-10-23**]             ...
3    Admission Date:  [**2123-4-26**]              ...
4    Admission Date:  [**2124-2-14**]              ...
Name: TEXT, dtype: object
Extracting text from /content/drive/MyDrive/NLP_Assignment/CSV3.csv (Column: TEXT):

0    Admission Date:  [**2117-9-11**]              ...
1    Admission Date:  [**2150-4-17**]              ...
2    Admission Date: 

### Cell 5. Word Count in Text File
This function reads the combined text file and counts the 30 most common words. The results are then stored in a CSV file.

In [None]:
from collections import Counter
import re

# A function to count words in our combined text file and return the most common ones
def count_words_in_text(file_path, top_n=30):
    """Count lower-cased words in a UTF-8 text file.

    The file is processed one line at a time, so memory use depends on the
    longest line and the vocabulary rather than on the file size. A word is a
    maximal run of ``\\w`` characters and therefore never spans a line break,
    which makes the counts identical to counting over the whole text at once.

    Args:
        file_path: Path to the text file.
        top_n: How many of the most frequent words to return (default 30).

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    word_pattern = re.compile(r'\b\w+\b')
    word_counts = Counter()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            word_counts.update(word_pattern.findall(line.lower()))
    return word_counts.most_common(top_n)

def save_word_counts_to_csv(word_counts, output_csv):
    """Write ``(word, count)`` pairs to ``output_csv`` under a ``Word,Count`` header, without an index column."""
    header = ['Word', 'Count']
    pd.DataFrame(word_counts, columns=header).to_csv(output_csv, index=False)

# count the words and then save to CSV
word_counts = count_words_in_text(output_txt)
output_csv = project_dir + 'top_30_words.csv'
save_word_counts_to_csv(word_counts, output_csv)

print(f"The Top 30 words saved to: {output_csv}")

# Show what was written. The previous call, print_file_contents(file_path), used a
# function and a variable that are not defined anywhere in the notebook and so
# raised a NameError; the saved CSV is read back and printed directly instead.
with open(output_csv, 'r', encoding='utf-8') as saved_file:
    print(saved_file.read())


The Top 30 words saved to: /content/drive/MyDrive/NLP_Assignment/top_30_words.csv


### Cell 6. Tokenizing Text with a BERT Model
This *function* tokenizes the text using a BERT tokenizer and extracts the top 30 tokens. It reads the text in chunks for memory efficiency.

In [None]:
import os
import pandas as pd
from collections import Counter
from transformers import AutoTokenizer

project_dir = '/content/drive/MyDrive/NLP_Assignment/'
output_txt = project_dir + 'combined_texts.txt'

if os.path.exists(project_dir):
    files_in_dir = os.listdir(project_dir)
    print("Files in project directory:", files_in_dir)
else:
    print(f"Directory {project_dir} does not exist. Please check the path.")

# Install transformers library if needed
!pip install transformers

# A function to save word/token counts to a CSV file
def save_word_counts_to_csv(word_counts, output_csv):
    """Store ``(word, count)`` pairs in ``output_csv`` with the columns ``Word`` and ``Count``; the index is not written."""
    counts_table = pd.DataFrame(word_counts, columns=['Word', 'Count'])
    counts_table.to_csv(output_csv, index=False)

# Helper: read a text file in pieces that never cut a word in half
def _iter_word_aligned_chunks(file, chunk_size):
    """Yield pieces of roughly ``chunk_size`` characters that end on whitespace.

    Characters after the last whitespace of a read are carried over to the next
    piece. If a read contains no whitespace at all it is carried over in full,
    so a file without any whitespace is yielded as a single piece.
    """
    carry = ''
    while True:
        piece = file.read(chunk_size)
        if not piece:
            break
        piece = carry + piece
        # Walk back from the end to just after the last whitespace character.
        cut = len(piece)
        while cut > 0 and not piece[cut - 1].isspace():
            cut -= 1
        if cut == 0:
            carry = piece
            continue
        yield piece[:cut]
        carry = piece[cut:]
    if carry:
        yield carry


# A function to tokenize the text file chunk by chunk and count the tokens
def get_top_tokens_in_chunks(model_name, text_file, top_n=30, chunk_size=1024):
    """Return the ``top_n`` most frequent tokenizer tokens in ``text_file``.

    The file is read in word-aligned chunks so it never has to fit in memory.
    Two counting errors of the earlier version are fixed:

    * fixed-size character reads split words at chunk boundaries, producing
      word pieces that do not occur in the text;
    * ``truncation=True, max_length=512`` dropped every token after the 512th
      of a chunk. Counting does not run the model, so no truncation is needed.

    Args:
        model_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        text_file: Path to a UTF-8 text file.
        top_n: Number of most common tokens to return.
        chunk_size: Approximate number of characters read per chunk.

    Returns:
        A list of ``(token, count)`` tuples, most frequent first.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    token_counts = Counter()

    with open(text_file, 'r', encoding='utf-8') as file:
        for chunk in _iter_word_aligned_chunks(file, chunk_size):
            token_counts.update(tokenizer.tokenize(chunk))

    return token_counts.most_common(top_n)

# Count tokens only when the combined text file is actually present.
if not os.path.exists(output_txt):
    print(f"Text file {output_txt} not found.")
else:
    top_tokens = get_top_tokens_in_chunks("bert-base-uncased", output_txt)
    output_csv_tokens = project_dir + 'top_30_tokens.csv'
    save_word_counts_to_csv(top_tokens, output_csv_tokens)
    print(f"The top 30 tokens have been saved to: {output_csv_tokens}")


Files in project directory: ['CSV1.csv', 'CSV2.csv', 'CSV3.csv', 'CSV4.csv', 'combined_texts.txt', 'top_30_words.csv']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



The top 30 tokens have been saved to: /content/drive/MyDrive/NLP_Assignment/top_30_tokens.csv


### Cell 7. Install SpaCy, SciSpaCy, SciSpaCy's Disease and Drug Model (en_ner_bc5cdr_md), and the Transformers Library

- Installs the SpaCy library along with the SciSpaCy extension, which provides access to various pre-trained models for biomedical named entity recognition.

- Installs a specific SciSpaCy model (en_ner_bc5cdr_md) that is trained to recognize biomedical entities like diseases and chemicals (drugs). The model is downloaded from a specific URL.

- Installs the Hugging Face transformers library, which provides access to state-of-the-art pre-trained models like BERT, BioBERT, and more. This library will be used for named entity recognition (NER) tasks with BioBERT.

In [1]:
!pip install spacy scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz
!pip install transformers


Collecting spacy
  Using cached spacy-3.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Using cached thinc-8.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Using cached spacy-3.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.0 MB)
Using cached thinc-8.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (922 kB)
Installing collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.0.17
    Uninstalling thinc-8.0.17:
      Successfully uninstalled thinc-8.0.17
  Attempting uninstall: spacy
    Found existing installation: spacy 3.0.9
    Uninstalling spacy-3.0.9:
      Successfully uninstalled spacy-3.0.9
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-ner-bc5cdr-md 0.4.0 

### Cell 8. Entity Recognition Using SciSpaCy and BioBERT

The entire code is presented next, defining the file path to the text file (combined_texts.txt) for processing.

**Entity Filtering — filter_entities**  
Filters and maps only meaningful entities (e.g., 'DISEASE' or 'CHEMICAL') and provides statistics along with small samples of the resulting matched output.

**Text Chunking — split_text_into_tokens**  
Breaks the text into manageable blocks of token IDs (512 tokens per block for BioBERT). SciSpaCy instead receives the text in blocks of 10,000 characters.

**SciSpaCy Processing — process_with_scispacy_sample**  
Applies the SciSpaCy model to the first 1MB of the text, processed chunk by chunk, and then keeps only the useful entities.

**BioBERT Preprocessing — process_with_biobert_sample**  
Splits the text into token-based chunks, runs the BioBERT NER pipeline on each chunk, and then keeps the matching entities.

**Entity Comparison — compare_entities**  
Compares the entities detected by both models and lists the common and unique entities, along with the totals.

The code tests both models on the same text and compares the results of entity detection.

In [None]:
import spacy
from transformers import pipeline, AutoTokenizer

project_dir = '/content/drive/MyDrive/NLP_Assignment/'
output_txt = project_dir + 'combined_texts.txt'

# A function to filter only drug and disease entities
def filter_entities(entities, target_labels=['DISEASE', 'CHEMICAL']):
    """Keep the ``(text, label)`` pairs whose label is in ``target_labels``.

    Prints how many pairs were kept, the summed length of their texts and the
    first five kept pairs, then returns the kept pairs as a list.
    """
    kept = []
    char_total = 0
    for entity in entities:
        if entity[1] in target_labels:
            kept.append(entity)
            char_total += len(entity[0])

    # Size of the filtered data
    print(f"Number of filtered entities: {len(kept)}")
    # Combined length of the kept entity texts
    print(f"Total characters in filtered entities: {char_total}")
    # A small sample for eyeballing
    print("Sample of filtered entities (first 5):", kept[:5])

    return kept

# A function to split the text into smaller chunks by tokens, not by characters.
def split_text_into_tokens(text, tokenizer, chunk_size=512):
    """Yield consecutive slices of the token ids of ``text``.

    The text is encoded once, without truncation and without special tokens,
    and the resulting ids are yielded in order. The earlier version encoded
    with ``truncation=True, max_length=chunk_size``, which cut the whole text
    down to a single sequence, so only one chunk was ever produced and the
    rest of the text was silently dropped.

    Each slice holds at most ``chunk_size - 2`` ids, leaving room for the two
    special tokens (for BERT-style tokenizers, [CLS] and [SEP]) that are added
    when the decoded chunk is encoded again for the model.

    Args:
        text: The text to split.
        tokenizer: A callable tokenizer whose result exposes ``['input_ids']``.
        chunk_size: Token budget per chunk, special tokens included.

    Yields:
        Slices of the token-id sequence returned by the tokenizer.
    """
    token_ids = tokenizer(text, add_special_tokens=False, return_tensors="pt")['input_ids'][0]
    # Never step by less than one, even for a very small chunk_size.
    step = max(1, chunk_size - 2)
    for start in range(0, len(token_ids), step):
        yield token_ids[start:start + step]

# A function to process a sample (1MB by default) of the text using scispaCy
def process_with_scispacy_sample(text_file, sample_size=1000000, chunk_size=10000):
    """Run the scispaCy ``en_ner_bc5cdr_md`` model over the start of a text file.

    Args:
        text_file: Path to a UTF-8 text file.
        sample_size: Number of leading characters of the file to process.
        chunk_size: Number of characters passed to the model per call.

    Returns:
        The result of ``filter_entities`` applied to every ``(text, label)``
        pair found in the sample.
    """
    nlp = spacy.load("en_ner_bc5cdr_md")

    # Read only the sample instead of loading the whole file and slicing it.
    with open(text_file, 'r', encoding='utf-8') as file:
        text = file.read(sample_size)

    entities = []
    # The character chunking is done inline: this cell has no split_text helper,
    # so calling it raised a NameError unless a later cell had been run first.
    for start in range(0, len(text), chunk_size):
        doc = nlp(text[start:start + chunk_size])
        entities.extend((ent.text, ent.label_) for ent in doc.ents)

    return filter_entities(entities)

# A function to process a sample of the text using BioBERT in token-based chunks, 512 tokens per chunk
def process_with_biobert_sample(text_file, sample_size=1000000):  # 1MB sample
    """Run the BioBERT NER pipeline over the start of a text file.

    Args:
        text_file: Path to a UTF-8 text file.
        sample_size: Number of leading characters of the file to process.

    Returns:
        The result of ``filter_entities`` applied to the ``(word, entity)``
        pairs reported by the pipeline.
    """
    # NOTE(review): dmis-lab/biobert-base-cased-v1.1 looks like the base BioBERT
    # checkpoint rather than an NER fine-tune. If so, its labels will not be
    # 'DISEASE'/'CHEMICAL' and filter_entities will drop every entity. Confirm
    # which labels the pipeline actually emits.
    nlp_pipeline = pipeline("ner", model="dmis-lab/biobert-base-cased-v1.1")
    tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

    # Read only the sample instead of loading the whole file and slicing it.
    with open(text_file, 'r', encoding='utf-8') as file:
        text = file.read(sample_size)

    entities = []
    # Split the text into token-based chunks
    for chunk_tokens in split_text_into_tokens(text, tokenizer, chunk_size=512):
        # The pipeline takes text, so the token ids are decoded back first
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)

        chunk_entities = nlp_pipeline(chunk_text)

        entities.extend((entity['word'], entity['entity']) for entity in chunk_entities)

    return filter_entities(entities)


# Comparing the entities detected by both models
def compare_entities(scispacy_entities, biobert_entities):
    """Compare two collections of entities as sets.

    Returns a dict with the number of distinct entities per model
    (``scispacy_total``, ``biobert_total``), the entities found by both
    (``common_entities``) and those found by only one of them
    (``scispacy_unique``, ``biobert_unique``).
    """
    from_scispacy = set(scispacy_entities)
    from_biobert = set(biobert_entities)

    return {
        "scispacy_total": len(from_scispacy),
        "biobert_total": len(from_biobert),
        "common_entities": from_scispacy.intersection(from_biobert),
        "scispacy_unique": from_scispacy.difference(from_biobert),
        "biobert_unique": from_biobert.difference(from_scispacy),
    }

# Run both models on a 1MB sample of the text (scispaCy first, then BioBERT)
scispacy_entities_sample = process_with_scispacy_sample(output_txt)
biobert_entities_sample = process_with_biobert_sample(output_txt)

# Set-based comparison of what the two models detected
comparison_results = compare_entities(scispacy_entities_sample, biobert_entities_sample)

# Report the comparison, one line per figure
print(
    f"Total entities detected by scispaCy (sample): {comparison_results['scispacy_total']}",
    f"Total entities detected by BioBERT (sample): {comparison_results['biobert_total']}",
    f"Common entities (sample): {comparison_results['common_entities']}",
    f"Entities unique to scispaCy (sample): {comparison_results['scispacy_unique']}",
    f"Entities unique to BioBERT (sample): {comparison_results['biobert_unique']}",
    sep="\n",
)


### Cell 10. The Whole Processing of combined_texts.txt

The combined_texts.txt file cannot be processed in full because of the significant computational resources (GPU, RAM, and CPU) the task requires. The free version of Google Colab has limited resources and cannot handle large-scale NLP tasks on datasets as large as the one used in this assignment. The combined_texts.txt file is about 750MB, and processing the entire text with models like SciSpaCy and BioBERT would take many hours or even days, even with sufficient computing resources. The 1MB test alone took a few hours (as described earlier), so processing 750MB would take far longer. Because the free Colab instance is limited in both time and resources, the run would not complete and could crash the session by exhausting the available resources.

**Code Functionality Overview**

- **SciSpaCy**: The text is divided into 10,000 character-sized blocks, and the SciSpaCy model processes the entity (disease-chemical) extraction. Iterative processing is employed to handle the chunks to avoid out-of-memory errors.
  
- **BioBERT**: BioBERT processes the text using chunk-based processing with 512-token chunks. This token-based approach ensures consistency with how transformer models like BioBERT expect text input. The AutoTokenizer tokenizes the input text, which is then passed to the Named Entity Recognition (NER) model.

- **The Model**: Both models detect diseases and chemicals, filtering them to provide a cleaned list of entities. This code is used to compare the results of both models, extracting common entities and unique entities.

**Why We Tested a 1MB Sample**

By using a smaller 1MB sample of the data, we could run both models and validate that they work before attempting larger datasets. This allowed us to:

- **Test the Validity of the Code**: Confirm that the code works correctly without using excessive resources or causing significant issues like resource exhaustion.
  
- **Check Model Performance (1MB)**: A small 1MB sample was enough data to examine the output quality and the performance of both SciSpaCy and BioBERT.
  
- **Develop Faster**: This approach provided almost instant feedback, reducing CPU overhead and long processing times during debugging and refinement. It also made it easier to test the code on smaller portions of data.


Since the free tier capacity of Google Colab cannot process the entire dataset, we focused on processing a smaller subset of the collection efficiently to ensure we were on the right track. In real-world conditions or with premium cloud infrastructure, the full dataset could be processed. However, for the purpose of this research project and with the available computational resources, processing the sample is sufficient to demonstrate the code’s functionality.

In [None]:
import spacy
from transformers import pipeline, AutoTokenizer

project_dir = '/content/drive/MyDrive/NLP_Assignment/'
output_txt = project_dir + 'combined_texts.txt'

# A function to filter only drug and disease entities
def filter_entities(entities, target_labels=['DISEASE', 'CHEMICAL']):
    """Return the ``(text, label)`` pairs whose label appears in ``target_labels``.

    Three summary lines are printed as a side effect: the number of pairs
    kept, the total length of their texts, and the first five of them.
    """
    selected = list(filter(lambda pair: pair[1] in target_labels, entities))

    # How many entities survived the filter
    print(f"Number of filtered entities: {len(selected)}")

    # Summed character length of the surviving entity texts
    print(f"Total characters in filtered entities: {sum(map(len, (pair[0] for pair in selected)))}")

    # A short sample of the result
    print("Sample of filtered entities (first 5):", selected[:5])

    return selected

# A function to split the text into smaller chunks by tokens, not by characters.
def split_text_into_tokens(text, tokenizer, chunk_size=512):
    """Yield consecutive slices of the token ids of ``text``.

    The text is encoded once, without truncation and without special tokens,
    and the resulting ids are yielded in order. Encoding with
    ``truncation=True, max_length=chunk_size`` (the earlier behaviour) reduced
    the whole text to a single sequence, so only the first chunk was ever
    produced and everything after it was silently discarded.

    Each slice holds at most ``chunk_size - 2`` ids, leaving room for the two
    special tokens (for BERT-style tokenizers, [CLS] and [SEP]) that are added
    when the decoded chunk is encoded again for the model.

    Args:
        text: The text to split.
        tokenizer: A callable tokenizer whose result exposes ``['input_ids']``.
        chunk_size: Token budget per chunk, special tokens included.

    Yields:
        Slices of the token-id sequence returned by the tokenizer.
    """
    token_ids = tokenizer(text, add_special_tokens=False, return_tensors="pt")['input_ids'][0]
    # Never step by less than one, even for a very small chunk_size.
    step = max(1, chunk_size - 2)
    for start in range(0, len(token_ids), step):
        yield token_ids[start:start + step]

# A function to split the text by characters for scispaCy
def split_text(text, chunk_size=10000):
    """Yield successive pieces of ``text`` that are at most ``chunk_size`` characters long."""
    yield from (
        text[offset:offset + chunk_size]
        for offset in range(0, len(text), chunk_size)
    )

# A function for processing text using scispaCy on the entire dataset
def process_with_scispacy_full(text_file, chunk_size=10000):
    """Run the scispaCy ``en_ner_bc5cdr_md`` model over a whole text file.

    The file is read ``chunk_size`` characters at a time and each piece is
    passed to the model as it is read, so the full text is never held in
    memory. The pieces are the same as those obtained by slicing the fully
    loaded text every ``chunk_size`` characters.

    Args:
        text_file: Path to a UTF-8 text file.
        chunk_size: Number of characters passed to the model per call.

    Returns:
        The result of ``filter_entities`` applied to every ``(text, label)``
        pair found in the file.
    """
    nlp = spacy.load("en_ner_bc5cdr_md")

    entities = []
    with open(text_file, 'r', encoding='utf-8') as file:
        # iter(callable, sentinel): keep reading until read() returns ''.
        for chunk in iter(lambda: file.read(chunk_size), ''):
            doc = nlp(chunk)
            entities.extend((ent.text, ent.label_) for ent in doc.ents)

    return filter_entities(entities)

# A function for processing the text using BioBERT in token-based chunks, 512 tokens per chunk
def process_with_biobert_full(text_file):
    """Run the BioBERT NER pipeline over a whole text file.

    The complete file is read, split into token chunks with
    ``split_text_into_tokens``, and each chunk is decoded back to text and
    passed to the pipeline. The collected ``(word, entity)`` pairs are
    returned after going through ``filter_entities``.
    """
    ner = pipeline("ner", model="dmis-lab/biobert-base-cased-v1.1")
    biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

    # The entire file is loaded at once
    with open(text_file, 'r', encoding='utf-8') as handle:
        full_text = handle.read()

    found = []
    for token_ids in split_text_into_tokens(full_text, biobert_tokenizer, chunk_size=512):
        # The pipeline works on text, so turn the ids back into a string
        decoded = biobert_tokenizer.decode(token_ids, skip_special_tokens=True)
        for hit in ner(decoded):
            found.append((hit['word'], hit['entity']))

    return filter_entities(found)


# Comparing the entities detected by both models
def compare_entities(scispacy_entities, biobert_entities):
    """Summarise the overlap between the entities reported by the two models.

    Both inputs are converted to sets. The returned dict holds the two set
    sizes, the intersection, and the two one-sided differences.
    """
    left, right = set(scispacy_entities), set(biobert_entities)

    summary = {}
    summary["scispacy_total"] = len(left)
    summary["biobert_total"] = len(right)
    summary["common_entities"] = left & right
    summary["scispacy_unique"] = left - right
    summary["biobert_unique"] = right - left
    return summary

# Run both models on the entire text (scispaCy first, then BioBERT)
scispacy_entities_full = process_with_scispacy_full(output_txt)
biobert_entities_full = process_with_biobert_full(output_txt)

# Set-based comparison of what the two models detected
comparison_results_full = compare_entities(scispacy_entities_full, biobert_entities_full)

# Report the comparison, one line per figure
print(
    f"Total entities detected by scispaCy (full): {comparison_results_full['scispacy_total']}",
    f"Total entities detected by BioBERT (full): {comparison_results_full['biobert_total']}",
    f"Common entities (full): {comparison_results_full['common_entities']}",
    f"Entities unique to scispaCy (full): {comparison_results_full['scispacy_unique']}",
    f"Entities unique to BioBERT (full): {comparison_results_full['biobert_unique']}",
    sep="\n",
)
