In [7]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import re
from thefuzz import process, fuzz
from datasets import load_dataset, Features, ClassLabel, Value, Dataset
import evaluate
from sklearn.metrics import precision_recall_fscore_support
import torch


In [8]:
is_cuda_available = torch.cuda.is_available()
print()
print(f"Is CUDA available? {is_cuda_available}")

if is_cuda_available:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")


Is CUDA available? True
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [None]:
df = pd.read_csv('D:/STC_25/ml/notebook/data/for_final_finetuning_Disease_classification.csv')

if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

label_names = df['label'].unique().tolist()

features = Features({
    'text': Value('string'),
    'label': ClassLabel(names=label_names)
})

dataset = Dataset.from_pandas(df, features=features)
dataset = dataset.train_test_split(test_size=0.2)

tokenizer = AutoTokenizer.from_pretrained("yikuan8/Clinical-BigBird")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=4096)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

model = AutoModelForSequenceClassification.from_pretrained(
    "yikuan8/Clinical-BigBird",
    num_labels=len(label_names),
    use_safetensors=True
)

model.config.id2label = {i: label for i, label in enumerate(label_names)}
model.config.label2id = {label: i for i, label in enumerate(label_names)}

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, _ , f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

    return {
        'accuracy': metric.compute(predictions=predictions, references=labels),
        'f1': f1,
        'precision': precision,
    }

training_args = TrainingArguments(
    output_dir="./clinical_bigbird_symptom_classifier_4096",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=16,
    num_train_epochs=3,
    weight_decay=0.01,
    optim="adamw_torch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("Starting the fine-tuning process with max_length=4096...")
trainer.train()
print("Fine-tuning complete!")

Map: 100%|██████████| 983/983 [00:01<00:00, 820.02 examples/s]
Map: 100%|██████████| 246/246 [00:00<00:00, 902.38 examples/s]
Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at yikuan8/Clinical-BigBird and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.67 GiB is allocated by PyTorch, and 72.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [5]:
import torch
text = "A 19-year-old patient presents with multiple blackheads, whiteheads, and several painful, pus-filled pimples concentrated on the face, chest, and upper back."
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained("D:/STC_25/ml/src/artifacts/model")
tokenizer = AutoTokenizer.from_pretrained("D:/STC_25/ml/src/artifacts/preprocessor")
model.to(device)
inputs = tokenizer(text, return_tensors="pt").to(device)
with torch.no_grad():
  outputs = model(**inputs)
  logits = outputs.logits
  pred_class_id = torch.argmax(logits, dim = -1).item()
  predicted_label = model.config.id2label[pred_class_id]
  print(predicted_label)

Attention type 'block_sparse' is not possible if sequence_length: 38 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Chronic cholestasis


In [9]:
from sklearn.metrics import classification_report
import numpy as np
import torch

# Load your model and tokenizer (if not already loaded)
# model = AutoModelForSequenceClassification.from_pretrained("./final_model")
# tokenizer = AutoTokenizer.from_pretrained("./final_model")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Get predictions for the entire test dataset
print("Generating predictions for the classification report...")
predictions = trainer.predict(tokenized_dataset["test"])
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Get the true labels
true_labels = tokenized_dataset["test"]["labels"]

# Get the names of all your diseases
label_names = model.config.id2label.values()

# Generate and print the report
report = classification_report(true_labels, predicted_labels, target_names=label_names)

print("\n--- Classification Report ---")
print(report)

Generating predictions for the classification report...


NameError: name 'trainer' is not defined

In [None]:
file_path = '/D:/stc_project_25/ml/artifacts/'

model.save_pretrained(file_path)
tokenizer.save_pretrained(file_path)

print("fine tuned models are saved to", file_path)

In [None]:
# df = pd.read_excel('D:/stc_project_25/ml/data/combined_excel_file.xlsx')

In [None]:
# def clean_and_deduplicate_foods(log_string):
#     if isinstance(log_string, float) and np.isnan(log_string):
#         log_string = ""

#     cleaned_string = re.sub(r"Day 1:", "", log_string)
#     cleaned_string = re.sub(r"; Day \d+:", ",", cleaned_string)
#     cleaned_string = re.sub(r"\.", "", cleaned_string)

#     food_items = [item.strip() for item in log_string.split(',')]

#     unique_items = list(dict.fromkeys(item for item in food_items if item))

#     final_string = ", ".join(unique_items)
#     return f", Food Eaten in Last 5 Days {final_string}"

# def clean_occupation_pipeline(occ, allow_set, deny_list):
#     if not isinstance(occ, str) or not occ.strip():
#         return ("", occ)

#     occ_lower = occ.lower()

#     best_deny_match, deny_score = process.extractOne(occ_lower, deny_list, scorer=fuzz.token_set_ratio)
#     if deny_score > 85:
#         return ("", occ)

#     if occ_lower in allow_set:
#         return (occ, np.nan)

#     best_allow_match, allow_score = process.extractOne(occ_lower, allow_set)
#     if allow_score > 90:
#         return (best_allow_match.title(), np.nan)

#     return ("", occ)

# def process_occupations(df):
#     allow_list = {
#         'accountant', 'architect', 'artist', 'auditor', 'barista', 'business analyst',
#         'carpenter', 'chef', 'civil engineer', 'content writer', 'construction worker',
#         'data scientist', 'database administrator', 'dentist', 'devops engineer',
#         'doctor', 'electrician', 'firefighter', 'financial analyst', 'flight attendant',
#         'graphic designer', 'hr specialist', 'hvac technician', 'illustrator', 'lawyer',
#         'marketing manager', 'mason', 'mechanic', 'medical assistant', 'nurse',
#         'operations manager', 'paramedic', 'pharmacist', 'photographer',
#         'physical therapist', 'physician assistant', 'plumber', 'police officer',
#         'project manager', 'real estate agent', 'registered nurse', 'sales representative',
#         'software engineer', 'systems analyst', 'teacher', 'ui/ux designer',
#         'veterinarian', 'video editor', 'web developer', 'welder', 'Chief Executive Officer',
#         'Chief Operating Officer ', 'Vice President', 'Director', 'Manager',
#         'Team Lead', 'Supervisor', 'Project Manager', 'Engineer', 'Analyst', 'Specialist',
#         'Consultant', 'Coordinator', 'Associate', 'Assistant', 'Representative'
#     }
#     deny_list = [
#         'declined to answer', 'disabled', 'garbage', 'homemaker',
#         'inmate', 'n/a', 'none', 'not applicable', 'not specified', 'not working',
#         'patient', 'refused', 'retired', 'seeking employment', 'self-employed',
#         'student', 'stay at home mom/dad', 'trying to conceive', 'unemployed',
#         'unknown'
#     ]

#     results = df['Occupation'].apply(
#         lambda occ: clean_occupation_pipeline(occ, allow_list, deny_list)
#     )

#     temp_results_df = pd.DataFrame(
#         results.tolist(),
#         index=df.index,
#         columns=['Occupation_Cleaned', 'Extra Info_Temp']
#     )

#     df['Extra Info'] = temp_results_df['Extra Info_Temp']
#     df['Occupation'] = temp_results_df['Occupation_Cleaned']

#     return df

In [None]:
# def remove_nan_rows(df):
#   df.dropna(subset=['Symptoms'], inplace=True)
  

# def clean_and_deduplicate_foods(log_string):
#     if isinstance(log_string, float) and np.isnan(log_string):
#       log_string = ""

#     cleaned_string = re.sub(r"Day 1:", "", log_string)
#     cleaned_string = re.sub(r"; Day \d+:", ",", cleaned_string)
#     cleaned_string = re.sub(r"\.", "", cleaned_string)

#     food_items = [item.strip() for item in cleaned_string.split(',')]

#     unique_items = list(dict.fromkeys(item for item in food_items if item))

#     final_string = ", ".join(unique_items)
#     return f", Food Eaten in Last 5 Days {final_string}"

In [None]:
# remove_nan_rows(df)
# df['Age'] = ' of age ' + df['Age'].astype(str)
# df['Gender'] = 'A ' + df['Gender'].astype(str)
# df = process_occupations(df)
# df['Occupation'] = np.where(df['Occupation'] != "", ', Occupation is ' + df['Occupation'].astype(str), '')
# df['Food Eaten in Last 5 Days'] = df['Food Eaten in Last 5 Days'].apply(clean_and_deduplicate_foods)
# df['Symptoms'] = ', Symptoms are ' + df['Symptoms'].astype(str)
# df['Recent Travel History'] = np.where( df['Recent Travel History'].notna(), ', Recently Traveled to ' + df['Recent Travel History'].str.replace("Visited", "", n = -1, regex = True), '')
# df['Extra Info'] = np.where(df['Extra Info'].notna(), ' , ' + df['Extra Info'].astype(str), "")
# df['text'] = df['Gender'].astype(str) + df['Age'].astype(str) + df['Occupation'].astype(str) + df['Recent Travel History'].astype(str) + df['Symptoms'].astype(str) + df['Food Eaten in Last 5 Days'].astype(str) + df['Extra Info'].astype(str)
# df.drop(columns = ['Symptoms', 'Age', 'Gender', 'Occupation', 'Recent Travel History', 'Food Eaten in Last 5 Days', 'Extra Info'], inplace=True, axis = 1)
# df.rename(columns = {'Disease' : 'label'}, inplace = True)
