In [None]:
import pathlib
import pandas as pd
import xml.etree.ElementTree as ET
from collections import defaultdict

print("=" * 80)
print("LOADING ALL DATA FILES")
print("=" * 80)

# 2. Load CSV files from medi_bot directory
print("\n2. Loading CSV files...")
csv_files = {
    "dataset": "dataset.csv",
    "symptom_description": "symptom_Description.csv",
    "symptom_precaution": "symptom_precaution.csv",
    "symptom_severity": "Symptom-severity.csv"
}

csv_frames = {}
for name, filename in csv_files.items():
    csv_path = pathlib.Path(filename)
    if csv_path.exists():
        csv_frames[name] = pd.read_csv(csv_path)
        print(f"   ✓ Loaded: {filename} ({len(csv_frames[name])} rows)")
    else:
        print(f"   ⚠ Not found: {filename}")

# 3. Load XML files from MedQuAD directories
print("\n3. Loading XML files from MedQuAD...")
medquad_dir = pathlib.Path("MedQuAD")
xml_data = defaultdict(list)

if medquad_dir.exists():
    subdirs = [d for d in medquad_dir.iterdir() if d.is_dir()]
    print(f"   Found {len(subdirs)} subdirectories")

    for subdir in subdirs:
        xml_files = list(subdir.glob("*.xml"))
        print(f"   Processing {subdir.name}: {len(xml_files)} files")

        for xml_file in xml_files:
            try:
                tree = ET.parse(xml_file)
                root = tree.getroot()

                doc_info = {
                    'source_dir': subdir.name,
                    'file': xml_file.name,
                    'doc_id': root.get('id'),
                    'source': root.get('source'),
                    'url': root.get('url'),
                    'focus': root.find('Focus').text if root.find('Focus') is not None else None
                }

                # Extract Q&A pairs
                qapairs = root.find('QAPairs')
                if qapairs is not None:
                    for qapair in qapairs.findall('QAPair'):
                        question_elem = qapair.find('Question')
                        answer_elem = qapair.find('Answer')

                        qa_entry = doc_info.copy()
                        qa_entry.update({
                            'question_id': question_elem.get('qid') if question_elem is not None else None,
                            'question_type': question_elem.get('qtype') if question_elem is not None else None,
                            'question': question_elem.text if question_elem is not None else None,
                            'answer': answer_elem.text if answer_elem is not None else None
                        })

                        xml_data[subdir.name].append(qa_entry)

            except Exception as e:
                print(f"   ⚠ Error loading {xml_file.name}: {str(e)}")

    # Convert to DataFrames
    xml_frames = {}
    total_qa_pairs = 0
    for source_dir, data in xml_data.items():
        if data:
            xml_frames[source_dir] = pd.DataFrame(data)
            total_qa_pairs += len(data)
            print(f"   ✓ {source_dir}: {len(data)} Q&A pairs")

    print(f"\n   Total Q&A pairs extracted: {total_qa_pairs}")
else:
    print("   ⚠ MedQuAD directory not found")

# Summary
print("\n" + "=" * 80)
print("LOADING COMPLETE - SUMMARY")
print("=" * 80)
print(f"CSV files loaded: {len(csv_frames)}")
print(f"XML source directories processed: {len(xml_frames)}")


# Display sample data
print("\n" + "=" * 80)
print("SAMPLE DATA PREVIEW")
print("=" * 80)

if csv_frames:
    print("\n--- CSV: Dataset (first 3 rows) ---")
    if 'dataset' in csv_frames:
        print(csv_frames['dataset'].head(3))

if xml_frames:
    first_xml_key = list(xml_frames.keys())[0]
    print(f"\n--- XML: {first_xml_key} (first 2 Q&A pairs) ---")
    print(xml_frames[first_xml_key][['focus', 'question', 'answer']].head(2))

LOADING ALL DATA FILES

2. Loading CSV files...
   ✓ Loaded: dataset.csv (4920 rows)
   ✓ Loaded: symptom_Description.csv (41 rows)
   ✓ Loaded: symptom_precaution.csv (41 rows)
   ✓ Loaded: Symptom-severity.csv (133 rows)

3. Loading XML files from MedQuAD...
   Found 9 subdirectories
   Processing 1_CancerGov_QA: 116 files
   Processing 2_GARD_QA: 2685 files
   Processing 3_GHR_QA: 1086 files
   Processing 4_MPlus_Health_Topics_QA: 981 files
   Processing 5_NIDDK_QA: 157 files
   Processing 6_NINDS_QA: 277 files
   Processing 7_SeniorHealth_QA: 48 files
   Processing 8_NHLBI_QA_XML: 88 files
   Processing 9_CDC_QA: 59 files
   ✓ 1_CancerGov_QA: 729 Q&A pairs
   ✓ 2_GARD_QA: 5394 Q&A pairs
   ✓ 3_GHR_QA: 5430 Q&A pairs
   ✓ 4_MPlus_Health_Topics_QA: 981 Q&A pairs
   ✓ 5_NIDDK_QA: 1192 Q&A pairs
   ✓ 6_NINDS_QA: 1088 Q&A pairs
   ✓ 7_SeniorHealth_QA: 769 Q&A pairs
   ✓ 8_NHLBI_QA_XML: 559 Q&A pairs
   ✓ 9_CDC_QA: 270 Q&A pairs

   Total Q&A pairs extracted: 16412

LOADING COMPLETE - SU

In [None]:
import pathlib
import pandas as pd
import xml.etree.ElementTree as ET
from collections import defaultdict
import re
from sklearn.model_selection import train_test_split

print("=" * 80)
print("STEP 1: LOADING ALL DATA FILES")
print("=" * 80)

# Load CSV files from medi_bot directory
print("\n1. Loading CSV files...")
csv_files = {
    "dataset": "dataset.csv",
    "symptom_description": "symptom_Description.csv",
    "symptom_precaution": "symptom_precaution.csv",
    "symptom_severity": "Symptom-severity.csv"
}

csv_frames = {}
for name, filename in csv_files.items():
    csv_path = pathlib.Path(filename)
    if csv_path.exists():
        csv_frames[name] = pd.read_csv(csv_path)
        print(f"   ✓ Loaded: {filename} ({len(csv_frames[name])} rows)")
    else:
        print(f"   ⚠ Not found: {filename}")

# Load XML files from MedQuAD directories
print("\n2. Loading XML files from MedQuAD...")
medquad_dir = pathlib.Path("MedQuAD")
xml_data = defaultdict(list)

if medquad_dir.exists():
    subdirs = [d for d in medquad_dir.iterdir() if d.is_dir()]
    print(f"   Found {len(subdirs)} subdirectories")

    for subdir in subdirs:
        xml_files = list(subdir.glob("*.xml"))
        print(f"   Processing {subdir.name}: {len(xml_files)} files")

        for xml_file in xml_files:
            try:
                tree = ET.parse(xml_file)
                root = tree.getroot()

                doc_info = {
                    'source_dir': subdir.name,
                    'file': xml_file.name,
                    'doc_id': root.get('id'),
                    'source': root.get('source'),
                    'url': root.get('url'),
                    'focus': root.find('Focus').text if root.find('Focus') is not None else None
                }

                # Extract Q&A pairs
                qapairs = root.find('QAPairs')
                if qapairs is not None:
                    for qapair in qapairs.findall('QAPair'):
                        question_elem = qapair.find('Question')
                        answer_elem = qapair.find('Answer')

                        qa_entry = doc_info.copy()
                        qa_entry.update({
                            'question_id': question_elem.get('qid') if question_elem is not None else None,
                            'question_type': question_elem.get('qtype') if question_elem is not None else None,
                            'question': question_elem.text if question_elem is not None else None,
                            'answer': answer_elem.text if answer_elem is not None else None
                        })

                        xml_data[subdir.name].append(qa_entry)

            except Exception as e:
                print(f"   ⚠ Error loading {xml_file.name}: {str(e)}")

    # Convert to DataFrames
    xml_frames = {}
    total_qa_pairs = 0
    for source_dir, data in xml_data.items():
        if data:
            xml_frames[source_dir] = pd.DataFrame(data)
            total_qa_pairs += len(data)
            print(f"   ✓ {source_dir}: {len(data)} Q&A pairs")

    print(f"\n   Total Q&A pairs extracted: {total_qa_pairs}")
else:
    print("   ⚠ MedQuAD directory not found")

print("\n" + "=" * 80)
print("STEP 2: CLEANING AND NORMALIZING DATA")
print("=" * 80)

# Text cleaning function
def clean_text(text):
    """Clean and normalize text data"""
    if pd.isna(text) or text is None:
        return ""

    text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Strip leading/trailing whitespace
    text = text.strip()

    return text

print("\n" + "=" * 80)
print("STEP 3: CONVERTING TO UNIFIED FORMAT")
print("=" * 80)

unified_data = []

# 1. Process Symptom-Disease Dataset
print("\n1. Processing Symptom-Disease dataset...")
if 'dataset' in csv_frames and 'symptom_description' in csv_frames and 'symptom_precaution' in csv_frames:
    dataset_df = csv_frames['dataset']
    description_df = csv_frames['symptom_description']
    precaution_df = csv_frames['symptom_precaution']

    # Merge datasets
    disease_info = description_df.set_index('Disease').to_dict('index')
    precaution_info = precaution_df.set_index('Disease').to_dict('index')

    for idx, row in dataset_df.iterrows():
        disease = row['Disease']

        # Collect symptoms
        symptoms = []
        for col in dataset_df.columns:
            if col.startswith('Symptom_'):
                symptom = row[col]
                if pd.notna(symptom) and symptom.strip():
                    symptoms.append(clean_text(symptom))

        if symptoms and disease:
            # Create symptom-based query
            symptoms_text = ", ".join(symptoms[:5])  # Limit to 5 symptoms
            input_text = f"User: I have {symptoms_text}. What could this be? Bot:"

            # Get description
            description = ""
            if disease in disease_info and 'Description' in disease_info[disease]:
                description = clean_text(disease_info[disease]['Description'])

            # Get precautions
            precautions = []
            if disease in precaution_info:
                for i in range(1, 5):
                    prec_key = f'Precaution_{i}'
                    if prec_key in precaution_info[disease]:
                        prec = precaution_info[disease][prec_key]
                        if pd.notna(prec) and prec.strip():
                            precautions.append(clean_text(prec))

            # Build target text
            target_text = f"You may be experiencing {clean_text(disease)}."
            if description:
                target_text += f" {description}"
            if precautions:
                target_text += f" Recommended precautions: {', '.join(precautions)}."

            unified_data.append({
                'input_text': input_text,
                'target_text': clean_text(target_text),
                'source': 'symptom_disease',
                'disease': disease
            })

    print(f"   ✓ Processed {len([d for d in unified_data if d['source'] == 'symptom_disease'])} symptom-disease pairs")

# 2. Process MedQuAD Q&A Dataset
print("\n2. Processing MedQuAD Q&A dataset...")
for source_dir, df in xml_frames.items():
    count = 0
    for idx, row in df.iterrows():
        question = row['question']
        answer = row['answer']

        if pd.notna(question) and pd.notna(answer) and question.strip() and answer.strip():
            # Clean and format
            clean_question = clean_text(question)
            clean_answer = clean_text(answer)

            # Skip if too short or too long
            if len(clean_answer) < 20 or len(clean_answer) > 1000:
                continue

            input_text = f"User: {clean_question} Bot:"
            target_text = clean_answer

            unified_data.append({
                'input_text': input_text,
                'target_text': target_text,
                'source': f'medquad_{source_dir}',
                'focus': row.get('focus', '')
            })
            count += 1

    print(f"   ✓ Processed {count} Q&A pairs from {source_dir}")

print("\n" + "=" * 80)
print("STEP 4: CREATING FINAL DATASET")
print("=" * 80)

# Convert to DataFrame
full_df = pd.DataFrame(unified_data)

# Remove duplicates
print(f"\nBefore deduplication: {len(full_df)} entries")
full_df = full_df.drop_duplicates(subset=['input_text', 'target_text'])
print(f"After deduplication: {len(full_df)} entries")

# Remove entries with empty text
full_df = full_df[
    (full_df['input_text'].str.len() > 10) &
    (full_df['target_text'].str.len() > 10)
]
print(f"After filtering short entries: {len(full_df)} entries")

# Shuffle the data
full_df = full_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train and validation
train_df, val_df = train_test_split(full_df, test_size=0.1, random_state=42)

print(f"\n✓ Training set: {len(train_df)} entries")
print(f"✓ Validation set: {len(val_df)} entries")

# Save to CSV
train_path = pathlib.Path("train.csv")
val_path = pathlib.Path("val.csv")
full_path = pathlib.Path("full_dataset.csv")

train_df[['input_text', 'target_text']].to_csv(train_path, index=False)
val_df[['input_text', 'target_text']].to_csv(val_path, index=False)
full_df.to_csv(full_path, index=False)

print(f"\n✓ Saved training data to: {train_path}")
print(f"✓ Saved validation data to: {val_path}")
print(f"✓ Saved full dataset to: {full_path}")

print("\n" + "=" * 80)
print("SAMPLE DATA PREVIEW")
print("=" * 80)

print("\n--- Training Samples (first 3) ---")
for i in range(min(3, len(train_df))):
    print(f"\n[{i+1}]")
    print(f"INPUT:  {train_df.iloc[i]['input_text'][:100]}...")
    print(f"TARGET: {train_df.iloc[i]['target_text'][:100]}...")

print("\n" + "=" * 80)
print("DATA SOURCE DISTRIBUTION")
print("=" * 80)
print(full_df['source'].value_counts())

print("\n" + "=" * 80)
print("✅ PREPROCESSING COMPLETE!")
print("=" * 80)

STEP 1: LOADING ALL DATA FILES

1. Loading CSV files...
   ✓ Loaded: dataset.csv (4920 rows)
   ✓ Loaded: symptom_Description.csv (41 rows)
   ✓ Loaded: symptom_precaution.csv (41 rows)
   ✓ Loaded: Symptom-severity.csv (133 rows)

2. Loading XML files from MedQuAD...
   Found 9 subdirectories
   Processing 1_CancerGov_QA: 116 files
   Processing 2_GARD_QA: 2685 files
   Processing 3_GHR_QA: 1086 files
   Processing 4_MPlus_Health_Topics_QA: 981 files
   Processing 5_NIDDK_QA: 157 files
   Processing 6_NINDS_QA: 277 files
   Processing 7_SeniorHealth_QA: 48 files
   Processing 8_NHLBI_QA_XML: 88 files
   Processing 9_CDC_QA: 59 files
   ✓ 1_CancerGov_QA: 729 Q&A pairs
   ✓ 2_GARD_QA: 5394 Q&A pairs
   ✓ 3_GHR_QA: 5430 Q&A pairs
   ✓ 4_MPlus_Health_Topics_QA: 981 Q&A pairs
   ✓ 5_NIDDK_QA: 1192 Q&A pairs
   ✓ 6_NINDS_QA: 1088 Q&A pairs
   ✓ 7_SeniorHealth_QA: 769 Q&A pairs
   ✓ 8_NHLBI_QA_XML: 559 Q&A pairs
   ✓ 9_CDC_QA: 270 Q&A pairs

   Total Q&A pairs extracted: 16412

STEP 2: CLEAN

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
from torch.optim import AdamW
import pathlib

print("="*80)
print("MEDICAL CHATBOT - GPT2 FINE-TUNING")
print("="*80 + "\n")

# Load the preprocessed datasets
print("Loading datasets...")
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")

print(f"✓ Training samples: {len(train_df)}")
print(f"✓ Validation samples: {len(val_df)}\n")

# Show sample
print("Sample training data:")
print(f"INPUT:  {train_df.iloc[0]['input_text'][:100]}...")
print(f"TARGET: {train_df.iloc[0]['target_text'][:100]}...\n")

# PyTorch Dataset class
class MedicalQADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.texts = []
        for _, row in dataframe.iterrows():
            # Combine input and target for training
            combined = f"{row['input_text']} {row['target_text']}"
            self.texts.append(combined)

        self.encodings = tokenizer(
            self.texts,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings.input_ids[idx],
            'attention_mask': self.encodings.attention_mask[idx],
            'labels': self.encodings.input_ids[idx]
        }

# Setup tokenizer
print("Initializing GPT-2 tokenizer and model...")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Create datasets and dataloaders
train_dataset = MedicalQADataset(train_df, tokenizer)
val_dataset = MedicalQADataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

print(f"✓ Batches per epoch: {len(train_loader)}\n")

# Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.to(device)

optimizer = AdamW(model.parameters(), lr=3e-5)

print("\n" + "="*80)
print("STARTING FINE-TUNING...")
print("="*80 + "\n")

# Create models directory
pathlib.Path("models").mkdir(exist_ok=True)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    print(f"\n{'='*80}")
    print(f"EPOCH {epoch + 1}/{num_epochs}")
    print(f"{'='*80}\n")

    # Training phase
    model.train()
    train_losses = []

    for step, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        train_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 100 == 0:
            print(f"  Step {step:4d}/{len(train_loader)} | Loss: {loss.item():.4f}")

    avg_train_loss = sum(train_losses) / len(train_losses)

    # Validation phase
    model.eval()
    val_losses = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            val_losses.append(outputs.loss.item())

    avg_val_loss = sum(val_losses) / len(val_losses)
    val_perplexity = torch.exp(torch.tensor(avg_val_loss))

    print(f"\n  Epoch {epoch + 1} Summary:")
    print(f"  ├─ Average Train Loss: {avg_train_loss:.4f}")
    print(f"  ├─ Validation Loss: {avg_val_loss:.4f}")
    print(f"  └─ Validation Perplexity: {val_perplexity:.2f}")

    # Save checkpoint
    checkpoint_path = f"models/medical-gpt2-epoch{epoch+1}"
    model.save_pretrained(checkpoint_path)
    tokenizer.save_pretrained(checkpoint_path)
    print(f"  ✓ Checkpoint saved: {checkpoint_path}")

print(f"\n{'='*80}")
print("SAVING FINAL MODEL...")
print(f"{'='*80}\n")

final_model_path = "models/medical-gpt2-final"
model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"✓ Final model saved to: {final_model_path}")
print("✓ Fine-tuning complete!\n")

# Test the model
print("="*80)
print("TESTING MODEL WITH SAMPLE MEDICAL QUESTIONS")
print("="*80 + "\n")

model.eval()
test_questions = [
    "I have fever and cough. What could this be?",
    "What are the symptoms of diabetes?",
    "What is hypertension?",
]

with torch.no_grad():
    for question in test_questions:
        prompt = f"User: {question} Bot:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract answer after "Bot:"
        if "Bot:" in generated:
            answer = generated.split("Bot:", 1)[-1].strip()
        else:
            answer = generated

        print(f"Question: {question}")
        print(f"Answer: {answer}\n")
        print("-" * 80 + "\n")

print("="*80)
print("✅ TRAINING AND TESTING COMPLETE!")
print("="*80)
print(f"\n📁 Model saved in: {final_model_path}")
print(f"🎯 Ready for deployment!")

MEDICAL CHATBOT - GPT2 FINE-TUNING

Loading datasets...
✓ Training samples: 8437
✓ Validation samples: 938

Sample training data:
INPUT:  User: how many people are affected by spastic paraplegia type 15 ? Bot:...
TARGET: spastic paraplegia type 15 is a rare condition, although its exact prevalence is unknown....

Initializing GPT-2 tokenizer and model...
✓ Batches per epoch: 2110

Using device: cuda

STARTING FINE-TUNING...


EPOCH 1/10

  Step    0/2110 | Loss: 7.7596
  Step  100/2110 | Loss: 0.6746
  Step  200/2110 | Loss: 0.6078
  Step  300/2110 | Loss: 0.6406
  Step  400/2110 | Loss: 0.7837
  Step  500/2110 | Loss: 0.8707
  Step  600/2110 | Loss: 0.2789
  Step  700/2110 | Loss: 0.5987
  Step  800/2110 | Loss: 0.4265
  Step  900/2110 | Loss: 0.5548
  Step 1000/2110 | Loss: 0.6159
  Step 1100/2110 | Loss: 0.6946
  Step 1200/2110 | Loss: 0.5218
  Step 1300/2110 | Loss: 0.4272
  Step 1400/2110 | Loss: 0.8011
  Step 1500/2110 | Loss: 0.5628
  Step 1600/2110 | Loss: 0.6015
  Step 1700/21

In [5]:
from huggingface_hub import notebook_login

# Login to Hugging Face (you'll need to create an account first at https://huggingface.co)
print("="*60)
print("HUGGING FACE LOGIN")
print("="*60 + "\n")
print("Please login to Hugging Face:")
print("1. Go to https://huggingface.co and create an account if you don't have one")
print("2. Go to https://huggingface.co/settings/tokens")
print("3. Create a new token (Write access)")
print("4. Paste it below\n")

notebook_login()   

HUGGING FACE LOGIN

Please login to Hugging Face:
1. Go to https://huggingface.co and create an account if you don't have one
2. Go to https://huggingface.co/settings/tokens
3. Create a new token (Write access)
4. Paste it below



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from huggingface_hub import HfApi, create_repo
import os
from pathlib import Path

print("="*60)
print("UPLOADING FILES INDIVIDUALLY")
print("="*60 + "\n")

USERNAME = "Branis333"
MODEL_NAME = "symptom-gpt2-chatbot"
repo_id = f"{USERNAME}/{MODEL_NAME}"

api = HfApi()

# Create repo
try:
    create_repo(repo_id=repo_id, private=False, exist_ok=True)
    print(f"✓ Repository ready\n")
except:
    pass

# Files to upload
model_dir = Path("models/medical-gpt2-final")
files_to_upload = [
    "config.json",
    "model.safetensors",
    "generation_config.json",
    "tokenizer_config.json",
    "tokenizer.json",
    "vocab.json",
    "merges.txt",
]

print("Uploading files one by one...\n")
for filename in files_to_upload:
    file_path = model_dir / filename
    
    if file_path.exists():
        try:
            print(f"Uploading {filename}...", end=" ")
            api.upload_file(
                path_or_fileobj=str(file_path),
                path_in_repo=filename,
                repo_id=repo_id,
                repo_type="model",
            )
            print("✓")
        except Exception as e:
            print(f"❌ {e}")
    else:
        print(f"⚠ {filename} not found")

print("\n" + "="*60)
print("✅ UPLOAD COMPLETE!")
print("="*60)
print(f"\nCheck your model: https://huggingface.co/{repo_id}")

UPLOADING FILES INDIVIDUALLY

Uploading files one by one...

Uploading config.json... ✓
Uploading model.safetensors... 

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/53/f4/53f442902cdb8ab2d0aec6330c1088930be3bff994a004bb1ced28e3169720af/c79c3f36213942b24eb6056c79803446fdbebe8572b57dccea135d49db47caa6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20251015%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251015T103412Z&X-Amz-Expires=86400&X-Amz-Signature=d2df209ab4d54f4b3aff381370228f846c495a1400fb11612491fd4a73528cb5&X-Amz-SignedHeaders=host&partNumber=1&uploadId=vVhZG7qbV_F98_LmLN1PXWjyjiuCh23MyP.pSPpZHCAyP_gQDyVfcukOsTVltGiJrmr2NRtDKBDeuWWfNKI2ZeWebna.l_itbC50tHWAbfablxtFYFKqt3rMFfcl4_MH&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2406)')))"), '(Request ID: 7ced9562-bba1-4867-86ff-5d50a6fae8f3)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/53/f4/5

✓
Uploading generation_config.json... ✓
Uploading tokenizer_config.json... ✓
⚠ tokenizer.json not found
Uploading vocab.json... ✓
Uploading merges.txt... ✓

✅ UPLOAD COMPLETE!

Check your model: https://huggingface.co/Branis333/symptom-gpt2-chatbot


In [11]:
from huggingface_hub import HfApi
import time

print("="*60)
print("CREATING AND UPLOADING MODEL CARD")
print("="*60 + "\n")

USERNAME = "Branis333"
MODEL_NAME = "symptom-gpt2-chatbot"
repo_id = f"{USERNAME}/{MODEL_NAME}"

model_card = f"""---
language: en
license: mit
tags:
- medical
- healthcare
- chatbot
- question-answering
- symptoms
- diseases
- gpt2
datasets:
- custom-medical-qa
- medquad
- symptom-disease-dataset
metrics:
- perplexity
- loss
model-index:
- name: {MODEL_NAME}
  results:
  - task:
      type: text-generation
      name: Medical Q&A Generation
    metrics:
    - name: Perplexity
      type: perplexity
      value: 1.50
    - name: Final Validation Loss
      type: loss
      value: 0.4024
---

# 🏥 Medical Symptom Chatbot - GPT2 Fine-tuned

A specialized GPT-2 model fine-tuned on medical Q&A data to assist with symptom analysis, disease information, and health-related questions.

## 🎯 Model Description

This model is based on GPT-2 and has been fine-tuned on a comprehensive medical dataset combining:
- **Symptom-Disease mappings** with descriptions and precautions
- **MedQuAD dataset** with expert medical Q&A pairs
- Custom medical knowledge base

**⚠️ IMPORTANT DISCLAIMER:** This model is for informational and educational purposes only. Always consult qualified healthcare professionals for medical advice, diagnosis, or treatment.

## 📊 Training Details

### Dataset Statistics
- **Total Training Samples:** 8,437
- **Validation Samples:** 938
- **Total Dataset Size:** 9,375 medical Q&A pairs

### Training Configuration
- **Base Model:** GPT-2 (124M parameters)
- **Training Epochs:** 10
- **Batch Size:** 4
- **Learning Rate:** 3e-5
- **Optimizer:** AdamW
- **Max Sequence Length:** 512 tokens
- **Hardware:** NVIDIA GPU (CUDA enabled)
- **Training Time:** ~3.5 hours

### Performance Metrics

| Epoch | Train Loss | Val Loss | Perplexity |
|-------|------------|----------|------------|
| 1     | 0.5518     | 0.4664   | 1.59       |
| 2     | 0.4553     | 0.4366   | 1.55       |
| 3     | 0.4162     | 0.4196   | 1.52       |
| 4     | 0.3865     | 0.4088   | 1.51       |
| 5     | 0.3621     | 0.4015   | 1.49       |
| 6     | 0.3415     | 0.3975   | 1.49       |
| 7     | 0.3233     | 0.3988   | 1.49       |
| 8     | 0.3069     | 0.3984   | 1.49       |
| 9     | 0.2917     | 0.3977   | 1.49       |
| **10** | **0.2781** | **0.4024** | **1.50** |

**Final Model Performance:**
- ✅ Training Loss: **0.2781**
- ✅ Validation Loss: **0.4024**
- ✅ Validation Perplexity: **1.50**

## 🚀 Usage

### Basic Usage

```python
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load model and tokenizer
model_name = "{repo_id}"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Generate response
question = "I have fever and cough. What could this be?"
prompt = f"User: {{question}} Bot:"

inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(
    **inputs,
    max_new_tokens=150,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer = response.split("Bot:")[-1].strip()
print(answer)
```

## 💡 Example Queries

### Symptom Analysis
```
User: I have fever and cough. What could this be?
Bot: You may be experiencing a respiratory infection...
```

### Disease Information
```
User: What are the symptoms of diabetes?
Bot: Common symptoms include increased thirst, frequent urination...
```

## 📁 Dataset Sources

1. **Kaggle Symptom-Disease Dataset** - Disease descriptions, symptom mappings, precautions
2. **MedQuAD** - Expert-curated medical Q&A from multiple domains

## ⚠️ Limitations

1. **Not a Medical Professional**: Cannot replace professional medical advice
2. **Training Data Bias**: Limited to information in training data
3. **Hallucination Risk**: May generate plausible but incorrect information
4. **Language**: Primarily English medical texts

## 🔒 Ethical Considerations

- **Informational Only**: Should not be used for self-diagnosis
- **Professional Consultation Required**: Always seek medical professionals for health concerns
- **Verification**: Cross-check any medical information with reliable sources

## 📄 License

MIT License - Free to use with attribution

---

**Built with ❤️ using Hugging Face Transformers**

*Last Updated: October 2024*
"""

# Step 1: Save locally first
print("Step 1: Saving README.md locally...")
with open("README.md", "w", encoding="utf-8") as f:
    f.write(model_card)
print("✓ Saved to README.md\n")

# Step 2: Upload with retry logic
print("Step 2: Uploading to Hugging Face...")
api = HfApi()

max_retries = 3
for attempt in range(max_retries):
    try:
        print(f"  Attempt {attempt + 1}/{max_retries}...", end=" ")
        
        api.upload_file(
            path_or_fileobj="README.md",
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="model",
        )
        
        print("✓")
        print("\n" + "="*60)
        print("✅ README CARD UPLOADED SUCCESSFULLY!")
        print("="*60)
        print(f"\nView your model page: https://huggingface.co/{repo_id}")
        break
        
    except Exception as e:
        print(f"❌")
        print(f"  Error: {str(e)[:100]}...")
        
        if attempt < max_retries - 1:
            wait_time = (attempt + 1) * 5
            print(f"  Waiting {wait_time} seconds before retry...")
            time.sleep(wait_time)
        else:
            print("\n" + "="*60)
            print("❌ UPLOAD FAILED AFTER ALL RETRIES")
            print("="*60)
            print("\nManual upload option:")
            print(f"1. Go to: https://huggingface.co/{repo_id}/tree/main")
            print(f"2. Click 'Add file' > 'Upload files'")
            print(f"3. Upload the README.md file from your current directory")

CREATING AND UPLOADING MODEL CARD

Step 1: Saving README.md locally...
✓ Saved to README.md

Step 2: Uploading to Hugging Face...
  Attempt 1/3... ✓

✅ README CARD UPLOADED SUCCESSFULLY!

View your model page: https://huggingface.co/Branis333/symptom-gpt2-chatbot
