<a href="https://colab.research.google.com/github/Astro2350/CT-Train/blob/main/CT_Test2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocess

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
import os
import pickle

# Fetch the NLTK corpora this notebook relies on: the stop-word lists and
# WordNet (used by the lemmatizer).
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Text normalisation helpers (lower-case, strip non-letters, drop stop words, lemmatize)

# Compiled once: matches every character that is neither an ASCII letter nor whitespace.
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Lazily filled cache holding the lemmatizer and the English stop-word set so
# they are built once instead of on every preprocess_text call.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise a free-text string for modelling.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, English stop words are dropped, and each remaining
    token is lemmatized with WordNet.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    lemmatizer, stop_words = _get_text_resources()
    text = text.lower()
    text = _NON_ALPHA_PATTERN.sub('', text)
    # Stop words are filtered before lemmatization, on the punctuation-free tokens.
    return ' '.join(
        lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
    )

# Read the two raw CSV inputs (paths are relative to the working directory)
categories_df = pd.read_csv('list_of_categories.csv')
train_df = pd.read_csv('TRAIN.csv')

print("Preprocessing data...")

# Normalise headers on both frames: trim, lower-case, spaces -> underscores
for raw_frame in (train_df, categories_df):
    raw_frame.columns = raw_frame.columns.str.strip().str.lower().str.replace(' ', '_')

# Left-join the category table onto the transactions via the primary category id
merged_df = pd.merge(
    train_df,
    categories_df,
    how='left',
    left_on='primary_category_id',
    right_on='id',
)

# Build the free-text feature: blank out missing pieces, join them with
# spaces, then run the text cleaner over the result
for text_column in ('name', 'gl_description', 'memo'):
    merged_df[text_column] = merged_df[text_column].fillna('')
joined_text = merged_df['name'] + ' ' + merged_df['gl_description'] + ' ' + merged_df['memo']
merged_df['combined_text'] = joined_text.apply(preprocess_text)

# Provisional target encoding; it is only used to find singleton classes and
# is replaced by a re-fitted encoder after filtering
label_encoder = LabelEncoder()
merged_df['matched_category_id_encoded'] = label_encoder.fit_transform(merged_df['matched_category_id'])
print(f"Number of target classes before filtering: {len(label_encoder.classes_)}")

# Integer-encode the plain categorical columns (ids are treated as strings,
# missing department names become 'Unknown')
merged_df['hospital_system_id'] = merged_df['hospital_system_id'].astype(str)
merged_df['department_name'] = merged_df['department_name'].fillna('Unknown')
for source_column in ('hospital_system_id', 'department_name'):
    merged_df[f'{source_column}_encoded'] = LabelEncoder().fit_transform(merged_df[source_column])

# Integer-encode the six category hierarchy levels, category0 .. category5
for level in range(6):
    level_column = f'category{level}'
    filled_level = merged_df[level_column].fillna('Unknown')
    merged_df[level_column] = filled_level
    merged_df[f'{level_column}_encoded'] = LabelEncoder().fit_transform(filled_level)

# Drop every row whose target class occurs exactly once
class_counts = merged_df['matched_category_id_encoded'].value_counts()
rare_classes = class_counts[class_counts.eq(1)].index
keep_mask = ~merged_df['matched_category_id_encoded'].isin(rare_classes)
filtered_df = merged_df.loc[keep_mask].copy()  # independent copy, so the assignment below is warning-free

# Fit a fresh encoder on the surviving rows so the labels are contiguous
label_encoder = LabelEncoder()
filtered_df['matched_category_id_encoded'] = label_encoder.fit_transform(filtered_df['matched_category_id'])
num_classes = filtered_df['matched_category_id_encoded'].nunique()
print(f"Number of classes after filtering: {num_classes}")

# Persist the processed frame ...
filtered_df.to_csv('/content/processed_train_data.csv', index=False)

# ... and the re-fitted target encoder, which the training step loads back
with open('/content/label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print("Data preprocessing complete and saved to '/content/processed_train_data.csv'!")

# Training

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
from torch.cuda.amp import autocast, GradScaler
from sklearn.utils.class_weight import compute_class_weight
import pickle
from tqdm.notebook import tqdm
import os
import csv
import json

# Restore the artefacts written by the preprocessing step: the processed
# frame first, then the pickled target encoder
train_df = pd.read_csv('/content/processed_train_data.csv')
with open('/content/label_encoder.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

# The class count is taken from the encoded target column of the processed data
encoded_targets = train_df['matched_category_id_encoded']
num_classes = encoded_targets.nunique()
print(f"Number of classes from processed data: {num_classes}")

# Pretrained DistilBERT tokenizer and encoder, both from the same checkpoint name
pretrained_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)
bert_model = DistilBertModel.from_pretrained(pretrained_name)

# Custom dataset class
class TransactionDataset(Dataset):
    """Map-style dataset that turns one dataframe row into model-ready tensors.

    Args:
        dataframe: Frame holding ``combined_text``, the ``*_encoded`` feature
            columns, ``primary_category_id``, ``amount`` and
            ``matched_category_id_encoded``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')`` that returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Sequence length passed to the tokenizer.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Build the feature/target dictionary for the row at position ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D long tensors),
            ``cat_features`` (3 longs), ``hier_features`` (6 longs),
            ``amount`` (1 float) and ``target`` (scalar long).
        """
        row = self.data.iloc[index]

        # Text that was cleaned down to an empty string is written to CSV as
        # an empty field and read back as NaN (a float); the tokenizer needs
        # a str, so treat any non-string value as empty text.
        text = row['combined_text']
        if not isinstance(text, str):
            text = ''

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop only the leading batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length is 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Categorical features, in the order the model's embedding tables expect.
        # NOTE(review): primary_category_id is used as a raw (un-encoded) index;
        # confirm its values fit the embedding table size used by the model.
        cat_features = torch.tensor(
            [
                row['hospital_system_id_encoded'],
                row['primary_category_id'],
                row['department_name_encoded'],
            ],
            dtype=torch.long,
        )

        # Hierarchical features: the six encoded category levels, category0..category5
        hier_features = torch.tensor(
            [row[f'category{level}_encoded'] for level in range(6)],
            dtype=torch.long,
        )

        # Amount feature, kept as a length-1 vector so it can be concatenated per sample
        amount = torch.tensor([row['amount']], dtype=torch.float)

        # Target label
        target = torch.tensor(row['matched_category_id_encoded'], dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'cat_features': cat_features,
            'hier_features': hier_features,
            'amount': amount,
            'target': target
        }

# Split the processed data into training and validation sets (80/20,
# stratified on the encoded target so both parts see the same class mix)
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['matched_category_id_encoded'])

# Create DataLoaders
batch_size = 16
train_dataset = TransactionDataset(train_data, tokenizer)
val_dataset = TransactionDataset(val_data, tokenizer)

# The model uses BatchNorm1d, which raises "Expected more than 1 value per
# channel when training" on a batch of size 1. Drop the trailing batch only
# when it would contain exactly one sample; otherwise keep every sample.
drop_singleton_batch = len(train_dataset) % batch_size == 1
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=drop_singleton_batch)
# Validation runs in eval mode, where a size-1 batch is fine, so nothing is dropped.
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Define the model
class HierarchicalCategoryModel(nn.Module):
    """Classifier that fuses a text encoder with embedded categorical features.

    The first-token hidden state of the encoder is concatenated with one
    embedding per categorical feature, one embedding per hierarchy level and
    the raw amount, then passed through a three-layer MLP with batch
    normalisation and dropout to produce class logits.

    Args:
        bert_model: Encoder called as ``bert_model(input_ids=..., attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        num_cat_features: Number of columns in ``cat_features``.
        num_hier_features: Number of columns in ``hier_features``.
        num_classes: Number of output logits.
        embedding_vocab_size: Rows in every embedding table; each categorical
            index must be in ``[0, embedding_vocab_size)``.
        embedding_dim: Width of every categorical/hierarchical embedding.
        bert_dim: Hidden size of the encoder. When ``None`` it is read from
            ``bert_model.config.hidden_size`` if available, else 768.
    """

    def __init__(self, bert_model, num_cat_features=3, num_hier_features=6, num_classes=357,
                 embedding_vocab_size=10000, embedding_dim=32, bert_dim=None):
        super().__init__()
        self.bert = bert_model
        self.bert_dropout = nn.Dropout(0.1)

        if bert_dim is None:
            # Fall back to 768 (the previous hard-coded value) when the
            # encoder does not expose a config with a hidden size.
            encoder_config = getattr(bert_model, 'config', None)
            bert_dim = getattr(encoder_config, 'hidden_size', 768)
        self.bert_dim = bert_dim

        # One embedding table per categorical / hierarchical column
        self.cat_embeddings = nn.ModuleList(
            [nn.Embedding(embedding_vocab_size, embedding_dim) for _ in range(num_cat_features)]
        )
        self.hier_embeddings = nn.ModuleList(
            [nn.Embedding(embedding_vocab_size, embedding_dim) for _ in range(num_hier_features)]
        )

        # Widths of the concatenated embedding groups
        self.cat_emb_dim = embedding_dim * num_cat_features
        self.hier_emb_dim = embedding_dim * num_hier_features

        # Encoder output + categorical embeddings + hierarchical embeddings + amount (1)
        self.total_input_dim = self.bert_dim + self.cat_emb_dim + self.hier_emb_dim + 1

        # Fully connected layers
        self.fc1 = nn.Linear(self.total_input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)

        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.batch_norm2 = nn.BatchNorm1d(256)

    def forward(self, input_ids, attention_mask, cat_features, hier_features, amount):
        """Compute class logits for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            cat_features: Long tensor indexed as ``[:, i]`` per categorical feature.
            hier_features: Long tensor indexed as ``[:, i]`` per hierarchy level.
            amount: Float tensor concatenated along dim 1 with the other features.

        Returns:
            Tensor of unnormalised logits with ``num_classes`` columns.
        """
        # Encoder representation: hidden state of the first token of each sequence
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_cls = bert_output.last_hidden_state[:, 0, :]
        bert_cls = self.bert_dropout(bert_cls)

        # Embed each categorical column with its own table and concatenate
        cat_embeddings = torch.cat(
            [emb(cat_features[:, i]) for i, emb in enumerate(self.cat_embeddings)], dim=1
        )

        # Same for the hierarchy levels
        hier_embeddings = torch.cat(
            [emb(hier_features[:, i]) for i, emb in enumerate(self.hier_embeddings)], dim=1
        )

        # Concatenate all features
        combined = torch.cat([bert_cls, cat_embeddings, hier_embeddings, amount], dim=1)

        # MLP head: linear -> batch norm -> ReLU -> dropout, twice, then the output layer
        x = self.fc1(combined)
        x = self.batch_norm1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc2(x)
        x = self.batch_norm2(x)
        x = self.relu(x)
        x = self.dropout(x)

        logits = self.fc3(x)
        return logits

# Pick the compute device: CUDA when it is available, otherwise the CPU
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f"Using device: {device}")

# Build the classifier with as many outputs as there are classes in the processed data
print(f"Setting model num_classes to: {num_classes}")
model = HierarchicalCategoryModel(bert_model, num_classes=num_classes)
model = model.to(device)

# AdamW over every model parameter
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.01, lr=2e-5)

# 'balanced' class weights for the loss. Note that they are computed from the
# whole processed frame (train_df), i.e. training and validation rows together.
encoded_labels = train_df['matched_category_id_encoded']
balanced_weights = compute_class_weight(
    'balanced',
    classes=np.unique(encoded_labels),
    y=encoded_labels,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Step schedule: multiply the learning rate by 0.1 every 5 scheduler steps
scheduler = StepLR(optimizer, gamma=0.1, step_size=5)

# Gradient scaler for mixed precision; train_epoch only uses it on the CUDA path
scaler = GradScaler()

# Training function
def train_epoch(model, dataloader, optimizer, criterion, device, scaler):
    """Train ``model`` for one pass over ``dataloader``.

    On a CUDA device the forward pass and loss run under autocast and the
    backward/optimizer step go through ``scaler``; on any other device a plain
    full-precision step is used and ``scaler`` is not touched.

    Args:
        model: Module called with ``input_ids``, ``attention_mask``,
            ``cat_features``, ``hier_features`` and ``amount`` keyword arguments.
        dataloader: Iterable of batches (dicts holding those five keys plus ``target``).
        optimizer: Optimizer updating the model parameters.
        criterion: Loss called as ``criterion(outputs, targets)``.
        device: ``torch.device`` the batch tensors are moved to.
        scaler: Gradient scaler used on the CUDA path.

    Returns:
        tuple: ``(mean loss per batch, accuracy)`` where accuracy is a 0-d
        float tensor. Both are 0 when the dataloader yields no batches.
    """
    model.train()
    epoch_loss = 0.0
    # Start from a tensor so the final .float() also works for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    total_predictions = 0
    use_amp = device.type == 'cuda'

    progress_bar = tqdm(dataloader, desc="Training")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        cat_features = batch['cat_features'].to(device)
        hier_features = batch['hier_features'].to(device)
        amount = batch['amount'].to(device)
        targets = batch['target'].to(device)

        optimizer.zero_grad()

        if use_amp:
            # torch.autocast takes device_type; torch.cuda.amp.autocast does
            # not accept that keyword and would raise a TypeError here.
            with torch.autocast(device_type=device.type):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    cat_features=cat_features,
                    hier_features=hier_features,
                    amount=amount
                )
                loss = criterion(outputs, targets)
            # Scale the loss before backward, then step/update through the scaler.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                cat_features=cat_features,
                hier_features=hier_features,
                amount=amount
            )
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == targets)
        total_predictions += targets.shape[0]
        epoch_loss += loss.item()
        progress_bar.set_postfix({"loss": loss.item(), "accuracy": (correct_predictions.float() / total_predictions).item()})

    # max(..., 1) keeps an empty dataloader from raising ZeroDivisionError.
    mean_loss = epoch_loss / max(len(dataloader), 1)
    accuracy = correct_predictions.float() / max(total_predictions, 1)
    return mean_loss, accuracy

# Evaluation function
def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Module called with ``input_ids``, ``attention_mask``,
            ``cat_features``, ``hier_features`` and ``amount`` keyword arguments.
        dataloader: Iterable of batches (dicts holding those five keys plus ``target``).
        criterion: Loss called as ``criterion(outputs, targets)``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(mean loss per batch, accuracy)``; accuracy is a 0-d float tensor.
    """
    feature_keys = ('input_ids', 'attention_mask', 'cat_features', 'hier_features', 'amount')

    model.eval()
    running_loss = 0
    num_correct = 0
    num_seen = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluation"):
            # Move the model inputs, then the labels, onto the target device
            inputs = {key: batch[key].to(device) for key in feature_keys}
            targets = batch['target'].to(device)

            outputs = model(**inputs)
            running_loss += criterion(outputs, targets).item()

            # Predicted class = position of the largest logit in each row
            preds = outputs.max(dim=1).indices
            num_correct += (preds == targets).sum()
            num_seen += targets.shape[0]

    mean_loss = running_loss / len(dataloader)
    accuracy = num_correct.float() / num_seen
    return mean_loss, accuracy

# Training loop with early stopping, best-model checkpointing and CSV logging
num_epochs = 3
best_accuracy = 0
patience = 3
epochs_no_improve = 0

checkpoint_dir = "/content/drive/MyDrive/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
log_filepath = "/content/drive/MyDrive/training_logs.csv"
checkpoint_path = f"{checkpoint_dir}/best_model.pth"


def save_checkpoint(model, optimizer, epoch, loss, accuracy, filepath):
    """Write model/optimizer state plus epoch, loss and accuracy to ``filepath``."""
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'accuracy': accuracy
    }
    torch.save(checkpoint, filepath)
    print(f"Checkpoint saved to {filepath}")


# Start a fresh log file containing only the header row
with open(log_filepath, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Epoch', 'Train Loss', 'Train Accuracy', 'Val Loss', 'Val Accuracy'])

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device, scaler)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")

    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

    # Work with plain floats so the best score and the checkpoint do not hold
    # on to device tensors.
    train_accuracy = float(train_acc)
    val_accuracy = float(val_acc)

    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        epochs_no_improve = 0
        save_checkpoint(model, optimizer, epoch + 1, val_loss, val_accuracy, checkpoint_path)
    else:
        epochs_no_improve += 1

    # Log every epoch, including the one that triggers early stopping.
    with open(log_filepath, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([epoch + 1, train_loss, train_accuracy, val_loss, val_accuracy])

    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch + 1} epochs")
        break

    scheduler.step()

    print()

print(f"Best validation accuracy: {best_accuracy:.4f}")

# Restore the best checkpoint (if one exists on disk) before exporting
best_model_path = f"{checkpoint_dir}/best_model.pth"
if os.path.exists(best_model_path):
    checkpoint = torch.load(best_model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded best model from epoch {checkpoint['epoch']} with accuracy {checkpoint['accuracy']:.4f}")
else:
    print("No saved model found. Using the last trained model.")

# Export encoded index -> original category label. tolist() converts NumPy
# scalars to built-in Python types; json.dump cannot serialise NumPy integers.
category_mapping = dict(enumerate(label_encoder.classes_.tolist()))
with open(f"{checkpoint_dir}/category_mapping.json", 'w') as f:
    json.dump(category_mapping, f)
print(f"Category mapping saved to {checkpoint_dir}/category_mapping.json")

print("Training complete!")