
## Web Service Classification using DeepSeek
## Our goal is to build a baseline model with at least 90% accuracy

In [1]:
# Install the third-party packages this notebook imports.
# Note: transformers, datasets and accelerate appear on both of the first two
# lines, and the last line upgrades bitsandbytes after it was already installed.
!pip install seaborn transformers nltk datasets accelerate
!pip install peft transformers accelerate bitsandbytes datasets
!pip install -U bitsandbytes



## 1.  Load Python Modules

In [2]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report,precision_recall_fscore_support
)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

import warnings
import torch
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")
%matplotlib inline

sns.set(style="whitegrid")
os.makedirs("charts", exist_ok=True)
os.makedirs("results", exist_ok=True)
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

import random

def set_seed(seed=42):
    """Seed every random number generator used by this notebook.

    The same seed is passed to Python's ``random`` module, NumPy's global
    generator, ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.
    The cuDNN ``deterministic`` flag is then switched on and ``benchmark``
    is switched off.

    Args:
        seed: Integer seed handed to each generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Ensure deterministic behavior (optional: slower)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)

cuda


## 2. Preprocessing Text

In [None]:
# Fetch the NLTK resources used for tokenizing, stop-word removal and lemmatizing
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(nltk_resource)

# Initialize tools
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a raw description into a space-joined string of processed tokens.

    Steps: lowercase the text, delete every character that is not ``a-z`` or
    whitespace, tokenize, drop English stop words, stem each remaining token
    and finally lemmatize the stemmed token.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``""``.

    Returns:
        The processed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_tokens = (tok for tok in word_tokenize(letters_only) if tok not in stop_words)

    # Stemming happens first; the lemmatizer then runs on the stemmed form
    return " ".join(lemmatizer.lemmatize(stemmer.stem(tok)) for tok in kept_tokens)

## 3. Load and Preprocess Dataset

In [3]:
# === Step 1: Load and Preprocess Dataset ===
def load_data(n):
    """Load the balanced top-``n`` web-service dataset and integer-encode its labels.

    Reads ``Balanced_Top_{n}_Web_Services.csv`` from the working directory,
    drops every row containing a missing value, keeps the description and
    category columns (renamed to ``text`` and ``label``) and replaces each
    category with the integer code assigned by a ``LabelEncoder``.

    Args:
        n: Number of categories; selects which CSV file is read.

    Returns:
        Tuple ``(df, classes)`` where ``df`` has columns ``text`` and ``label``
        and ``classes`` is the fitted encoder's ``classes_`` attribute.
    """
    column_map = {'Service Description': 'text', 'Grouped Category': 'label'}

    raw = pd.read_csv(f"Balanced_Top_{n}_Web_Services.csv")
    df = raw.dropna()[list(column_map)].rename(columns=column_map)

    encoder = LabelEncoder()
    df['label'] = encoder.fit_transform(df['label'])
    return df, encoder.classes_

## 4. Tokenizer and Tokenization

In [4]:
# === Step 2: Tokenizer and Tokenization ===
# Hugging Face model id; the same name is used later to load the classification model.
model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Use the end-of-sequence token for padding (presumably the checkpoint ships
# without a dedicated pad token - TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded with ``padding="max_length"`` and
    ``max_length=256``.

    Args:
        examples: Batch mapping that provides a ``'text'`` entry.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 256}
    return tokenizer(examples['text'], **encode_options)

tokenizer_config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

## 5. Metric Function

In [None]:
def generate_report_csv(report_dict, confusion_mat, class_labels, class_names, output_file="BERT-Report.csv"):
    """Save the per-class rows of a classification report as a CSV under ``results/``.

    The aggregate rows of the report are dropped, a per-class ``accuracy``
    column (diagonal of the confusion matrix divided by its row sum) is added,
    and the ``support`` column is removed before writing.

    Args:
        report_dict: Mapping of row name -> metric dict, e.g. the result of
            ``classification_report(..., output_dict=True)``.
        confusion_mat: Square confusion matrix whose rows follow ``class_labels``.
        class_labels: Integer label ids to keep, in confusion-matrix row order.
        class_names: Sequence mapping a label id to its display name.
        output_file: File name created inside ``results/``.

    Raises:
        KeyError: If a selected class name has no row in ``report_dict``.
    """
    report_df = pd.DataFrame(report_dict).transpose()

    # Filter only the class rows (not avg/macro)
    filtered_labels = [str(class_names[i]) for i in class_labels]
    report_df = report_df.loc[filtered_labels].copy()

    # Per-class accuracy = correct predictions / true samples of that class.
    # A class with no true samples gets 0.0 rather than NaN from 0/0, in line
    # with the zero_division=0 convention used for the other metrics.
    confusion_mat = np.asarray(confusion_mat)
    row_sums = confusion_mat.sum(axis=1)
    diagonal = np.diag(confusion_mat)
    per_class_accuracy = np.divide(
        diagonal,
        row_sums,
        out=np.zeros(row_sums.shape, dtype=float),
        where=row_sums != 0,
    ).round(2)

    report_df['accuracy'] = per_class_accuracy
    report_df = report_df.drop(columns=['support'], errors='ignore')
    report_df = report_df.reset_index().rename(columns={'index': 'Category'})

    output_path = f"results/{output_file}"
    # Create the target folder on demand so the function also works when the
    # notebook's setup cell has not been run in the current working directory.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    report_df.to_csv(output_path, index=False)
    print(f"Classification report saved to: {output_path}")

In [None]:
def plot_confusion_and_report(y_true, y_pred, class_names, n):
    """Plot the confusion matrix, print the classification report and save both.

    The heatmap is written to ``charts/BERT_confusion_matrix_top_{n}.png`` and
    the per-class report to ``results/BERT-Report-Top-{n}.csv`` (through
    ``generate_report_csv``).

    Args:
        y_true: Ground-truth integer labels.
        y_pred: Predicted integer labels.
        class_names: Display names; position ``i`` is assumed to name label
            id ``i`` (as produced by the label encoder in ``load_data``).
        n: Number of categories, used for titles, figure size and file names.

    Returns:
        The classification report as a dictionary.
    """
    # Convert class_names to list of strings
    class_names_str = [str(name) for name in class_names]

    # Pin the label set to every known class id. Without this, a class that is
    # absent from both y_true and y_pred shrinks the matrix/report, so the tick
    # labels no longer line up and classification_report rejects target_names.
    all_labels = list(range(len(class_names_str)))

    cm = confusion_matrix(y_true, y_pred, labels=all_labels)
    figsize = (18, 18) if n >= 40 else (8, 8)
    plt.figure(figsize=figsize)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.title(f"Confusion Matrix - Top {n} Categories")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"charts/BERT_confusion_matrix_top_{n}.png")
    plt.show()

    print(f"classification_report for {n} Web Serivce Categories")

    # classification_report for print
    print(classification_report(
        y_true, y_pred, labels=all_labels, target_names=class_names_str, zero_division=0
    ))

    # classification_report for dictionary
    report_dict = classification_report(
        y_true, y_pred, labels=all_labels, target_names=class_names_str,
        output_dict=True, zero_division=0
    )

    # The CSV only covers classes that actually occur in y_true, so its
    # confusion matrix is restricted to those label ids.
    labels = sorted(np.unique(y_true))
    cm_present = confusion_matrix(y_true, y_pred, labels=labels)

    # Save to CSV
    generate_report_csv(
        report_dict, cm_present, class_labels=labels, class_names=class_names_str,
        output_file=f"BERT-Report-Top-{n}.csv"
    )

    return report_dict

def plot_metrics(bert_results):
    """Draw, save and show one line chart per metric across category counts.

    Each chart is saved as ``charts/<title>.png``.

    Args:
        bert_results: Mapping of category count -> dict holding the keys
            ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1_score'``.
    """
    categories = list(bert_results.keys())

    # (result key, y-axis label, chart title, line colour)
    chart_specs = [
        ('accuracy', 'Accuracy', 'BERT Model Performace Accuracy', 'green'),
        ('precision', 'Precision', 'BERT Model Performace Precision', 'orange'),
        ('recall', 'Recall', 'BERT Model Performace Recall', 'purple'),
        ('f1_score', 'F1 Score', 'BERT Model Performace F1 Score', 'red'),
    ]

    # Collect every series up front so a missing key fails before anything is drawn
    series = {
        spec[0]: [bert_results[n][spec[0]] for n in categories]
        for spec in chart_specs
    }

    for key, ylabel, title, color in chart_specs:
        plt.figure(figsize=(8, 5))
        plt.plot(categories, series[key], marker='o', linestyle='-', color=color)
        plt.title(title)
        plt.xlabel('Number of Web Serivce Categories')
        plt.ylabel(ylabel)
        plt.grid(True)
        plt.xticks(categories)
        plt.tight_layout()
        plt.savefig(f"charts/{title}.png")
        plt.show()

In [5]:
# === Step 3: Metric Function ===
def compute_metrics(eval_pred):
    """Compute accuracy plus weighted precision, recall and F1 from logits.

    Args:
        eval_pred: Pair ``(logits, labels)``; predictions are taken as the
            argmax of ``logits`` along axis 1.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    scores = {"accuracy": accuracy_score(labels, preds)}
    weighted_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in weighted_scorers:
        scores[metric_name] = scorer(labels, preds, average='weighted', zero_division=0)
    return scores

## 6. PEFT + LoRA Setup

In [6]:
# === Step 4: PEFT + LoRA Setup ===
def get_peft_model_for_classification(num_labels):
    """Load the base model in 4-bit and wrap it with a LoRA adapter for classification.

    The model named by the module-level ``model_name`` is loaded through
    ``AutoModelForSequenceClassification`` with an NF4, double-quantized
    bitsandbytes config, prepared for k-bit training and then wrapped by
    ``get_peft_model`` using a ``SEQ_CLS`` LoRA config.

    Args:
        num_labels: Number of output classes for the classification head.

    Returns:
        The model returned by ``get_peft_model``.
    """
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    load_options = dict(
        num_labels=num_labels,
        quantization_config=quantization,
        device_map="auto",
        trust_remote_code=True,
    )
    base_model = AutoModelForSequenceClassification.from_pretrained(model_name, **load_options)
    # The model config must carry the same pad id the tokenizer pads with
    base_model.config.pad_token_id = tokenizer.pad_token_id

    base_model.gradient_checkpointing_enable()
    base_model = prepare_model_for_kbit_training(base_model)

    lora_settings = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
    )
    return get_peft_model(base_model, lora_settings)

## 7. Training Function

In [7]:
# === Step 5: Training Function ===
def train_lora_classifier(n):
    """Fine-tune the LoRA classifier on the top-``n`` dataset and evaluate it.

    Loads the data with ``load_data``, splits it 80/20 (seed 42), tokenizes
    it, trains for three epochs with the Hugging Face ``Trainer``, predicts on
    the held-out split, writes the confusion matrix and report through
    ``plot_confusion_and_report`` and saves the model and tokenizer to
    ``./web_sevice_trained_deepseek_model``.

    Args:
        n: Number of categories; selects the dataset file and names the
            training output directory.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1_score`` measured on the held-out split.
    """
    df, class_names = load_data(n)
    num_labels = len(class_names)

    dataset = Dataset.from_pandas(df[['text', 'label']])
    dataset = dataset.train_test_split(test_size=0.2, seed=42)
    tokenized = dataset.map(tokenize_function, batched=True)
    tokenized = tokenized.remove_columns(["text"])

    model = get_peft_model_for_classification(num_labels)

    training_args = TrainingArguments(
        output_dir=f"./results_lora_deepseek_top_{n}",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=2e-4,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_steps=10,
        save_total_limit=2,
        load_best_model_at_end=True,
        report_to="none",
        fp16=True
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Predict on the held-out split of the dataset tokenized above; the
    # variable holding it is `tokenized` (no `tokenized_dataset` exists here).
    preds = trainer.predict(tokenized["test"])
    y_true = preds.label_ids
    y_pred = np.argmax(preds.predictions, axis=1)

    # Metrics and reports
    metrics = compute_metrics((preds.predictions, y_true))
    plot_confusion_and_report(y_true, y_pred, class_names, n)

    # Note: the save directory does not depend on n, so every run writes to
    # the same folder.
    model_dir = "./web_sevice_trained_deepseek_model"
    os.makedirs(model_dir, exist_ok=True)

    # Save model and tokenizer
    trainer.save_model(model_dir)
    tokenizer.save_pretrained(model_dir)

    return {
        "accuracy": metrics["accuracy"],
        "precision": metrics["precision"],
        "recall": metrics["recall"],
        "f1_score": metrics["f1"]
    }

## 8. Call Train

In [8]:
import torch
# Train and evaluate on the 50-category dataset (load_data reads
# Balanced_Top_50_Web_Services.csv), then print the returned metric dict.
result = train_lora_classifier(50)
print(result)

Map:   0%|          | 0/7995 [00:00<?, ? examples/s]

Map:   0%|          | 0/1999 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/584 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-llm-7b-base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6031,0.777261,0.790895,0.795309,0.790895,0.782394
2,0.3589,0.612692,0.841421,0.839412,0.841421,0.836517
3,0.2191,0.556281,0.861931,0.860803,0.861931,0.859497


{'eval_loss': 0.5562814474105835, 'eval_accuracy': 0.8619309654827414, 'eval_precision': 0.8608030506633698, 'eval_recall': 0.8619309654827414, 'eval_f1': 0.8594967739515581, 'eval_runtime': 369.7967, 'eval_samples_per_second': 5.406, 'eval_steps_per_second': 2.704, 'epoch': 3.0}
