	Milestone 2 : Core Model Development
○	Objective: Train and validate the NLP and NER models.
○	Tasks: Select appropriate models (e.g., scikit-learn classifiers,
           spaCy for NER);
           train the models on the annotated dataset; evaluate initial model accuracy.


In [2]:
import torch.nn as nn
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer, DataCollatorWithPadding)



ModuleNotFoundError: No module named 'datasets'

In [None]:
# --- STEP 6: TOKENIZATION ---
# Load the tokenizer for the "distilbert-base-uncased" checkpoint so ticket
# text can be converted into the token-ID format the model consumes.
# This single `tokenizer` object is reused by the later cells in this section
# (dataset mapping, Trainer construction, saving, and ensemble inference).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_fn(batch):
    """Tokenize the ``clean_text`` entries of one batch.

    Every sequence is truncated and padded to exactly 128 tokens. Uses the
    module-level ``tokenizer``.

    Args:
        batch: Mapping that provides the texts under the ``"clean_text"`` key.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = batch["clean_text"]
    encoded = tokenizer(texts, max_length=128, padding="max_length", truncation=True)
    return encoded

# Wrap the text/label columns of each split in a Dataset, then tokenize the
# whole split in batches with tokenize_fn.
train_ds = Dataset.from_pandas(train_df[['clean_text', 'label']])
train_ds = train_ds.map(tokenize_fn, batched=True)

test_ds = Dataset.from_pandas(test_df[['clean_text', 'label']])
test_ds = test_ds.map(tokenize_fn, batched=True)



In [None]:
# --- STEP 7: CUSTOM WEIGHTED TRAINER ---
# This forces the model to learn the 3.7% minority class by increasing its error penalty
class WeightedTrainer(Trainer):
    """Trainer whose training loss is class-weighted cross-entropy.

    The per-class weights are read from the module-level ``class_weights``
    (calculated in Milestone 1, outside this section), so errors on rare
    classes are penalised more heavily than errors on frequent ones.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run; called as ``model(**inputs)``.
            inputs: Batch dictionary; its ``"labels"`` entry holds the targets.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted so the signature matches what the
                Trainer passes in; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        # Local import: only `torch.nn` is imported at the top of this section.
        import torch

        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Use the class_weights we calculated in Milestone 1.
        # CrossEntropyLoss needs the weight tensor on the same device (and in
        # the same dtype) as the logits; class_weights may have been built on
        # the CPU or as a float64/NumPy array, so align it here. This is a
        # no-op when it already matches.
        weights = torch.as_tensor(class_weights, dtype=logits.dtype, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        # Flatten to (N, num_labels) vs (N,) as CrossEntropyLoss expects.
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
from sklearn.metrics import accuracy_score, f1_score

# --- STEP 7.5: DEFINE THE EVALUATION METRIC ---
# This function tells the Trainer how to calculate accuracy
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with ``'accuracy'`` and the support-weighted ``'f1'``. The
        Trainer reports these with an ``eval_`` prefix.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the position of the largest logit on the last axis.
    predicted = np.argmax(raw_scores, axis=-1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(gold, predicted)
    metrics['f1'] = f1_score(gold, predicted, average='weighted')
    return metrics



In [None]:
# --- STEP 8: UPDATED TRAINING ARGUMENTS ---
# Pretrained DistilBERT with a sequence-classification head sized to
# `num_labels` (defined outside this section).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
)

args = TrainingArguments(
    # Output location; reporting integrations are switched off.
    output_dir="weighted_bert_final",
    report_to="none",
    # Optimisation settings.
    num_train_epochs=4,
    per_device_train_batch_size=32,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    # Evaluate and checkpoint once per epoch so the best epoch can be
    # restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # compute_metrics returns 'accuracy'; the Trainer reports it as
    # 'eval_accuracy', which is the name used to pick the best checkpoint.
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
)



In [None]:
# --- STEP 9: UPDATED TRAINER ---
# Assemble the class-weighted trainer from the pieces built in earlier cells.
# NOTE(review): test_ds is used for per-epoch evaluation and therefore for
# best-checkpoint selection — confirm that a separate validation split is not
# needed for the reported numbers.
trainer = WeightedTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    # Required: without it no 'eval_accuracy' exists for
    # args.metric_for_best_model to track.
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)

print("Starting Training with Metrics...")
trainer.train()

In [None]:
import shutil

# Persist the model currently held by the trainer. The training arguments set
# load_best_model_at_end=True, so this is the best checkpoint by the tracked
# metric (eval_accuracy), whichever epoch that turned out to be.
model_path = "champion_weighted_bert"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# Zip it for download
shutil.make_archive('it_ticket_final_bert', 'zip', model_path)

# Report the score this run actually achieved rather than a hard-coded figure,
# so the message stays truthful after retraining. best_metric is the best
# value of metric_for_best_model seen during training (a 0-1 accuracy here);
# it is None if no evaluation was recorded.
best_metric = trainer.state.best_metric
if best_metric is not None:
    print(f"SUCCESS: Your {best_metric:.1%} Accuracy model is zipped and ready in the Output tab!")
else:
    print("SUCCESS: Your model is zipped and ready in the Output tab!")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import joblib

# 1. Vectorize text using TF-IDF
# Unigrams and bigrams, capped at 5000 features to keep it efficient on Kaggle.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

print("Vectorizing data for SVM...")
# Fit the vocabulary on the training split only; the test split is transformed
# with that same fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['clean_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['clean_text'])

# 2. Train the SVM
# probability=True is ESSENTIAL for the Soft Voting ensemble (it enables
# predict_proba). The probability calibration shuffles data internally, so
# random_state is fixed to make the saved model's probabilities reproducible
# from run to run.
print("Training SVM Specialist (probability=True)...")
svm_model = SVC(kernel='linear', probability=True, class_weight='balanced', random_state=42)
svm_model.fit(X_train_tfidf, train_df['label'])

# 3. Quick Accuracy Check for SVM
# score() is plain accuracy from predict(); the ensemble uses predict_proba().
svm_acc = svm_model.score(X_test_tfidf, test_df['label'])
print(f"✅ SVM Specialist Accuracy: {svm_acc:.4f}")

# 4. Save for the Ensemble & Milestone 3
joblib.dump(svm_model, 'svm_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

In [None]:
import numpy as np
import torch

def ensemble_predict(text):
    """Classify one ticket by soft-voting BERT and SVM class probabilities.

    Relies on the module-level ``tokenizer``, ``model``, ``tfidf_vectorizer``
    and ``svm_model``.

    Args:
        text: Ticket text as a single string.

    Returns:
        The index of the largest blended probability (``np.argmax`` result).

    Note:
        The two probability vectors are added position by position, so this
        assumes the SVM's class order matches the BERT label ids — TODO
        confirm against ``svm_model.classes_``.
    """
    # --- PART A: BERT CONFIDENCE ---
    # Make sure dropout is disabled; the model may have been left in training
    # mode by an earlier cell.
    model.eval()
    # Send the inputs to whatever device the model lives on instead of
    # hard-coding "cuda", so this also works on CPU-only sessions.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        bert_logits = model(**inputs).logits
    # Single input, so take row 0 of the (1, num_classes) probability matrix.
    bert_probs = torch.softmax(bert_logits, dim=1).cpu().numpy()[0]

    # --- PART B: SVM CONFIDENCE ---
    tfidf_feat = tfidf_vectorizer.transform([text])
    svm_probs = svm_model.predict_proba(tfidf_feat)[0]

    # --- PART C: SOFT VOTING (WEIGHTED) ---
    # Give BERT (the deep learner) 60% and SVM (the keyword expert) 40%
    final_probs = (0.6 * bert_probs) + (0.4 * svm_probs)
    return np.argmax(final_probs)

# Try the ensemble on a single hand-written ticket
sample = "I need access to the internal storage folder for the new project."
print("Ensemble Result: {}".format(ensemble_predict(sample)))

In [None]:
import spacy
import re

# 1. Load a lightweight NER model (spaCy)
# Run !python -m spacy download en_core_web_sm in a cell first
# `nlp` is the shared pipeline that extract_entities_and_priority() calls on
# each ticket text.
nlp = spacy.load("en_core_web_sm")

def extract_entities_and_priority(text, category):
    """Extract ticket entities and assign a priority level.

    Uses the module-level spaCy pipeline ``nlp`` for person names and regular
    expressions for machine names and error codes.

    Args:
        text: Ticket text.
        category: Ticket category name (e.g. ``"Hardware"``).

    Returns:
        ``(entities, priority)``. ``entities`` is a dict with the keys
        ``"usernames"``, ``"machine_names"`` and ``"error_codes"``, each a
        list of strings. ``priority`` is ``"High"``, ``"Medium"`` or
        ``"Low"``.
    """
    # --- NER SECTION ---
    doc = nlp(text)
    entities = {
        "usernames": [ent.text for ent in doc.ents if ent.label_ == "PERSON"],
        "machine_names": re.findall(r'[A-Z0-9]+(?:-[A-Z0-9]+)+', text), # Finds strings like LAPTOP-X1
        "error_codes": re.findall(r'0x[0-9a-fA-F]+|\b\d{3,4}\b', text)    # Finds 0x8004 or 404
    }

    # --- PRIORITY & SENTIMENT SECTION ---
    # Keywords that indicate urgency
    urgent_keywords = ['emergency', 'urgent', 'immediately', 'stopped', 'broken', 'dead', 'critical']
    # Match keywords as whole words (optionally with an "-ly" ending, e.g.
    # "urgently") rather than as raw substrings, so that "deadline" is not
    # read as "dead". The text is lower-cased once instead of per keyword.
    urgent_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in urgent_keywords) + r')(?:ly)?\b'
    is_urgent = re.search(urgent_pattern, text.lower()) is not None

    # Priority Logic based on User Intent (Context)
    if is_urgent or category in ['Access', 'Security']:
        priority = "High"
    elif category in ['Hardware', 'Storage']:
        priority = "Medium"
    else:
        priority = "Low"

    return entities, priority

# --- TEST IT OUT ---
# Hand-written ticket; the category is supplied by hand here and would
# normally come from the classifier.
sample_category = "Hardware"
sample_text = "Urgent: User Adison is reporting Error 0x8004 on LAPTOP-P12. Screen is dead."

entities, priority = extract_entities_and_priority(sample_text, sample_category)

print("Entities Found:", entities)
print("Assigned Priority:", priority)

In [None]:
import shutil
import joblib

# 1. Save the SVM files (written to the working directory)
joblib.dump(svm_model, 'svm_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# 2. Save the BERT model currently held by the trainer
trainer.save_model("final_bert_model")
tokenizer.save_pretrained("final_bert_model")

# 3. Zip everything into one download
# Stage only the Milestone 2 artifacts in their own folder and archive that.
# Archiving './' directly would sweep in every training checkpoint and any
# earlier zip, and would write the archive into the very directory it is
# archiving.
export_dir = 'milestone_2_export'
shutil.rmtree(export_dir, ignore_errors=True)  # start clean on re-runs
# copytree creates export_dir (and the sub-folder) for us.
shutil.copytree("final_bert_model", f"{export_dir}/final_bert_model")
for artifact in ('svm_model.pkl', 'tfidf_vectorizer.pkl'):
    shutil.copy2(artifact, export_dir)

# This creates 'milestone_2_final.zip' with the two .pkl files and the
# final_bert_model/ folder at its top level.
shutil.make_archive('milestone_2_final', 'zip', root_dir=export_dir)
shutil.rmtree(export_dir, ignore_errors=True)  # staging copy no longer needed

print("✅ ALL MODELS SAVED! Download 'milestone_2_final.zip' from the Output tab.")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Count how many rows fall under each Topic_group value
class_counts = df['Topic_group'].value_counts()

# One wedge per category, labelled with its share of the tickets
plt.figure(figsize=(10, 7))
plt.pie(
    class_counts,
    labels=class_counts.index,
    colors=sns.color_palette('pastel'),
    startangle=140,
    autopct='%1.1f%%',
)
plt.axis('equal')
plt.title('Distribution of IT Ticket Categories')
plt.show()

In [None]:
# Generate predictions for your test set
# Only the first 100 test rows are scored, for speed; ensemble_predict runs
# one ticket at a time.
y_pred = [ensemble_predict(txt) for txt in test_df['clean_text'].iloc[:100]]
y_true = test_df['label'].iloc[:100]

# Pin the label set so the matrix always has one row/column per category.
# Without it, a class that is absent from this 100-row sample would shrink the
# matrix and the `categories` tick labels would no longer line up with it.
# Assumes label ids are 0..len(categories)-1 in the same order as `categories`
# (the tick labels below already rely on that) — TODO confirm.
cm = confusion_matrix(y_true, y_pred, labels=list(range(len(categories))))
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=categories, yticklabels=categories)
plt.xlabel('Predicted Category')
plt.ylabel('Actual Category')
plt.title('Final Ensemble Confusion Matrix')
plt.show()