	Milestone 1 : Data Preparation & Annotation
○	Objective: Collect and prepare a clean, annotated dataset.
○	Tasks: Collect a diverse set of historical ticket data;
            clean and normalize text; 
            manually annotate a portion of the data for training.


In [None]:
%pip install torch --quiet

import pandas as pd
import re
import torch 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

ModuleNotFoundError: No module named 'torch'

In [None]:
# --- STEP 1: LOAD DATA ---
# We use the raw IT tickets CSV containing ~47k records.
# The path must be a raw string: in a normal literal the "\a" in
# "\all_tickets..." is the BEL escape (and "\A" / "\K" are invalid escapes),
# which corrupts the Windows path and makes read_csv fail to find the file.
df = pd.read_csv(r'D:\AI-Powered Ticket Creation & Categorization\Kaggle Dataset\all_tickets_processed_improved_v3.csv')
print(df.head())  # Quick visual check of the first rows
# --- STEP 2: LIGHT CLEANING ---
# BERT needs sentence structure, so we only remove "noise" like URLs/Emails
def bert_cleaning(text):
    """Lightly normalize a ticket's text while keeping sentence structure.

    Lowercases the input, strips HTML tags, e-mail addresses and URLs, drops
    every character other than ``a-z``, ``0-9``, ``!``, ``?``, ``.`` and
    whitespace, then collapses whitespace runs into single spaces.

    Args:
        text: Raw ticket text. Any non-string value (e.g. NaN) yields "".

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)        # Remove HTML tags
    text = re.sub(r'\S+@\S+', '', text)        # Remove e-mail addresses
    text = re.sub(r'http\S+', '', text)        # Remove URLs
    # Keep basic punctuation (!?.). Whitespace of any kind must survive this
    # step: deleting newlines/tabs here would glue the neighbouring words
    # together before the collapse below can turn them into a single space.
    text = re.sub(r'[^a-z0-9!?.\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Run every raw ticket body through the light cleaner.
raw_documents = df['Document']
df['clean_text'] = raw_documents.apply(bert_cleaning)

# --- STEP 3: LABEL ENCODING ---
# Map each topic name ('Hardware', 'Access', etc.) to an integer class id
topic_categories = df['Topic_group'].astype('category')
df['label'] = topic_categories.cat.codes
# Number of distinct class ids present in the data
num_labels = df['label'].nunique()

# --- STEP 4: CLASS WEIGHTS (THE ACCURACY BOOSTER) ---
# We calculate weights so the model pays more attention to small categories
y_labels = df['label'].values
weights = compute_class_weight(class_weight='balanced',
                               classes=np.unique(y_labels),
                               y=y_labels)
# Put the weights on the GPU when one is available; fall back to the CPU so
# the notebook still runs on machines without CUDA instead of raising.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights = torch.tensor(weights, dtype=torch.float).to(device)

# --- STEP 5: SPLIT DATA ---
# Stratify on the label so the small categories keep the same share in the
# train and test sets; a purely random split can under-represent them.
train_df, test_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df['label'],
)
print(df.head())  # Now also shows the added 'clean_text' and 'label' columns