In [3]:
#IMPORT LIBRARIES

import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import symspellpy
import torch
import transformers
import wordfreq
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [None]:
#PREPROCESSING

# NLTK resources used below: the stop-word list, WordNet data for the
# lemmatizer, and the English POS-tagger model. Each one is requested once.
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

df = pd.read_csv('train.csv')
# Rows without text or without a category cannot be used for training.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
df = df.dropna(subset=['Text', 'Category'])

stop_words = set(nltk.corpus.stopwords.words('english'))
# "not" is removed from the stop-word set, so it survives preprocessing.
stop_words.discard("not")
lemmatizer = nltk.stem.WordNetLemmatizer()
common_words = wordfreq.top_n_list('en', 400000)

# Spelling dictionary: one entry per common word, with the word frequency
# scaled to an integer count.
sym_spell = symspellpy.SymSpell(max_dictionary_edit_distance=3)
for word in common_words:
    # NOTE(review): int() truncates the scaled frequency, so every word whose
    # frequency is below 1e-5 gets a count of 0. Confirm how SymSpell treats
    # zero-count entries; if it skips them, most of the 400000 words never
    # reach the dictionary.
    sym_spell.create_dictionary_entry(word, int(wordfreq.word_frequency(word, "en") * 100000))


def correct_word(word):
    """Return the spelling-corrected form of a single token.

    The token is looked up in the module-level ``sym_spell`` dictionary using
    the CLOSEST verbosity and an edit distance of at most 3.

    Args:
        word: Token to correct; it may contain the '.' placeholder.

    Returns:
        The term of the first suggestion when the lookup yields any;
        otherwise ``word`` with every '.' replaced by 'e'.
    """
    candidates = sym_spell.lookup(word, symspellpy.Verbosity.CLOSEST, max_edit_distance=3)
    if not candidates:
        # Nothing close enough in the dictionary: resolve placeholders only.
        return word.replace('.', 'e')
    return candidates[0].term


def get_pos_tag(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected: 'V' maps to VERB,
    'J' to ADJ, 'R' to ADV, and anything else to NOUN.

    Args:
        tag: Non-empty POS tag string (indexing an empty string raises
            IndexError).

    Returns:
        One of the ``nltk.corpus.wordnet`` POS constants.
    """
    wordnet = nltk.corpus.wordnet
    initial = tag[0]
    if initial == 'V':
        return wordnet.VERB
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'R':
        return wordnet.ADV
    # Every other tag is treated as a noun.
    return wordnet.NOUN

def match_word(patterned_word):
    """Resolve a token containing '.' wildcards against ``common_words``.

    ``patterned_word`` is used as a regular expression anchored at both ends,
    so each '.' stands for exactly one arbitrary character.

    Args:
        patterned_word: Token to resolve; interpreted as a regex pattern.

    Returns:
        The first entry of ``common_words`` that has the same length as the
        pattern and matches it. When no entry qualifies, the pattern itself
        with every '.' replaced by 'e'.
    """
    regex = re.compile('^' + patterned_word + '$')
    wanted_length = len(patterned_word)
    hit = next(
        (
            candidate
            for candidate in common_words
            if len(candidate) == wanted_length and regex.match(candidate)
        ),
        None,
    )
    if hit is None:
        return patterned_word.replace('.', 'e')
    return hit

def preprocess(text):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase and trim;
      2. drop URL-like tokens (any token containing "http", "www" or
         "linkedin");
      3. delete every character that is not a-z or whitespace;
      4. replace the literal text "claire" with the '.' placeholder that
         correct_word() resolves;
      5. spell-correct each token with correct_word();
      6. POS-tag the tokens, drop stop words, and lemmatise the rest using
         the tag mapped through get_pos_tag().

    Args:
        text: Raw document text (str).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = text.lower().strip()
    # "http" (not "https") so that plain http:// links are removed as well;
    # otherwise they survive as junk tokens once punctuation is stripped.
    text = re.sub(r'\b\S*(http|www|linkedin)\S*\b', '', text).strip()
    # Characters are deleted, not replaced by a space, so words joined by
    # punctuation are merged into one token.
    text = re.sub(r"[^a-z\s]", '', text)
    # Must run after the character filter, which would otherwise strip '.'.
    text = text.replace('claire', '.')
    corrected = [correct_word(token) for token in text.split()]
    tagged = nltk.pos_tag(corrected)
    # Stop words are filtered on the corrected token, before lemmatisation.
    lemmas = [
        lemmatizer.lemmatize(token, pos=get_pos_tag(tag))
        for token, tag in tagged
        if token not in stop_words
    ]
    return " ".join(lemmas)

# Clean every document; this overwrites the raw 'Text' column.
df['Text'] = df['Text'].apply(preprocess)

# Number the categories in case-insensitive alphabetical order. This makes the
# ids deterministic (independent of row order in train.csv) and reproduces the
# ordering of the hard-coded id2label table in the LOAD DATA cell.
categories = sorted(df['Category'].unique(), key=str.lower)
label2id = {label: idx for idx, label in enumerate(categories)}
id2label = {v: k for k, v in label2id.items()}
df['label'] = df['Category'].map(label2id)

print("Preprocessing done")

df = df[['label', 'Text']].reset_index(drop=True)
# index=False: the positional index carries no information and would come back
# as an extra unnamed column when the file is read again.
df.to_csv("Processed_train.csv", index=False)

In [4]:
#LOAD DATA

# Fixed id -> category table (case-insensitive alphabetical order), so this
# cell can run without re-running the preprocessing cell.
id2label = {0: 'Accountant', 1: 'Advocate', 2: 'Agriculture', 3: 'Apparel', 4: 'Architecture', 5: 'Arts', 6: 'Automobile', 7: 'Aviation', 8: 'Banking', 9: 'Blockchain', 10: 'BPO', 11: 'Building and Construction', 12: 'Business Analyst', 13: 'Civil Engineer', 14: 'Consultant', 15: 'Data Science', 16: 'Database', 17: 'Designing', 18: 'DevOps', 19: 'Digital Media', 20: 'DotNet Developer', 21: 'Education', 22: 'Electrical Engineering', 23: 'ETL Developer', 24: 'Finance', 25: 'Food and Beverages', 26: 'Health and Fitness', 27: 'Human Resources', 28: 'Information Technology', 29: 'Java Developer', 30: 'Management', 31: 'Mechanical Engineer', 32: 'Network Security Engineer', 33: 'Operations Manager', 34: 'PMO', 35: 'Public Relations', 36: 'Python Developer', 37: 'React Developer', 38: 'Sales', 39: 'SAP Developer', 40: 'SQL Developer', 41: 'Testing', 42: 'Web Designing'}
label2id = {v: k for k, v in id2label.items()}
df = pd.read_csv("Processed_train.csv")
# A document that preprocessing reduced to an empty string is read back by
# pandas as NaN (a float) rather than ''. Restore it so the tokenizer only
# ever receives strings. The row count, and therefore the split, is unchanged.
df['Text'] = df['Text'].fillna('')


# 80/20 split with a fixed seed so the validation set is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Text'].to_list(), df['label'].to_list(), test_size=0.2, random_state=100
)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both splits are truncated to the same maximum sequence length.
max_length = 128
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length)

class TextDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping with 'input_ids' and 'attention_mask' entries,
            each indexable by example position.
        labels: Sequence of labels, one per example.
    """

    # Encoding fields copied into every item, in output order.
    _FEATURE_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Keys are 'input_ids', 'attention_mask' and 'labels'.
        """
        item = {key: torch.tensor(self.encodings[key][idx]) for key in self._FEATURE_KEYS}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split so it can be consumed by a DataLoader.
train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TextDataset(encodings=val_encodings, labels=val_labels)

batch_size = 16

# Both loaders share the batch size; only the training loader shuffles.
loader_kwargs = {'batch_size': batch_size}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)


In [6]:
#LOAD MODEL

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Pretrained DistilBERT encoder with a classification head sized to the
# number of categories.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=len(id2label)
)
# Restore the fine-tuned weights written by the SAVE MODEL cell.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved from a CUDA session also loads on a CPU-only machine.
model.load_state_dict(torch.load("bert_weights2.pt", map_location=device))
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
#TRAIN MODEL

learning_rate = 1e-6
epochs = 2

optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay = 0.07)
# The scheduler is stepped once per batch, so the schedule spans every batch
# of every epoch.
num_training_steps = epochs * len(train_loader)
# Linear schedule with no warm-up steps.
lr_scheduler = transformers.get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# One average loss per epoch; plotted by the evaluation cell.
train_losses, val_losses = [], []

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}\n\n")

    # ---- training pass ----
    model.train()
    # prev_loss remembers the running total at the last progress report, so
    # the report can show the mean of the most recent 100 batches.
    total_loss = prev_loss = 0

    for batch_no, batch in enumerate(train_loader, start=1):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**batch).loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        if batch_no % 100 == 0:
            print(f"Batch {batch_no} / {len(train_loader)} : Train loss = {(total_loss - prev_loss)/100}")
            prev_loss = total_loss

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"Train Loss: {avg_train_loss:.4f}")

    # ---- validation pass ----
    model.eval()
    total_val_loss = 0
    # Reset every epoch: after the loop these hold the last epoch's
    # predictions, which the evaluation cell reports on.
    preds, true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            total_val_loss += outputs.loss.item()
            # Predicted class = index of the largest logit in each row.
            preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    print(f"Val Loss: {avg_val_loss:.4f}")


In [None]:
#SAVE MODEL

# Persist only the parameters (state dict); the LOAD MODEL cell rebuilds the
# architecture and loads this file back in.
weights_path = 'bert_weights2.pt'
torch.save(model.state_dict(), weights_path)

In [None]:
#EVALUATE MODEL

print("\nClassification Report:")
# Pass the full list of class ids explicitly. Without `labels`, the report
# infers the classes from the data and raises when a rare class is missing
# from the validation split, because the count no longer matches target_names.
class_ids = sorted(id2label)
print(classification_report(
    true_labels,
    preds,
    labels=class_ids,
    target_names=[id2label[i] for i in class_ids],
    zero_division=0,  # report 0 rather than warn for classes never predicted
))

# Per-epoch training and validation loss collected by the training cell.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(train_losses, label="Training Loss")
ax.plot(val_losses, label="Validation Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training/Validation Loss over Epochs")
ax.legend()
ax.grid(True)
plt.show()


In [8]:
#TEST MODEL

# Score the currently loaded model on the validation split, without training.
model.eval()
total_val_loss = 0
preds, true_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        logits = outputs.logits
        total_val_loss += loss.item()
        # Predicted class = index of the largest logit in each row.
        preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(batch['labels'].cpu().numpy())

# Mean of the per-batch losses.
avg_val_loss = total_val_loss / len(val_loader)

# Pass the full list of class ids explicitly. Without `labels`, the report
# infers the classes from the data and raises when a rare class is missing
# from the validation split, because the count no longer matches target_names.
class_ids = sorted(id2label)
print(classification_report(
    true_labels,
    preds,
    labels=class_ids,
    target_names=[id2label[i] for i in class_ids],
    zero_division=0,  # report 0 rather than warn for classes never predicted
))

                           precision    recall  f1-score   support

               Accountant       0.81      0.95      0.87        65
                 Advocate       0.84      0.89      0.87        47
              Agriculture       0.80      0.73      0.76        64
                  Apparel       0.64      0.73      0.68        48
             Architecture       0.74      0.66      0.70        77
                     Arts       0.81      0.88      0.85        69
               Automobile       0.57      0.46      0.51        65
                 Aviation       0.92      0.92      0.92        64
                  Banking       0.81      0.81      0.81        53
               Blockchain       1.00      0.75      0.86         4
                      BPO       0.92      0.70      0.80        47
Building and Construction       0.72      0.78      0.75        73
         Business Analyst       0.87      0.92      0.89        77
           Civil Engineer       0.98      0.91      0.94     