In [1]:
#IMPORT LIBRARIES

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection
import torch
from torch.utils.data import Dataset, DataLoader
import sklearn
import nltk
import re
import collections
import itertools
import wordfreq
import symspellpy.symspellpy as symspellpy

In [2]:
#DEFINE WORD EMBEDDING MODULE

class Word_Embeddings(torch.nn.Module):
    """Token embeddings plus fixed sinusoidal positional encodings.

    The module pads a list of variable-length token-id tensors into one batch,
    embeds it, adds the positional encoding to every real (non-padded) token
    and returns a boolean mask that flags the padded positions.

    Args:
        embed_dim: Size of each embedding vector.
        n_words: Vocabulary size; index 0 is the padding index.
        max_len: Number of positions for which encodings are precomputed.
        device: Kept for backward compatibility only. The positional table is
            registered as a buffer, so it follows the module through
            ``.to(...)`` / ``.cuda()`` instead of being pinned to this device.
    """

    def __init__(self, embed_dim, n_words, max_len = 6400, device = "cuda"):
        super().__init__()

        self.embed_dim = embed_dim
        self.embedding_layer = torch.nn.Embedding(n_words, embed_dim, padding_idx = 0)

        # Build the table on the default (CPU) device so that construction
        # works on machines without CUDA.
        pe_div_term = torch.exp(-torch.log(torch.tensor(10000.0)) * torch.arange(0, embed_dim, 2) / embed_dim)
        inside_term = torch.arange(0, max_len).unsqueeze(1) * pe_div_term
        positional_encoding = torch.zeros(max_len, embed_dim)
        positional_encoding[:, 0::2] = torch.sin(inside_term)
        # For an odd embed_dim there is one fewer cosine column than sine columns.
        positional_encoding[:, 1::2] = torch.cos(inside_term[:, :embed_dim // 2])

        # Non-persistent: the buffer moves with the module but is not written
        # to / expected in the state_dict, so existing checkpoints still load.
        self.register_buffer("positional_encoding", positional_encoding, persistent = False)

    def forward(self, inputs):
        """Embed a batch of token-id sequences.

        Args:
            inputs: Sequence of 1-D integer tensors, one per sample.

        Returns:
            Tuple ``(embeddings, attn_mask)`` where ``embeddings`` has shape
            ``(batch, longest_sequence, embed_dim)`` and ``attn_mask`` is a
            boolean ``(batch, longest_sequence)`` tensor that is True at
            padded positions.

        Raises:
            RuntimeError: If a sequence is longer than the precomputed table.
        """
        device = self.embedding_layer.weight.device

        seq_lens = [len(sentence) for sentence in inputs]
        max_len = max(seq_lens)
        if max_len > self.positional_encoding.shape[0]:
            raise RuntimeError(
                f"Sequence length {max_len} exceeds the positional encoding table "
                f"size {self.positional_encoding.shape[0]}"
            )

        # Right-pad every sequence with the padding index 0.
        tokens = torch.zeros((len(inputs), max_len), dtype = torch.long, device = device)
        for i, sentence in enumerate(inputs):
            tokens[i, 0:len(sentence)] = sentence.to(device)

        embeddings = self.embedding_layer(tokens)

        # True wherever the position index is at or beyond the sample's length.
        lengths = torch.tensor(seq_lens, dtype = torch.long, device = device)
        attn_mask = torch.arange(max_len, device = device).unsqueeze(0) >= lengths.unsqueeze(1)

        # Add positions to real tokens only; padded rows receive + 0.
        positions = self.positional_encoding[:max_len].to(device)
        embeddings = embeddings + positions.unsqueeze(0) * (~attn_mask).unsqueeze(-1)

        return embeddings, attn_mask


In [3]:
#DEFINE ATTENTION MODULE

class Multi_Head_Attention_Layer(torch.nn.Module):
    """Multi-head scaled dot-product self-attention with an output projection and LayerNorm.

    Args:
        in_len: Feature size of the input (and of the returned tensor).
        out_len: Per-head size of the value vectors.
        n_heads: Number of attention heads.
        hidden_len: Per-head size of the key and query vectors.
        cls_only: When True only the first sequence position is used as a
            query, so the layer returns one vector per sample.
    """

    def __init__(self, in_len, out_len, n_heads, hidden_len, cls_only = False):
        super().__init__()

        self.in_len, self.out_len = in_len, out_len
        self.n_heads, self.hidden_len = n_heads, hidden_len
        self.cls_only = cls_only

        key_query_width = n_heads * hidden_len
        value_width = n_heads * out_len

        # Attribute names and creation order are kept as-is so the parameter
        # registration order is unchanged.
        self.key_proj = torch.nn.Linear(in_len, key_query_width)
        self.query_proj = torch.nn.Linear(in_len, key_query_width)
        self.value_proj = torch.nn.Linear(in_len, value_width)

        self.linear_transform = torch.nn.Linear(value_width, in_len)
        self.layer_norm = torch.nn.LayerNorm(in_len)

    def _split_heads(self, projected, per_head):
        """Reshape (batch, seq, heads * per_head) into (batch, heads, seq, per_head)."""
        n_batch, n_tokens = projected.shape[0:2]
        return projected.view(n_batch, n_tokens, self.n_heads, per_head).transpose(1, 2)

    def forward(self, inputs, attention_mask = None):
        """Apply self-attention to ``inputs``.

        Args:
            inputs: Tensor whose first two dimensions are batch and sequence.
            attention_mask: Optional boolean tensor; True entries are excluded
                as attention keys. It is broadcast over heads and queries, so
                it is expected to be (batch, seq_len).

        Returns:
            ``(batch, seq_len, in_len)`` tensor, or ``(batch, in_len)`` when
            ``cls_only`` is set.
        """
        k = self._split_heads(self.key_proj(inputs), self.hidden_len)
        q = self._split_heads(self.query_proj(inputs), self.hidden_len)
        v = self._split_heads(self.value_proj(inputs), self.out_len)

        if self.cls_only:
            # Keep the singleton sequence axis so the matmuls below still broadcast.
            q = q[:, :, 0:1, :]

        scores = torch.matmul(q, k.transpose(-1, -2)) / self.hidden_len**0.5

        if attention_mask is not None:
            key_padding = attention_mask.unsqueeze(1).unsqueeze(2)
            scores = scores.masked_fill(key_padding, float('-inf'))

        weights = torch.softmax(scores, dim = -1)
        context = torch.matmul(weights, v).permute(0, 2, 1, 3)

        # Concatenate the heads back into a single feature axis.
        merged_width = self.n_heads * self.out_len
        if self.cls_only:
            context = context.reshape(context.shape[0], merged_width)
        else:
            context = context.reshape(context.shape[0], context.shape[1], merged_width)

        return self.layer_norm(self.linear_transform(context))
        

In [4]:
#DEFINE TRANSFORMER MODULE

class Transformer_Classifier(torch.nn.Module):
    """Two-layer attention classifier that pools through a learned CLS token.

    A learned CLS vector is prepended to the embedded sequence. The first
    attention layer (with a residual connection) updates every position, the
    second one queries from the CLS position only, and a small MLP maps the
    pooled vector to class logits.

    Args:
        n_classes: Number of output logits.
        embedding_dim: Size of token embeddings and of the attention output.
        n_heads: Number of attention heads in both attention layers.
        attention_dim: Per-head value size.
        key_query_dim: Per-head key/query size.
        fc1_dim: Width of the first hidden layer of the classifier head.
        fc2_dim: Width of the second hidden layer of the classifier head.
        n_words: Vocabulary size handed to the embedding module.
        dropout: Dropout probability applied to the pooled vector.
    """

    def __init__(self, n_classes, embedding_dim, n_heads, attention_dim, key_query_dim, fc1_dim, fc2_dim, n_words, dropout = 0):
        super().__init__()

        self.Embedding_model = Word_Embeddings(embedding_dim, n_words)

        self.Layer1 = Multi_Head_Attention_Layer(in_len=embedding_dim, out_len=attention_dim, n_heads=n_heads, hidden_len=key_query_dim)

        self.Layer2 = Multi_Head_Attention_Layer(in_len=embedding_dim, out_len=attention_dim, n_heads=n_heads, hidden_len=key_query_dim, cls_only = True)
        
        self.dropout = torch.nn.Dropout(dropout)

        self.fc = torch.nn.Sequential(torch.nn.Linear(embedding_dim, fc1_dim),
                                      torch.nn.ReLU(),
                                      torch.nn.Linear(fc1_dim, fc2_dim),
                                      torch.nn.ReLU(),
                                      torch.nn.Linear(fc2_dim, n_classes))
        
        # Learned CLS vector, initialised uniformly in [-1, 1).
        self.cls_token = torch.nn.Parameter((2 * torch.rand(1, 1, embedding_dim) - 1))
    
    def forward(self, inputs):
        """Compute class logits.

        Args:
            inputs: Either a list of 1-D token-id tensors (a batch) or a single
                token-id tensor (one sample).

        Returns:
            ``(batch, n_classes)`` logits for a list input, or ``(n_classes,)``
            logits when a single tensor was passed.
        """
        single = not isinstance(inputs, list)
        if single:
            inputs = [inputs]

        embeddings, attention_masks = self.Embedding_model(inputs)

        # Prepend the CLS vector and a matching "not padded" mask column.
        embeddings = torch.cat((self.cls_token.expand(embeddings.shape[0], -1, -1), embeddings), dim = 1)
        attention_masks = torch.cat((torch.zeros(attention_masks.shape[0], 1, dtype = torch.bool, device = attention_masks.device), attention_masks), dim = 1)

        outputs = embeddings + self.Layer1(embeddings, attention_masks)
        outputs = self.Layer2(outputs, attention_masks)
        outputs = self.fc(self.dropout(outputs))

        if single:
            # Layer2 already pooled over the sequence, so the logits are 2-D
            # (batch, n_classes); drop the artificial batch axis only.
            outputs = outputs[0]

        return outputs


In [5]:
#DEFINE DATASET CLASS AND REQUIRED METHODS

class TextDataset(Dataset):
    """Dataset of whitespace-tokenised texts with one-hot encoded labels.

    Args:
        texts: Iterable of strings. Missing entries (None / NaN, e.g. an empty
            text after a CSV round trip) are treated as empty texts.
        labels: pandas Series of integer class ids.
        word2idx: Mapping from word to vocabulary index. Words that are not in
            the mapping are encoded as 0, the same index as "<PAD>".
        n_classes: Length of the one-hot label vectors.
        max_len: Maximum number of tokens returned per sample.
    """

    def __init__(self, texts, labels, word2idx, n_classes, max_len = 1024):
        self.word2idx = word2idx
        self.texts = [self._encode(sentence) for sentence in texts]
        # Each label is stored as a (1, n_classes) tensor so collate_fn can concatenate them.
        self.labels = labels.apply(lambda x : self.one_hot_encode(x, n_classes).unsqueeze(0))
        self.max_len = max_len

    def _encode(self, sentence):
        """Map one text to a list of vocabulary indices (empty list for a missing text)."""
        if not isinstance(sentence, str) and pd.isna(sentence):
            return []
        return [self.word2idx.get(word, 0) for word in sentence.split()]
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        """Return ``(token_ids, one_hot_label)``; token ids are truncated to ``max_len``."""
        return (torch.tensor(self.texts[idx][0:self.max_len], dtype=torch.long), self.labels.iloc[idx])
    
    @classmethod
    def create_word_idx_map(cls, texts, freq_threshold = 10):
        """Build a word -> index vocabulary from a pandas Series of texts.

        Words are runs of ``\\w`` characters. A word receives an index only if
        it occurs at least ``freq_threshold`` times; index 0 is reserved for
        "<PAD>". Missing texts are skipped.
        """
        # .str.findall yields NaN for missing texts; drop those before flattening.
        words_per_text = texts.str.findall(r"\w+").dropna()
        word_counts = collections.Counter(itertools.chain.from_iterable(words_per_text))

        word2idx = {"<PAD>" : 0}
        curr_idx = 1
        for word, freq in word_counts.items():
            if word not in word2idx and freq >= freq_threshold:
                word2idx[word] = curr_idx
                curr_idx += 1
        
        return word2idx

    @classmethod
    def collate_fn(cls, batch):
        """Collate samples into ``(list_of_token_tensors, stacked_labels)``.

        Texts stay a list because their lengths differ; labels are
        concatenated along dimension 0.
        """
        texts, labels = zip(*batch)
        texts = list(texts)
        labels = torch.cat(labels, dim = 0)
        return texts, labels
    
    @classmethod
    def one_hot_encode(cls, label, n_classes):
        """Return a float vector of length ``n_classes`` with a 1 at ``label``."""
        vector = torch.zeros(n_classes)
        vector[label] = 1
        return vector


In [6]:
#DEFINE TRAINING FUNCTIONS

def train_one_epoch(model, loader, loss_fn, optimizer, device = "cpu"):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module called as ``model(texts)``.
        loader: Sized iterable yielding ``(texts, labels)`` batches.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the labels are moved to before computing the loss.

    Returns:
        Mean batch loss over the epoch. The mean loss of the most recent
        100 batches is printed every 100 batches.
    """
    model.train()

    report_every = 100
    n_batches = len(loader)
    running_loss = 0
    loss_at_last_report = 0

    for step, (texts, labels) in enumerate(loader, start = 1):
        optimizer.zero_grad()
        batch_loss = loss_fn(model(texts), labels.to(device))
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

        if step % report_every != 0:
            continue

        # Report the mean over the window since the previous report.
        print(f"Batch {step} / {n_batches} : Train loss = {(running_loss - loss_at_last_report) / report_every}")
        loss_at_last_report = running_loss

    return running_loss / n_batches

def evaluate(model, loader, loss_fn = None, device = "cpu"):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Module called as ``model(texts)`` returning per-class scores.
        loader: Sized iterable yielding ``(texts, labels)`` batches; labels
            are per-class score rows whose argmax is the true class.
        loss_fn: Loss callable; a fresh ``torch.nn.CrossEntropyLoss()`` is
            used when omitted.
        device: Device the labels are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy, report)`` where ``mean_loss`` is a
        Python float averaged over batches, ``accuracy`` is the fraction of
        correct argmax predictions and ``report`` is the text produced by
        ``sklearn.metrics.classification_report``.
    """
    if loss_fn is None:
        loss_fn = torch.nn.CrossEntropyLoss()

    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0
    with torch.no_grad():
        for texts, labels in loader:
            labels = labels.to(device)
            outputs = model(texts)
            # .item() keeps the running total a plain float instead of a
            # tensor living on the evaluation device.
            total_loss += loss_fn(outputs, labels).item()
            all_preds.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            all_labels.extend(torch.argmax(labels, dim=1).cpu().tolist())
    
    val_loss = total_loss / len(loader)
    accuracy = (pd.Series(all_preds) == pd.Series(all_labels)).sum() / len(all_labels)
    return (val_loss, accuracy, sklearn.metrics.classification_report(all_labels, all_preds, zero_division = 0))

def run_training(model, train_loader, test_loader, epochs = 10, lr = 1e-3, weight_decay = 1e-5):
    """Train ``model`` for ``epochs`` epochs and evaluate it after each one.

    Uses CUDA when available, Adam over the parameters that require
    gradients, cross-entropy loss and a ReduceLROnPlateau scheduler driven by
    the test loss. The classification report and the losses are printed after
    every epoch.

    Args:
        model: Module to train; it is moved to the selected device.
        train_loader: Loader passed to ``train_one_epoch``.
        test_loader: Loader passed to ``evaluate``.
        epochs: Number of epochs to run.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = model.to(device)

    loss_fn = torch.nn.CrossEntropyLoss()
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable_params, lr=lr, weight_decay=weight_decay)
    # Multiply the learning rate by 0.2 once the monitored loss plateaus (patience 2).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience = 2, factor = 0.2)

    for epoch_index in range(epochs):
        epoch = epoch_index + 1
        print(f"\nEpoch {epoch}\n")

        train_loss = train_one_epoch(model, train_loader, loss_fn, optimizer, device)
        test_loss, test_accuracy, class_report = evaluate(model, test_loader, loss_fn, device)
        scheduler.step(metrics = test_loss)

        print(class_report)
        print(f"\n\nTrain Loss: {train_loss}")
        print(f"Test Loss: {test_loss}")
        print(f"Test accuracy: {test_accuracy}\n")


In [None]:
#PREPROCESSING DATA

# Load the raw data and keep only rows that have both a text and a category.
df = pd.read_csv('train.csv').dropna(subset=['Text', 'Category'])

# English stop words, with "not" removed from the set so it is never filtered out.
stop_words = set(nltk.corpus.stopwords.words('english')) - {"not"}
lemmatizer = nltk.stem.WordNetLemmatizer()
common_words = wordfreq.top_n_list('en', 400000)

# Spelling dictionary: one entry per common word, with its frequency scaled
# to an integer count.
# NOTE(review): int(frequency * 100000) is 0 for every word whose frequency is
# below 1e-5 - confirm how SymSpell treats zero-count entries.
sym_spell = symspellpy.SymSpell(max_dictionary_edit_distance=3)
for word in common_words:
    scaled_count = int(wordfreq.word_frequency(word, "en") * 100000)
    sym_spell.create_dictionary_entry(word, scaled_count)


def correct_word(word):
    """Return the spelling suggestion for ``word`` from ``sym_spell``.

    The lookup uses CLOSEST verbosity with a maximum edit distance of 3 and
    the first suggestion is returned. When there is no suggestion, the word
    is returned with every '.' replaced by 'e'.
    """
    matches = sym_spell.lookup(word, symspellpy.Verbosity.CLOSEST, max_edit_distance=3)
    return matches[0].term if matches else word.replace('.', 'e')


def get_pos_tag(tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    'V' maps to VERB, 'J' to ADJ, 'R' to ADV; every other first letter maps
    to NOUN.
    """
    wordnet = nltk.corpus.wordnet
    by_first_letter = {
        'V': wordnet.VERB,
        'J': wordnet.ADJ,
        'R': wordnet.ADV,
    }
    return by_first_letter.get(tag[0], wordnet.NOUN)

def match_word(patterned_word):
    """Return the first entry of ``common_words`` that matches ``patterned_word``.

    ``patterned_word`` is used as a regular expression anchored at both ends,
    so '.' stands for any single character; only candidates of the same
    length are considered. When nothing matches, the pattern is returned with
    every '.' replaced by 'e'.
    """
    pattern = re.compile('^' + patterned_word + '$')
    wanted_len = len(patterned_word)
    candidates = (
        candidate
        for candidate in common_words
        if len(candidate) == wanted_len and pattern.match(candidate)
    )
    return next(candidates, patterned_word.replace('.', 'e'))

def preprocess(text):
    """Normalise one raw text into a space-joined string of lemmas.

    Steps, in order: lowercase and strip; remove tokens containing "https",
    "www" or "linkedin"; drop every character that is not a-z or whitespace;
    replace the literal "claire" with '.'; spell-correct each word with
    ``correct_word``; POS-tag the words; discard stop words and lemmatise the
    remaining ones with the tag mapped through ``get_pos_tag``.
    """
    cleaned = text.lower().strip()
    cleaned = re.sub(r'\b\S*(https|www|linkedin)\S*\b', '', cleaned).strip()
    cleaned = re.sub(r"[^a-z\s]", '', cleaned)
    # Done after the character filter, so these '.' placeholders survive it.
    cleaned = cleaned.replace('claire', '.')

    corrected = [correct_word(token) for token in cleaned.split()]

    lemmas = []
    for tagged in nltk.pos_tag(corrected):
        token = tagged[0]
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token, pos = get_pos_tag(tagged[1])))

    return " ".join(lemmas)

# Normalise every text with the preprocess pipeline.
df['Text'] = df['Text'].apply(preprocess)

# Integer ids are assigned in order of first appearance of each category.
# NOTE(review): the loading cell hardcodes its own id2label table; confirm that
# it matches this order-of-appearance mapping for train.csv.
label2id = {label: idx for idx, label in enumerate(df['Category'].unique())}
id2label = {v: k for k, v in label2id.items()}
df['label'] = df['Category'].map(label2id)

print("Preprocessing done")

df = df[['label', 'Text']].reset_index(drop=True)
# index=False: do not write the row index, which would otherwise come back as
# an extra unnamed column when the file is read again.
df.to_csv("Processed_train.csv", index=False)


In [7]:
#LOAD DATA AND MODEL

# Class id <-> category name tables.
# NOTE(review): the preprocessing cell assigns ids in order of first appearance
# in train.csv; confirm that this hardcoded table matches that order.
id2label = {0: 'Accountant', 1: 'Advocate', 2: 'Agriculture', 3: 'Apparel', 4: 'Architecture', 5: 'Arts', 6: 'Automobile', 7: 'Aviation', 8: 'Banking', 9: 'Blockchain', 10: 'BPO', 11: 'Building and Construction', 12: 'Business Analyst', 13: 'Civil Engineer', 14: 'Consultant', 15: 'Data Science', 16: 'Database', 17: 'Designing', 18: 'DevOps', 19: 'Digital Media', 20: 'DotNet Developer', 21: 'Education', 22: 'Electrical Engineering', 23: 'ETL Developer', 24: 'Finance', 25: 'Food and Beverages', 26: 'Health and Fitness', 27: 'Human Resources', 28: 'Information Technology', 29: 'Java Developer', 30: 'Management', 31: 'Mechanical Engineer', 32: 'Network Security Engineer', 33: 'Operations Manager', 34: 'PMO', 35: 'Public Relations', 36: 'Python Developer', 37: 'React Developer', 38: 'Sales', 39: 'SAP Developer', 40: 'SQL Developer', 41: 'Testing', 42: 'Web Designing'}
label2id = {v: k for k, v in id2label.items()}
df = pd.read_csv("Processed_train.csv")

# The vocabulary is built from the whole file, before the train/validation split.
word2idx = TextDataset.create_word_idx_map(df['Text'], freq_threshold = 3)
train_texts, val_texts, train_labels, val_labels = sklearn.model_selection.train_test_split(df['Text'], df['label'], test_size=0.2, random_state=100)

train_dataset = TextDataset(train_texts.reset_index(drop=True), train_labels.reset_index(drop=True), word2idx, n_classes = len(id2label), max_len=1024)
test_dataset = TextDataset(val_texts.reset_index(drop=True), val_labels.reset_index(drop=True), word2idx, n_classes = len(id2label), max_len=1024)
train_dataloader = DataLoader(train_dataset, batch_size = 16, shuffle = True, collate_fn = TextDataset.collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size = 16, shuffle = False, collate_fn = TextDataset.collate_fn)

model = Transformer_Classifier(n_classes=len(id2label), embedding_dim=256, n_heads=5, attention_dim=128, key_query_dim=128, fc1_dim=128, fc2_dim=64, n_words=len(word2idx), dropout=0.3)
# map_location="cpu": a checkpoint saved from a GPU run would otherwise fail to
# deserialize on a machine without CUDA. The model is moved to its target
# device later by the training / evaluation cells.
model.load_state_dict(torch.load("Transformer_weights4.pth", map_location="cpu"))

<All keys matched successfully>

In [None]:
#RUN TRAINING

# Fine-tune for 10 epochs, then overwrite the checkpoint with the updated weights.
run_training(
    model,
    train_dataloader,
    test_dataloader,
    epochs = 10,
    lr = 4e-6,
    weight_decay = 2e-2,
)
torch.save(model.state_dict(), "Transformer_weights4.pth")

In [9]:
#EVALUATION

# Evaluate the current model on the held-out split using the default loss.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

report = evaluate(model, test_dataloader, device = device)
test_loss, test_accuracy, class_report = report

print(class_report)
print(f"Test Loss: {test_loss}")
print(f"Test accuracy: {test_accuracy}\n")

              precision    recall  f1-score   support

           0       0.60      0.80      0.69        65
           1       0.71      0.89      0.79        47
           2       0.36      0.23      0.28        64
           3       0.22      0.21      0.21        48
           4       0.51      0.36      0.42        77
           5       0.64      0.61      0.62        69
           6       0.34      0.31      0.32        65
           7       0.85      0.72      0.78        64
           8       0.37      0.34      0.35        53
           9       0.00      0.00      0.00         4
          10       0.81      0.36      0.50        47
          11       0.66      0.85      0.74        73
          12       0.63      0.60      0.61        77
          13       0.71      0.78      0.75        64
          14       0.52      0.48      0.50        64
          15       0.76      0.68      0.72        65
          16       0.67      0.60      0.63        50
          17       0.48    