<a href="https://colab.research.google.com/github/Caitlin-Fogg/NLP-Project/blob/main/COMP316_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Group members:
#Caitlin Fogg 223005053
#Tarika Sukdeoa 223010024
#Naseeha Osman 223005931

In [None]:
# Main solution - HMM model

# Imports
!pip uninstall gensim -y
!pip install --upgrade gensim
!pip install cython
import gensim
from gensim.models import Word2Vec

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from collections import defaultdict
from collections import Counter

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns



df = pd.read_csv("/content/resume_data.csv")

# Renaming columns to more common terms
column_map = {
    'career_objective': 'objective',
    'skills': 'skills',
    'educational_institution_name': 'institution',
    'degree_names': 'degree',
    'passing_years': 'grad_year',
    'educational_results': 'grade',
    'result_types': 'grade_type',
    'major_field_of_studies': 'major',
    'professional_company_names': 'company',
    'company_urls': 'company_url',
    'start_dates': 'start_date',
    'end_dates': 'end_date',
    'related_skils_in_job': 'job_skills',
    'positions': 'job_title',
    'locations': 'location',
    'responsibilities': 'responsibility',
    'extra_curricular_activity_types': 'activity_type',
    'extra_curricular_organization_names': 'activity_org',
    'extra_curricular_organization_links': 'activity_link',
    'role_positions': 'activity_role',
    'languages': 'languages',
    'proficiency_levels': 'language_level',
    'certification_providers': 'cert_provider',
    'certification_skills': 'cert_skills',
    'online_links': 'cert_link',
    'issue_dates': 'cert_issue_date',
    'expiry_dates': 'cert_expiry_date',
    '\ufeffjob_position_name': 'job_position',
    'educationaL_requirements': 'job_edu_req',
    'experiencere_requirement': 'job_exp_req',
    'age_requirement': 'job_age_req',
    'responsibilities.1': 'job_responsibility',
    'skills_required': 'job_skills_req',
    'matched_score': 'matched_score'
}

df.rename(columns=column_map, inplace=True)
col_to_use = ['objective','skills', 'institution','degree','grad_year','grade','grade_type','major','company','start_date','end_date','job_skills','job_title','location','responsibility','activity_type','activity_org','languages','language_level','cert_provider','cert_skills','cert_issue_date','cert_expiry_date']
def get_word_tags(row):
    word_tag_pairs = []
    stop_words = set(stopwords.words('english'))  # Get the set of English stop words
    for col in col_to_use:
        value = row[col]
        # Checks if a value is missing
        if pd.isna(value):
            continue
        # Convert list-like strings to lists if needed
        if isinstance(value, str) and value.startswith("[") and value.endswith("]"):
            try:
                items = eval(value) # Uses eval to try convert it to an actual Python list
            except:
                items = [value] # If it fails, treat it like a single item list
        elif isinstance(value, str):
            items = re.split(r'[,\n;]', value)
        else:
            items = [value]

        for item in items:
            # Check if item is not None and is a string before applying strip()
            if item is not None and isinstance(item, str):
                item = item.strip()  # convert item to string before strip, removes leading/trialing whitespaces
            # If item is a string, proceed to split
            if isinstance(item, str) and item:
                for word in item.split():
                    if word.lower() not in stop_words:  # Check if the word is a stop word
                        word_tag_pairs.append((word.lower(), col.lower()))
            # If item is a list, iterate through and split each element
            elif isinstance(item, list):
                for sub_item in item:
                    if isinstance(sub_item, str):
                        for word in sub_item.split():
                            if word.lower() not in stop_words:  # Check if the word is a stop word
                                word_tag_pairs.append((word.lower(), col.lower()))
    return word_tag_pairs

# Create full dataset of word-tag pairs (with stop words removed)
all_data = []
for _, row in df.iterrows():
    all_data.append(get_word_tags(row))

# Train/Test Split

train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)

print(f"Total resumes: {len(all_data)}")
print(f"Training resumes: {len(train_data)}")
print(f"Testing resumes: {len(test_data)}")

print("Preprocessing complete")

# Word embedding
# Extracting words from training data
def get_words(row):
    sentence = []
    for word, tag in row:
        sentence.append(word)
    return sentence

train_sentences = []
for row in train_data:
  train_sentences.append(get_words(row))

# Train model
word2vec_embedding_model = Word2Vec(
    sentences=train_sentences,  # Training data
    vector_size = 100, # Word vectors size -(increse=more nuance but risk of overfitting) (lower = faster but lose small differences)
    window = 5, # How many words before and after it looks at (increase - better for thematic similarity?) (decrease - syntatic and short range?)
    min_count = 2,  # Ignores words that only appear <min_count (smaller better for smaller data sets but risk noise)(higher more stable model)
    workers = 4, # No. of CPU cores used to train model
    sg = 1) # Uses skip gram but can use (SG- slower, better for small datasets and specialized datasets) (CBOW = 0) (cbow - faster, better for large corpuses that are more general )

# Save model
word2vec_embedding_model.save("word2vec_embedding_model.model")

print("Word embedding model complete")
# Function to get word vectors
def get_word_vector(word):
    """Get embedding vector for a word, returns zero vector if word not found"""
    if word in word2vec_embedding_model.wv:
        return word2vec_embedding_model.wv[word]
    else:
        return np.zeros(EMBEDDING_DIM)

# Extracting unique tags (NER labels)
all_tags = set() # Use a set to ensure no duplicates
for row in train_data:
    for word, tag in row:
        all_tags.add(tag)
all_tags = list(all_tags) # Converts set to a list to enable indexing
# Creates indices for the tags in a dictionary
tag_to_index = {tag: i for i, tag in enumerate(all_tags)}
index_to_tag = {i: tag for tag, i in tag_to_index.items()}

# Setting up constants
EMBEDDING_DIM = 100  # Must match Word2Vec embedding size
NUM_TAGS = len(all_tags)

# Emission probabilities using Word2Vec - we use cosine similarity between a word vector and the average vector for each tag
# This is instead of using P(word|tag) from count probabilities

# Calculate average embedding for each tag
tag_vectors = defaultdict(list) # Creates a dictionary where each key (a tag) maps to a list of word vectors
# Loops through each resume in train_data
for row in train_data:
  # Each row is a list of word tag pairs
    for word, tag in row:
      # Uses get_word_vector function
        vec = get_word_vector(word)
        # If the vector exists, add it to the list of vectors assocated with that tag
        if vec is not None:
            tag_vectors[tag].append(vec)

# Creates an empty dictionary to hold the average vector for each tag
tag_avg_vectors = {}
# For each tag and its list of vectors: compute mean vector across all word vectors for that tag, store average vector
for tag, vecs in tag_vectors.items():
    if vecs:
        tag_avg_vectors[tag] = np.mean(vecs, axis=0)
    else:
      # If there were no vectors, assign a zero vector tp avoid breaking the model
        tag_avg_vectors[tag] = np.zeros(EMBEDDING_DIM)

# Emission probability using cosine similarity between word vector and average vector
# Function takes in word that we want to estimate the probability of being emitted from a tag
# Cosine similarity took too long when comparing single comparisons therefore defined a faster method
def fast_cosine_similarity(vec1, vec2):
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(vec1, vec2) / (norm1 * norm2)

def emission_probability(word, tag):
    word_vec = get_word_vector(word)
    tag_vec = tag_avg_vectors[tag]
    return fast_cosine_similarity(word_vec, tag_vec)

# Transition Probabilities
# Count transitions and initial states
transition_counts = defaultdict(Counter) # A nested dictionary where transition_counts[prev_tag][curr_tag] gives the count of how often curr_tag follows prev_tag
initial_tag_counts = Counter() # A counter for how many times each tag is the first tag in a resume

# Loops through training data, skipping empty rows
for row in train_data:
    if not row:
        continue
    initial_tag_counts[row[0][1]] += 1 # For each resume, look at the first tag (row[0][1]) and increment its count
    # For every pair of consecutive tags in a resume, count how often curr_tag follows prev_tag
    for i in range(1, len(row)):
        prev_tag = row[i - 1][1]
        curr_tag = row[i][1]
        transition_counts[prev_tag][curr_tag] += 1

# Normalise to probabilities
# These dictionaries will hold the final probabilities
transition_probs = {}
initial_probs = {}

V = len(all_tags)  # Total number of unique tags (for smoothing)

# Compute inital tag probabilities
total_initial = sum(initial_tag_counts.values()) + V  # Add V for Laplace smoothing
for tag in all_tags:
    initial_probs[tag] = (initial_tag_counts[tag] + 1) / total_initial # + 1 for Laplace smoothing

# Compute transition probabilities
for prev_tag in all_tags:
    total = sum(transition_counts[prev_tag].values()) + V # Counts total amount of times prev_tag occured, added V for smoothing
    transition_probs[prev_tag] = {}
    # For each possible curr_tag, compute the probability of it occurring after prev_tag
    for curr_tag in all_tags:
        if total == 0:
            transition_probs[prev_tag][curr_tag] = 0
        else:
            transition_probs[prev_tag][curr_tag] = (transition_counts[prev_tag][curr_tag] + 1) / total

# Viterbi algorithm
# Given a sentence it returns the most likely sequence of tags
def viterbi(sentence):
    V = [{}]  # List of dictionaries - stores max probabilities for each tag in sentence
    path = {}  # Backpointer - stores the best tag path leading to each tag

    # Initialisation step (for the first word)
    for tag in all_tags:
      # Computes P(tag at position 0) = initial_probs[tag] × emission_prob(word_0 | tag)
       # Uses .get() to return a small value (1e-6) if tag is not in initial_probs, for smoothing
        V[0][tag] = initial_probs.get(tag, 1e-6) * emission_probability(sentence[0], tag)
        path[tag] = [tag]

    # Recursion step - words 1 to n - finds the best path up to that tag
    for t in range(1, len(sentence)):
        V.append({}) # Create a new dictionary
        new_path = {} # Temporary path to build updated best paths for each tag at a time
        # Try each possible tag at the current word position
        for curr_tag in all_tags:
          # Find the best previous tag that leads to this current tag with the highest probability
            (prob, prev_tag) = max(
                (V[t - 1][prev_tag] * transition_probs[prev_tag].get(curr_tag, 1e-6) * emission_probability(sentence[t], curr_tag), prev_tag)
                for prev_tag in all_tags
            )
            V[t][curr_tag] = prob # Store max probability
            new_path[curr_tag] = path[prev_tag] + [curr_tag] # Append next best tag
        path = new_path # Update path

    # Termination step
    # Find the tag at the last word (V[-1]) with the highest probablity. V[-1][tag] gives you the probability of the best path ending in tag
    (prob, final_tag) = max((V[-1][tag], tag) for tag in all_tags)
    return path[final_tag] # Return the full best tag path that ends in final_tag

print("Start testing")
# Evaluation and testing
# Flatten true and predicted tags
true_tags = []
pred_tags = []

for i, resume in enumerate(test_data):
    if not resume:
        continue
    words = [word for word, _ in resume]
    true_tags_temp = [tag for _, tag in resume]
    pred_tags_temp = viterbi(words)

    true_tags.extend(true_tags_temp)
    pred_tags.extend(pred_tags_temp)
    if i % 100 == 0:
        print(f"Processed {i} resumes") # Prints after every 100 resumes are processed to track progress

print("Testing complete.")
# Generate evaluation metrics
print("Evaluation on Test Data:")
print(classification_report(true_tags, pred_tags, labels=all_tags))

Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Installing collected packages: gensim
Successfully installed gensim-4.3.3




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Total resumes: 9544
Training resumes: 7635
Testing resumes: 1909
Preprocessing complete
Word embedding model complete
Start testing
Processed 0 resumes
Processed 100 resumes
Processed 200 resumes
Processed 300 resumes
Processed 400 resumes
Processed 500 resumes
Processed 600 resumes
Processed 700 resumes
Processed 800 resumes
Processed 900 resumes
Processed 1000 resumes
Processed 1100 resumes
Processed 1200 resumes
Processed 1300 resumes
Processed 1400 resumes
Processed 1500 resumes
Processed 1600 resumes
Processed 1700 resumes
Processed 1800 resumes
Processed 1900 resumes
Testing complete.
Evaluation on Test Data:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                  precision    recall  f1-score   support

          skills       0.35      0.96      0.52     67022
   cert_provider       0.00      0.00      0.00      1054
         company       0.78      0.36      0.50     11367
          degree       0.67      0.08      0.14      5710
   activity_type       1.00      0.01      0.02      1520
           major       0.93      0.18      0.30      5796
 cert_issue_date       0.00      0.00      0.00       478
      grade_type       0.63      0.10      0.17      2219
        location       1.00      0.43      0.60     10117
cert_expiry_date       0.00      0.00      0.00        91
      start_date       0.57      0.29      0.39      8342
      job_skills       0.72      0.38      0.50     29886
  language_level       0.00      0.00      0.00       409
     institution       0.59      0.17      0.26      9685
       languages       0.00      0.00      0.00       315
           grade       0.56      0.16      0.24      2698
        end_d

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Benchmark model - BiLSTM-CRF model with our Word2Vec embeddings
# Imports
!pip uninstall gensim -y
!pip install --upgrade gensim
!pip install cython
!pip install torchcrf

!pip install git+https://github.com/kmkurn/pytorch-crf.git

import torch
import torch.nn as nn
from torchcrf import CRF
from torch.nn.utils.rnn import pad_sequence
# Create dataloader
from torch.utils.data import TensorDataset, DataLoader

import gensim
from gensim.models import Word2Vec

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from collections import defaultdict
from collections import Counter

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns


df = pd.read_csv("/content/resume_data.csv")

# Renaming columns to more common terms
column_map = {
    'career_objective': 'objective',
    'skills': 'skills',
    'educational_institution_name': 'institution',
    'degree_names': 'degree',
    'passing_years': 'grad_year',
    'educational_results': 'grade',
    'result_types': 'grade_type',
    'major_field_of_studies': 'major',
    'professional_company_names': 'company',
    'company_urls': 'company_url',
    'start_dates': 'start_date',
    'end_dates': 'end_date',
    'related_skils_in_job': 'job_skills',
    'positions': 'job_title',
    'locations': 'location',
    'responsibilities': 'responsibility',
    'extra_curricular_activity_types': 'activity_type',
    'extra_curricular_organization_names': 'activity_org',
    'extra_curricular_organization_links': 'activity_link',
    'role_positions': 'activity_role',
    'languages': 'languages',
    'proficiency_levels': 'language_level',
    'certification_providers': 'cert_provider',
    'certification_skills': 'cert_skills',
    'online_links': 'cert_link',
    'issue_dates': 'cert_issue_date',
    'expiry_dates': 'cert_expiry_date',
    '\ufeffjob_position_name': 'job_position',
    'educationaL_requirements': 'job_edu_req',
    'experiencere_requirement': 'job_exp_req',
    'age_requirement': 'job_age_req',
    'responsibilities.1': 'job_responsibility',
    'skills_required': 'job_skills_req',
    'matched_score': 'matched_score'
}

df.rename(columns=column_map, inplace=True)
col_to_use = ['objective','skills', 'institution','degree','grad_year','grade','grade_type','major','company','start_date','end_date','job_skills','job_title','location','responsibility','activity_type','activity_org','languages','language_level','cert_provider','cert_skills','cert_issue_date','cert_expiry_date']
def get_word_tags(row):
    word_tag_pairs = []
    stop_words = set(stopwords.words('english'))  # Get the set of English stop words
    for col in col_to_use:
        value = row[col]
        # Checks if a value is missing
        if pd.isna(value):
            continue
        # Convert list-like strings to lists if needed
        if isinstance(value, str) and value.startswith("[") and value.endswith("]"):
            try:
                items = eval(value) # Uses eval to try convert it to an actual Python list
            except:
                items = [value] # If it fails, treat it like a single item list
        elif isinstance(value, str):
            items = re.split(r'[,\n;]', value)
        else:
            items = [value]

        for item in items:
            # Check if item is not None and is a string before applying strip()
            if item is not None and isinstance(item, str):
                item = item.strip()  # convert item to string before strip, removes leading/trialing whitespaces
            # If item is a string, proceed to split
            if isinstance(item, str) and item:
                for word in item.split():
                    if word.lower() not in stop_words:  # Check if the word is a stop word
                        word_tag_pairs.append((word.lower(), col.lower()))
            # If item is a list, iterate through and split each element
            elif isinstance(item, list):
                for sub_item in item:
                    if isinstance(sub_item, str):
                        for word in sub_item.split():
                            if word.lower() not in stop_words:  # Check if the word is a stop word
                                word_tag_pairs.append((word.lower(), col.lower()))
    return word_tag_pairs

# Create full dataset of word-tag pairs (with stop words removed)
all_data = []
for _, row in df.iterrows():
    all_data.append(get_word_tags(row))

# Train/Test Split

train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)

print(f"Total resumes: {len(all_data)}")
print(f"Training resumes: {len(train_data)}")
print(f"Testing resumes: {len(test_data)}")

# Build vocabulary and tag mappings
word_to_ix = defaultdict(lambda: len(word_to_ix)) # Maps each unique word to a unique index
tag_to_ix = defaultdict(lambda: len(tag_to_ix)) # Maps each unique tag to a unique index

# Add special padding tokens to both vocabs - used to pad shorter sequences
word_to_ix["<PAD>"]  # index 0 - placed before any words or tags
tag_to_ix["<PAD>"]

# Populate mappings from the training data
for sentence in train_data:
    for word, tag in sentence:
        word_to_ix[word.lower()]
        tag_to_ix[tag]

# Lock vocab - converts to regular dict to prevent accidental addition of new items during model training or inference
word_to_ix = dict(word_to_ix)
tag_to_ix = dict(tag_to_ix)
# Create reverse mapping for tags
ix_to_tag = {v: k for k, v in tag_to_ix.items()}
# Creates a list of all known words - helpful for debugging
vocabulary = list(word_to_ix.keys())

# Converting to indexed sequences - takes words/tags and returns a list of indices
def encode_sentence(sentence, word_to_ix):
    return [word_to_ix[word.lower()] for word, tag in sentence]

def encode_tags(sentence, tag_to_ix):
    return [tag_to_ix[tag] for word, tag in sentence]

# Apply encoding to whole training set
encoded_sentences = [encode_sentence(s, word_to_ix) for s in train_data]
encoded_tags = [encode_tags(s, tag_to_ix) for s in train_data]

# Convert to tensors - a generalisation of vectors to higher dimensions, allows for efficient numerical computations
# Required for embedding layers and loss functions
# X and Y are lists of 1D tensors
X = [torch.tensor(seq, dtype=torch.long) for seq in encoded_sentences]
y = [torch.tensor(seq, dtype=torch.long) for seq in encoded_tags]

# Pad sequences to the same length for efficient batch processing
# batch_first=True - means the resulting shape is batch_size, max_seq_len, padding_value specifies which index is used to fill the shorter sequences
X_padded = pad_sequence(X, batch_first=True, padding_value=word_to_ix["<PAD>"])
y_padded = pad_sequence(y, batch_first=True, padding_value=tag_to_ix["<PAD>"])
# Create mask - used during model training to avoid computing loss on padded positions-
mask = X_padded != word_to_ix["<PAD>"]

# Combines X_padded, y_padded and mask into a single TensorDataset
dataset = TensorDataset(X_padded, y_padded, mask)
# Dataloader provides batches of data to the model during training
# Shuffle the data at every epoch to improve generalisation, also uses 2 background subprocesses for data loading (helps speed up training)
loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2)

# Load the pretrained Word2Vec model
word2vec_model = Word2Vec.load("word2vec_embedding_model.model")

# Access the word vectors
word_vectors = word2vec_model.wv

# Initialise an embedding matrix with random values (size of vocab x embedding size)
embedding_dim = 100  # Match the size used in Word2Vec model
vocab_size = len(vocabulary)  # Size of vocabulary
# Initialises a 2D numPy array (the embedding matrix) with random values
# Holds either real Word2Vec vectors or random vectors if word is not in pretrained model
embedding_matrix = np.random.randn(vocab_size, embedding_dim)

# Creates a dictionary mapping each word in vocabulary to its corresponding row index in the embedding matrix
word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

# Fill the embedding matrix with Word2Vec embeddings
for word, idx in word_to_index.items():
    if word in word_vectors:
        embedding_matrix[idx] = word_vectors[word]
    else:
        embedding_matrix[idx] = np.random.randn(embedding_dim)  # for unknown words

# Defining a neural network that performs sequence labeling (e.g. tagging words in resumes with labels like skill)
# Inherits from nn.Module
class BiLSTM_CRF(nn.Module):
  # Initialisation method
    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, pretrained_embeddings=None):
        super(BiLSTM_CRF, self).__init__()
        # Initialise embedding layer with pretrained Word2Vec embeddings
        if pretrained_embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(torch.tensor(pretrained_embeddings, dtype=torch.float))
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Hidden LSTM layer - outputs contextual representations
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
        # Linear layer - Maps LSTM output to scores for each possible tag (emission scores for tokens)
        self.fc = nn.Linear(hidden_dim, tagset_size)
        # CRF layer - takes emission scores and models dependencies between tags i.e. which tag likely follows another tag
        self.crf = CRF(tagset_size, batch_first=True)

    # Forward pass - outputs emission scores fro every token in the sentence
    def forward(self, sentence, mask=None):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        emissions = self.fc(lstm_out)
        return emissions

    # Loss function - gets emission scores, applies the mask and Computes the negative log-likelihood loss using the CRF
    # Returns a negative value in order to minimise loss
    def loss(self, sentence, tags, mask):
        emissions = self.forward(sentence)
        # Apply mask to avoid padding positions
        masked_emissions = emissions * mask.unsqueeze(-1)
        return -self.crf(masked_emissions, tags, mask=mask)

    # Prediction method - gets emission scores and Uses CRF decoding (Viterbi algorithm) to get most likely tag sequence
    def predict(self, sentence, mask=None):
        emissions = self.forward(sentence)
        return self.crf.decode(emissions, mask=mask)

# Initialise the model with the pretrained embeddings
model = BiLSTM_CRF(vocab_size=len(vocabulary),
                   embedding_dim=embedding_dim,
                   hidden_dim=128,
                   tagset_size=len(tag_to_ix),
                   pretrained_embeddings=embedding_matrix)

# model.parameters - This gives the optimizer access to all learnable parameters in the model so it can update them during training
# lr - the learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train the model
for epoch in range(10):
    model.train() # Put model into training mode
    total_loss = 0 # Used to monitor training process
    for batch in loader:
        x_batch, y_batch, mask_batch = batch

        # Forward pass
        loss = model.loss(x_batch, y_batch, mask=mask_batch)
        total_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()  # Zero gradients before backward pass to avoid gradient accumulation across batches
        loss.backward()        # Backpropagate the loss - computes gradients of the loss with respect to each model parameter
        optimizer.step()       # Update the model's weights

    print(f"Epoch {epoch+1} - Loss: {total_loss:.4f}")

# Inference and evaluation
# Take the first 100 samples from test_data
subset_test_data = test_data[:100]

# Encode sentences and tags
subset_encoded_sentences = [encode_sentence(s, word_to_ix) for s in subset_test_data]
subset_encoded_tags = [encode_tags(s, tag_to_ix) for s in subset_test_data]

# Pad sequences and create mask
X_subset = [torch.tensor(seq, dtype=torch.long) for seq in subset_encoded_sentences]
y_subset = [torch.tensor(seq, dtype=torch.long) for seq in subset_encoded_tags]
X_subset_padded = pad_sequence(X_subset, batch_first=True, padding_value=word_to_ix["<PAD>"])
y_subset_padded = pad_sequence(y_subset, batch_first=True, padding_value=tag_to_ix["<PAD>"])
mask_subset = X_subset_padded != word_to_ix["<PAD>"]

# Create DataLoader for the subset
subset_dataset = TensorDataset(X_subset_padded, y_subset_padded, mask_subset)
subset_loader = DataLoader(subset_dataset, batch_size=32, shuffle=False)  # No shuffling for evaluation

model.eval() # Sets model to evaluation mode
subset_preds = [] # Collects predicted labels
subset_labels = [] # Collects true labels

with torch.no_grad(): # Disables gradient calculation - saves memory and speeds up evaluation
    for batch in subset_loader:
        x_batch, y_batch, mask_batch = batch
        # Use the model to predict the tag sequence for each sentence using CRF Viterbi decoding
        predictions = model.predict(x_batch, mask=mask_batch)

        # Process predictions and ground truth
        for i in range(len(predictions)):
            pred = predictions[i]
            true = y_batch[i][mask_batch[i]].tolist()  # Get true tags where mask is True
            subset_preds.extend(pred)
            subset_labels.extend(true)

# Convert tag indices back to tag names for evaluation
pred_tag_names = [ix_to_tag[ix] for ix in subset_preds]
true_tag_names = [ix_to_tag[ix] for ix in subset_labels]

# Generate a classification report
print(classification_report(true_tag_names, pred_tag_names, digits=4))


Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Installing collected packages: gensim
Successfully installed gensim-4.3.3


Collecting torchcrf
  Downloading TorchCRF-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->torchcrf)
 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Total resumes: 9544
Training resumes: 7635
Testing resumes: 1909
Epoch 1 - Loss: 178161.5644
Epoch 2 - Loss: 12170.4421
Epoch 3 - Loss: 4036.8365
Epoch 4 - Loss: 1779.1881
Epoch 5 - Loss: 19014.4908
Epoch 6 - Loss: 2622.9141
Epoch 7 - Loss: 1159.7661
Epoch 8 - Loss: 1437.9918
Epoch 9 - Loss: 1602.9710
Epoch 10 - Loss: 492.0304
                  precision    recall  f1-score   support

    activity_org     0.9854    1.0000    0.9926       135
   activity_type     1.0000    0.9794    0.9896        97
cert_expiry_date     1.0000    1.0000    1.0000         6
 cert_issue_date     0.9545    1.0000    0.9767        21
   cert_provider     1.0000    0.9762    0.9880        42
     cert_skills     1.0000    1.0000    1.0000        11
         company     1.0000    1.0000    1.0000       620
          degree     1.0000    1.0000    1.0000       336
        end_date     1.0000    1.0000    1.0000       430
       grad_year     1.0000    1.0000    1.0000       170
           grade     1.0000    1