# **Training the model**

In [None]:
!pip install spacy
!python -m spacy download en_core_web_lg

In [None]:
!pip install scikit-learn

In [None]:
def clean_entities(text, entities):
    """Trim whitespace from entity spans and drop empty or overlapping ones.

    Args:
        text: The document the character offsets refer to.
        entities: Iterable of ``(start, end, label)`` tuples where ``end`` is
            exclusive.

    Returns:
        A list of ``(start, end, label)`` tuples in input order. Surrounding
        whitespace is removed from every span. A span is dropped when it is
        empty after trimming or when it shares at least one character with a
        span that was kept earlier (first come, first served).
    """
    cleaned_entities = []
    seen_chars = set()  # character offsets already claimed by a kept entity
    for start, end, label in entities:
        # Normalise the offsets exactly the way slicing would (clamping to the
        # text length), so the arithmetic below matches text[start:end].
        start, end, _ = slice(start, end).indices(len(text))
        raw = text[start:end]
        entity_text = raw.strip()
        if not entity_text:
            # Whitespace-only or empty span: a zero-length entity is useless
            # for training, so skip it instead of emitting (start, start, label).
            continue

        # Shift the start past the leading whitespace; the end follows from
        # the length of the trimmed text.
        start += len(raw) - len(raw.lstrip())
        end = start + len(entity_text)
        entity_chars = set(range(start, end))

        if seen_chars.isdisjoint(entity_chars):
            cleaned_entities.append((start, end, label))
            seen_chars.update(entity_chars)
    return cleaned_entities


In [None]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks-style JSON annotation file to spaCy training tuples.

    The file is read as a single JSON document: a list of items, each with a
    ``content`` string and an optional ``annotation`` list. Only the first
    entry of each annotation's ``points`` is used, and its inclusive ``end``
    offset is converted to an exclusive one.

    Args:
        dataturks_JSON_FilePath: Path of the JSON file to read.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        or ``None`` if the file could not be read or parsed (the error is
        logged with its traceback).
    """
    # Imported here because no visible cell imports these modules before this
    # function is called.
    import json
    import logging

    try:
        training_data = []
        # Explicit encoding so the result does not depend on the platform
        # default.
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            data = json.load(f)  # Load entire JSON content

        for item in data:
            text = item['content']
            entities = []
            if 'annotation' in item and item['annotation'] is not None:
                for annotation in item['annotation']:
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # A label may be given as a single value or as a list.
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # +1 turns the inclusive end offset into an exclusive one.
                        entities.append((point['start'], point['end'] + 1, label))

            cleaned_entities = clean_entities(text, entities)
            training_data.append((text, {"entities": cleaned_entities}))

        return training_data
    except Exception as e:
        logging.exception(f"Unable to process {dataturks_JSON_FilePath}\nerror = {str(e)}")
        return None


In [None]:
def train_spacy(train_data_path="/content/traindata.json",
                output_dir="/content/cv_parsing_model",
                n_iter=20,
                patience=3,
                dropout=0.2):
    """Fine-tune the NER component of ``en_core_web_lg`` and save the pipeline.

    Uses the spaCy v3 training API (``Example`` objects, string-based
    ``add_pipe``).

    Args:
        train_data_path: Dataturks JSON file passed to
            ``convert_dataturks_to_spacy``.
        output_dir: Directory the trained pipeline is written to.
        n_iter: Maximum number of passes over the training data.
        patience: Stop after this many consecutive passes without a lower
            NER loss.
        dropout: Dropout rate passed to ``nlp.update``.

    Raises:
        ValueError: If no training data could be loaded.
    """
    # Imported here because the notebook calls this function before the cell
    # that imports these names.
    import random

    import spacy
    from spacy.training import Example
    from spacy.util import minibatch, compounding

    TRAIN_DATA = convert_dataturks_to_spacy(train_data_path)
    if not TRAIN_DATA:
        # The converter returns None on failure; fail with a clear message
        # instead of a TypeError further down.
        raise ValueError(f"No training data could be loaded from {train_data_path}")

    nlp = spacy.load('en_core_web_lg')  # Load pre-trained model

    if 'ner' not in nlp.pipe_names:
        # spaCy v3: add_pipe takes the factory name and returns the component.
        # NOTE(review): a freshly added component is untrained and may need
        # nlp.initialize() before resume_training() — confirm if this branch
        # is ever hit.
        ner = nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the training data.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # Only the NER component is updated; all other pipes are disabled while
    # training and restored afterwards.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.resume_training()
        best_loss = float('inf')
        no_improvement = 0

        for itn in range(n_iter):
            print(f"Starting iteration {itn}")
            random.shuffle(TRAIN_DATA)
            losses = {}
            for batch in minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.5)):
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotation)
                    for text, annotation in batch
                ]
                nlp.update(
                    examples,
                    drop=dropout,
                    sgd=optimizer,
                    losses=losses)
            print(losses)

            # Early stopping check on the accumulated NER loss of this pass.
            current_loss = losses.get('ner', float('inf'))
            if current_loss < best_loss:
                best_loss = current_loss
                no_improvement = 0
            else:
                no_improvement += 1

            if no_improvement >= patience:
                print(f"No improvement for {patience} iterations. Stopping training.")
                break

    nlp.to_disk(output_dir)
    print(f"Model saved to {output_dir}")

In [None]:
# Run the training defined in train_spacy(); it writes the trained pipeline to disk when done.
train_spacy()

In [None]:
import spacy
from spacy.util import minibatch, compounding
from pathlib import Path
import random
import string

# `Example` is used below but is not among the imports of this cell.
from spacy.training import Example

nlp = spacy.load("/content/drive/MyDrive/CV_Parser/cv_parsing_model")

# Get the NER component
ner = nlp.get_pipe("ner")

# NOTE(review): TRAINING_DATA is not defined in any visible cell. It is
# presumably a list of (text, {"entities": [(start, end, label), ...]}) tuples
# such as convert_dataturks_to_spacy() returns — confirm and define it above.

# Add labels
for _, annotations in TRAINING_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])


def _resolve_entities(text, entities):
    """Trim whitespace/punctuation from span edges and drop overlapping spans.

    Spans are processed in input order. A span is kept only if, after
    trimming, it is non-empty and shares no character with a span kept
    earlier. Offsets are half-open, so a span that merely touches the previous
    one (its start equals the other's end) is not an overlap.
    """
    resolved = []
    occupied = set()  # character offsets covered by the spans kept so far
    for start, end, label in entities:
        end = min(end, len(text))  # guard against offsets past the end of the text
        while start < end and (text[start].isspace() or text[start] in string.punctuation):
            start += 1
        while end > start and (text[end - 1].isspace() or text[end - 1] in string.punctuation):
            end -= 1
        if start >= end:
            continue  # nothing left after trimming
        span_chars = range(start, end)
        if any(pos in occupied for pos in span_chars):
            continue  # overlaps a span that was kept earlier
        resolved.append((start, end, label))
        occupied.update(span_chars)
    return resolved


# Convert the training data to spaCy examples
examples = []
for text, annotations in TRAINING_DATA:
    doc = nlp.make_doc(text)
    resolved_entities = _resolve_entities(text, annotations.get("entities"))
    # Build a new annotation dict instead of mutating TRAINING_DATA, so that
    # re-running this cell starts from the same input.
    example = Example.from_dict(doc, {**annotations, "entities": resolved_entities})
    examples.append(example)

# Start the training
optimizer = nlp.resume_training()
move_names = list(ner.move_names)  # not used further in this cell
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

best_loss = float('inf')
no_improvement = 0
patience = 3  # Number of iterations with no improvement to wait before stopping

with nlp.select_pipes(disable=other_pipes):  # only train NER
    sizes = compounding(1.0, 4.0, 1.001)
    for itn in range(50):
        random.shuffle(examples)
        batches = minibatch(examples, size=sizes)
        losses = {}
        for batch in batches:
            nlp.update(
                batch,  # batch of Example objects
                drop=0.2,  # dropout - make it harder to memorize data
                sgd=optimizer,  # use the optimizer created by resume_training()
                losses=losses
            )
        print(f"Iteration {itn}, Losses: {losses}")

        # Early stopping check
        current_loss = losses.get('ner', float('inf'))
        if current_loss < best_loss:
            best_loss = current_loss
            no_improvement = 0
        else:
            no_improvement += 1

        if no_improvement >= patience:
            print(f"No improvement for {patience} iterations. Stopping training.")
            break

# Save the model to disk
output_dir = Path("/content/drive/MyDrive/CV_Parser/cv_parsing_model_fineTuned2")
nlp.to_disk(output_dir)

print("Training complete. Model saved to", output_dir)

In [None]:
import PyPDF2
import pdfplumber
import spacy
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Function to extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The concatenated page texts. A page whose ``extract_text()`` yields
        nothing contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ''` guards against extract_text() returning None for a page
        # without extractable text (behaviour depends on the pdfplumber
        # version).
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next.
    return '\n'.join(page_texts)

# Function to preprocess text
def preprocess_text(text):
    """Collapse each run of whitespace into a single space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', text)
    trimmed = single_spaced.strip()
    return trimmed

# Function to load and parse CV using custom spaCy model
def parse_cv(cv_text, nlp_model):
    """Run ``nlp_model`` on ``cv_text`` and list the entities it found.

    Returns:
        A list of ``(label, text)`` tuples, one per entity in ``doc.ents``,
        in document order.
    """
    labelled_spans = []
    for entity in nlp_model(cv_text).ents:
        labelled_spans.append((entity.label_, entity.text))
    return labelled_spans

# Function to calculate cosine similarity between two vectors
def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors, scaled by 100.

    Both inputs are reshaped to a single row before being handed to
    ``cosine_similarity``; the single entry of the resulting matrix is
    multiplied by 100.
    """
    row_a = vector1.reshape(1, -1)
    row_b = vector2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0] * 100

# Main function to process CV and job description
def process_resume_and_job(cv_path, jd_path, nlp_cv=None, sentence_model=None, seed=42):
    """Score how well a CV matches a job description.

    Both PDFs are converted to whitespace-normalised text. The CV is reduced
    to the entity texts found by the custom spaCy model. The CV and the job
    description are embedded twice (Doc2Vec and Sentence Transformers), and
    the two cosine similarities (each scaled by 100) are averaged.

    Args:
        cv_path: Path of the CV PDF.
        jd_path: Path of the job-description PDF.
        nlp_cv: Optional already-loaded spaCy pipeline. When omitted, the
            pipeline is loaded from the default model directory on every call.
        sentence_model: Optional already-loaded ``SentenceTransformer``. When
            omitted, ``'bert-base-nli-mean-tokens'`` is loaded on every call.
        seed: Seed passed to Doc2Vec so repeated calls on the same input give
            comparable scores.

    Returns:
        The mean of the two similarity scores.
    """
    # Step 1: Extract text from PDFs
    cv_text = extract_text_from_pdf(cv_path)
    jd_text = extract_text_from_pdf(jd_path)

    # Step 2: Preprocess text
    cv_text = preprocess_text(cv_text)
    jd_text = preprocess_text(jd_text)

    # Step 3: Load custom spaCy model for CV parsing (skipped when the caller
    # supplies one, so the model is not reloaded for every CV)
    if nlp_cv is None:
        nlp_cv = spacy.load("/content/drive/MyDrive/CV_Parser/cv_parsing_model_final/model-best")

    # Step 4: Parse CV using custom spaCy model
    parsed_cv_data = parse_cv(cv_text, nlp_cv)
    # NOTE(review): when the model finds no entities this is an empty string,
    # and both embeddings below are computed from empty input.
    parsed_cv_text = ' '.join([text for _, text in parsed_cv_data]) if parsed_cv_data else ''

    # Step 5: Generate embeddings using Gensim Doc2Vec
    # A fixed seed with a single worker keeps training order deterministic;
    # full reproducibility may additionally require PYTHONHASHSEED to be set.
    # NOTE(review): the model is trained on just these two documents.
    documents = [TaggedDocument(parsed_cv_text.split(), [0]), TaggedDocument(jd_text.split(), [1])]
    model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, workers=1, seed=seed)
    cv_vector = model.dv[0]
    jd_vector = model.dv[1]

    # Step 6: Generate embeddings using Sentence Transformers
    if sentence_model is None:
        sentence_model = SentenceTransformer('bert-base-nli-mean-tokens')
    cv_embeddings = sentence_model.encode([parsed_cv_text])
    jd_embeddings = sentence_model.encode([jd_text])

    # Step 7: Calculate cosine similarity scores
    gensim_cosine_sim = calculate_cosine_similarity(cv_vector, jd_vector)
    sentence_transformers_cosine_sim = calculate_cosine_similarity(cv_embeddings, jd_embeddings)

    # Step 8: Final matching score
    matching_score = (gensim_cosine_sim + sentence_transformers_cosine_sim) / 2

    return matching_score


# **Evaluating the model**

In [None]:
import spacy
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Function to load the trained model
def load_model(model_path):
    """Load and return the spaCy pipeline stored at ``model_path``."""
    return spacy.load(model_path)

# Function to load test data
def load_test_data(test_data_path):
    """Load gold entities from a Dataturks-style JSON file for evaluation.

    Spans are converted the same way the training converter in this notebook
    does it: the inclusive ``end`` offset of the first ``points`` entry
    becomes exclusive (``end + 1``) and surrounding whitespace is trimmed, so
    the offsets are directly comparable with ``ent.start_char`` /
    ``ent.end_char`` of a spaCy prediction.

    Args:
        test_data_path: Path of the JSON file (a list of items with
            ``content`` and ``annotation`` keys).

    Returns:
        A list of ``(text, [(start, end, label), ...])`` tuples. Items whose
        ``annotation`` is missing or ``None`` get an empty entity list.
    """
    import json
    with open(test_data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    examples = []
    for entry in data:
        text = entry['content']
        entities = []
        for annot in entry.get('annotation') or []:
            label = annot['label']
            if isinstance(label, list):
                if not label:
                    continue  # annotation without any label
                label = label[0]  # only the first label is evaluated
            points = annot['points'][0]
            # Inclusive end -> exclusive end, clamped the way slicing would.
            raw_start, raw_end, _ = slice(points['start'], points['end'] + 1).indices(len(text))
            raw = text[raw_start:raw_end]
            stripped = raw.strip()
            if not stripped:
                continue  # whitespace-only span
            start = raw_start + (len(raw) - len(raw.lstrip()))
            end = start + len(stripped)
            entities.append((start, end, label))
        examples.append((text, entities))
    return examples

# Function to test and evaluate the model
def test_and_evaluate(nlp, test_data_path):
    examples = load_test_data(test_data_path)
    entity_stats = {}
    overall_y_true = []
    overall_y_pred = []

    for text, annot in examples:
        doc_to_test = nlp(text)

        for ent in doc_to_test.ents:
            entity_stats.setdefault(ent.label_, [0, 0, 0, 0, 0, 0])

        y_true = []
        y_pred = []
        for start, end, label in annot:
            found = False
            for ent in doc_to_test.ents:
                if ent.start_char == start and ent.end_char == end and ent.label_ == label:
                    y_true.append(ent.label_)
                    y_pred.append(ent.label_)
                    found = True
                    break
            if not found:
                y_true.append(label)
                y_pred.append('Not' + label)

        overall_y_true.extend(y_true)
        overall_y_pred.extend(y_pred)

        for entity_type in entity_stats:
            true_positive = 0
            false_positive = 0
            false_negative = 0
            for i in range(len(y_true)):
                if y_true[i] == entity_type and y_pred[i] == entity_type:
                    true_positive += 1
                elif y_true[i] == entity_type and y_pred[i] != entity_type:
                    false_negative += 1
                elif y_true[i] != entity_type and y_pred[i] == entity_type:
                    false_positive += 1

            if true_positive + false_positive > 0:
                precision = true_positive / (true_positive + false_positive)
            else:
                precision = 0

            if true_positive + false_negative > 0:
                recall = true_positive / (true_positive + false_negative)
            else:
                recall = 0

            if precision + recall > 0:
                f1_score = 2 * precision * recall / (precision + recall)
            else:
                f1_score = 0

            accuracy = (true_positive + (len(y_true) - (true_positive + false_positive + false_negative))) / len(y_true)

            entity_stats[entity_type][0] = 1
            entity_stats[entity_type][1] += precision
            entity_stats[entity_type][2] += recall
            entity_stats[entity_type][3] += f1_score
            entity_stats[entity_type][4] += accuracy
            entity_stats[entity_type][5] += 1

    entity_accuracies = []
    for entity_type, stats in entity_stats.items():
        avg_accuracy = stats[4] / stats[5]
        entity_accuracies.append(avg_accuracy)
        print(f"\nFor Entity {entity_type}\n")
        print(f"Accuracy : {avg_accuracy * 100}%")
        print(f"Precision : {stats[1] / stats[5]}")
        print(f"Recall : {stats[2] / stats[5]}")
        print(f"F-score : {stats[3] / stats[5]}")

    overall_accuracy = sum(entity_accuracies) / len(entity_accuracies)

    print("\nOverall Evaluation Report\n")
    print(f"Overall Accuracy: {overall_accuracy * 100}%")

# Example usage: evaluate the fine-tuned model on the held-out test file.
# Both paths point into a mounted Google Drive folder.
model_path = "/content/drive/MyDrive/CV_Parser/cv_parsing_model_fineTuned2"
test_data_path = "/content/drive/MyDrive/CV_Parser/testdata.json"

# Load the trained model (rebinds the notebook-level name `nlp`)
nlp = load_model(model_path)

# Test and evaluate the model; the metrics are printed, nothing is returned
test_and_evaluate(nlp, test_data_path)




For Entity Name

Accuracy : 94.1474471509705%
Precision : 0.0
Recall : 0.0
F-score : 0.0

For Entity Designation

Accuracy : 88.81890860463179%
Precision : 0.3
Recall : 0.2666666666666667
F-score : 0.275

For Entity Companies worked at

Accuracy : 88.54095781333129%
Precision : 0.15
Recall : 0.1125
F-score : 0.12

For Entity Location

Accuracy : 87.09622674255195%
Precision : 0.1
Recall : 0.05
F-score : 0.06666666666666667

For Entity Email Address

Accuracy : 95.04805194805193%
Precision : 0.3
Recall : 0.3
F-score : 0.3

For Entity Degree

Accuracy : 93.29132185847087%
Precision : 0.25
Recall : 0.2
F-score : 0.21666666666666665

For Entity College Name

Accuracy : 89.86533394683718%
Precision : 0.3
Recall : 0.16666666666666669
F-score : 0.1983333333333333

For Entity Skills

Accuracy : 86.58471187899609%
Precision : 0.1
Recall : 0.05
F-score : 0.06666666666666667

For Entity Graduation Year

Accuracy : 91.17645728286459%
Precision : 0.05263157894736842
Recall : 0.02631578947368421
F-