<a href="https://colab.research.google.com/github/AbuMulla-Mohammad/CVSegmentation2/blob/main/DocumentSegmentation2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install cohere
!pip install instructor
!pip install docx
!pip install instructor
!pip install generativeai
!pip install python-docx
!pip install sentence-transformers transformers

Collecting cohere
  Downloading cohere-5.8.1-py3-none-any.whl.metadata (3.4 kB)
Collecting boto3<2.0.0,>=1.34.0 (from cohere)
  Downloading boto3-1.35.0-py3-none-any.whl.metadata (6.6 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting httpx>=0.21.2 (from cohere)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting parameterized<0.10.0,>=0.9.0 (from cohere)
  Downloading parameterized-0.9.0-py2.py3-none-any.whl.metadata (18 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.0.20240712-py3-none-any.whl.metadata (1.9 kB)
Collecting botocore<1.36.0,>=1.35.0 (from boto3<2.0.0,>=1.34.0->cohere)
  Downloading botocore-1.35.0-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>

In [None]:

import cohere
from docx import Document
import os
from transformers import pipeline
from pydantic import BaseModel, Field
from typing import List
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
import instructor

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


In [None]:
# Pydantic model describing one section of a document; it is the element type
# of `StructuredDocument.sections`. Each field carries a `description` passed
# to pydantic's `Field`.
# NOTE(review): start_index/end_index are consumed by get_sections_text via
# range(start_index, end_index), i.e. end_index is treated as exclusive there.
class Section(BaseModel):
    title: str = Field(description="Main topic of this section of the document")
    start_index: int = Field(description="Line number where the section begins")
    end_index: int = Field(description="Line number where the section ends")
    category: str = Field(description="The Category of this section")
    content: str = Field(description="The actual content of this section")

    def matches(self, job_application_section: str) -> bool:
        """Return True when `job_application_section` occurs inside `content`.

        The check is a case-insensitive substring test (both sides are
        lower-cased before comparison).
        """
        return job_application_section.lower() in self.content.lower()
# Top-level response model: get_structured_document requests it through
# `response_model=StructuredDocument`.
# NOTE(review): the class docstring is presumably forwarded to the LLM as part
# of the schema description - confirm before rewording it.
class StructuredDocument(BaseModel):
    """Obtains meaningful sections, each centered around a single concept/topic."""
    sections: List[Section] = Field(description="A list of sections of the document")

In [None]:
def calculate_similarity_TF_IDF(segments, job_sections):
    """Score every CV segment against every job section using TF-IDF cosine similarity.

    A single TF-IDF vocabulary is fitted on the CV and job texts together so
    that both sides live in the same vector space.

    Args:
        segments: sequence of dicts with 'title' and 'content' keys (CV side).
        job_sections: sequence of dicts with 'title' and 'content' keys (job side).

    Returns:
        A list with one dict per (segment, job section) pair, ordered by
        segment then job section. Each dict holds "User_Skills" (index into
        `segments`), "Job_Requirements" (index into `job_sections`) and
        "Match_Score" (cosine similarity multiplied by 100).
        An empty list is returned when either input is empty.
    """
    # Nothing to compare: return early rather than handing an empty corpus
    # to scikit-learn.
    if len(segments) == 0 or len(job_sections) == 0:
        return []

    cv_texts = [segment['content'] for segment in segments]
    job_texts = [job['content'] for job in job_sections]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(cv_texts + job_texts)
    # The CV rows come first in the fitted matrix, the job rows follow.
    cv_vectors = tfidf_matrix[:len(cv_texts)]
    job_vectors = tfidf_matrix[len(cv_texts):]

    similarity_matrix = cosine_similarity(cv_vectors, job_vectors)

    similarities = []
    for i, segment in enumerate(segments):
        print(f"Similarity for CV Section '{segment['title']}':")
        for j, job_section in enumerate(job_sections):
            similarity = similarity_matrix[i][j] * 100  # Convert to percentage
            similarities.append({
                "User_Skills": i,
                "Job_Requirements": j,
                "Match_Score": similarity
            })
            print(f"  - With Job Section '{job_section['title']}': {similarity:.2f}%")
        print("-" * 40)

    return similarities

In [None]:
def compute_similarity_between_sections(cv_sections, job_sections):
    """Score every CV section against every job section with sentence embeddings.

    Texts are embedded with the 'sentence-transformers/all-MiniLM-L6-v2' model
    and compared by cosine similarity.

    Args:
        cv_sections: sequence of dicts with 'title' and 'content' keys.
        job_sections: sequence of dicts with 'title' and 'content' keys.

    Returns:
        A list with one dict per (CV section, job section) pair, ordered by CV
        section then job section. Each dict holds "User_Skills" (index into
        `cv_sections`), "Job_Requirements" (index into `job_sections`) and
        "Match_Score" (cosine similarity multiplied by 100).
        An empty list is returned when either input is empty.
    """
    # Skip loading the model entirely when there is nothing to compare.
    if len(cv_sections) == 0 or len(job_sections) == 0:
        return []

    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    cv_texts = [section['content'] for section in cv_sections]
    job_texts = [section['content'] for section in job_sections]
    cv_embeddings = model.encode(cv_texts, convert_to_tensor=True)
    job_embeddings = model.encode(job_texts, convert_to_tensor=True)

    # One batched call yields the full (len(cv) x len(job)) cosine matrix,
    # replacing a separate call for every individual pair.
    similarity_matrix = util.pytorch_cos_sim(cv_embeddings, job_embeddings).tolist()

    similarities = []
    for i, section in enumerate(cv_sections):
        print(f"Similarity for CV Section '{section['title']}':")
        for j, job_section in enumerate(job_sections):
            similarity = similarity_matrix[i][j] * 100  # Convert to percentage
            similarities.append({
                "User_Skills": i,
                "Job_Requirements": j,
                "Match_Score": similarity
            })
            print(f"  - With Job Section '{job_section['title']}': {similarity:.2f}%")
        print("-" * 40)

    return similarities

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def compute_similarity_between_sections_using_distilbert(cv_sections, job_sections):
    """Score every CV section against every job section with DistilBERT embeddings.

    Each text is embedded as the mean of its token vectors from
    'distilbert-base-uncased'; pairs are compared by cosine similarity.

    Args:
        cv_sections: sequence of dicts with 'title' and 'content' keys.
        job_sections: sequence of dicts with 'title' and 'content' keys.

    Returns:
        A list with one dict per (CV section, job section) pair, ordered by CV
        section then job section. Each dict holds "User_Skills" (index into
        `cv_sections`), "Job_Requirements" (index into `job_sections`) and
        "Match_Score" (cosine similarity multiplied by 100).
        An empty list is returned when either input is empty.
    """
    # Skip loading the tokenizer/model when there is nothing to compare.
    if len(cv_sections) == 0 or len(job_sections) == 0:
        return []

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertModel.from_pretrained('distilbert-base-uncased')

    def encode_texts(texts, tokenizer, model):
        """Return one mean-pooled embedding per text, ignoring padding tokens."""
        inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        token_vectors = outputs.last_hidden_state
        # Texts are padded to a common length; weight every token by the
        # attention mask so padding positions do not dilute the average.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
        summed = (token_vectors * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
        return summed / token_counts

    cv_texts = [section['content'] for section in cv_sections]
    job_texts = [section['content'] for section in job_sections]

    cv_embeddings = encode_texts(cv_texts, tokenizer, model)
    job_embeddings = encode_texts(job_texts, tokenizer, model)

    similarity_matrix = cosine_similarity(cv_embeddings.numpy(), job_embeddings.numpy())

    similarities = []
    for i, section in enumerate(cv_sections):
        print(f"Similarity for CV Section '{section['title']}':")
        for j, job_section in enumerate(job_sections):
            similarity = similarity_matrix[i][j] * 100  # Convert to percentage
            similarities.append({
                "User_Skills": i,
                "Job_Requirements": j,
                "Match_Score": similarity
            })
            print(f"  - With Job Section '{job_section['title']}': {similarity:.2f}%")
        print("-" * 40)

    return similarities

In [None]:
import numpy as np

def calculate_squared_error(predicted_similarities, target_similarities):
    """Return the element-wise squared error between predicted and target scores.

    Args:
        predicted_similarities: sequence of dicts. The score is read from the
            'similarity_score' key when present, otherwise from 'Match_Score'
            (the key produced by the similarity functions in this notebook).
        target_similarities: array-like of target scores, aligned by position
            with `predicted_similarities`.

    Returns:
        A numpy array holding (predicted - target) ** 2 for every position.

    Raises:
        KeyError: if an entry has neither 'similarity_score' nor 'Match_Score'.
    """
    def _score(entry):
        # 'similarity_score' keeps precedence so existing callers are unaffected.
        if 'similarity_score' in entry:
            return entry['similarity_score']
        return entry['Match_Score']

    predicted_scores = np.array([_score(sim) for sim in predicted_similarities])
    target_scores = np.array(target_similarities)
    squared_errors = np.square(predicted_scores - target_scores)
    return squared_errors


In [None]:

from sklearn.metrics import mean_squared_error

def calculate_mse(predicted_similarities, true_similarities):
    """Compute the mean squared error over the keys both mappings share.

    Args:
        predicted_similarities: mapping of key -> predicted score.
        true_similarities: mapping of key -> ground-truth score.

    Returns:
        The value of `mean_squared_error` computed on the scores whose key
        appears in both mappings (iterated in `true_similarities` order).

    Raises:
        ValueError: if the two mappings have no key in common, since an MSE
            over zero samples is undefined.
    """
    shared_keys = [key for key in true_similarities if key in predicted_similarities]
    if not shared_keys:
        raise ValueError(
            "Cannot compute MSE: predicted_similarities and true_similarities share no keys."
        )

    y_true = [true_similarities[key] for key in shared_keys]
    y_pred = [predicted_similarities[key] for key in shared_keys]
    return mean_squared_error(y_true, y_pred)




In [34]:
# Read the Cohere API key from the environment instead of embedding it in the
# notebook. NOTE: a literal key was previously committed on this line; treat
# that key as compromised and rotate it.
api_key = os.environ.get("COHERE_API_KEY") or os.environ.get("CO_API_KEY")
if not api_key:
    raise ValueError(
        "API key for Cohere is not set: define the COHERE_API_KEY (or CO_API_KEY) environment variable."
    )

# Initialize the Cohere client with the API key
client = cohere.Client(api_key)

# Apply the patch to the Cohere client
client = instructor.from_cohere(client)

# System message sent with every request in get_structured_document. It asks
# the model to split a CV into categorized sections and to report each
# section's boundaries using the bracketed line numbers that doc_with_lines
# adds. The text is runtime input to the model, so it is kept verbatim.
system_prompt = """
You are a skilled resume analyst tasked with organizing a CV.
Read the document below and extract a StructuredDocument object from it where each section of the CV is clearly defined, categorized, and labeled with an appropriate category, the category must be one of these(Personal Information, Objective or Summary, Education, Work Experience, Skills, Languages, Certifications, Projects, Achievements, Publications, Conferences and Seminars, Volunteer Experience, Professional Memberships, Hobbies and Interests, References).
Each section should represent a distinct part of the CV, such as Education, Experience, Skills, etc.
Your task is to identify the start, end, and the category of each section using the line numbers provided in square brackets (e.g., [1], [2], [3], etc.), and assign a category label to each section based on its content.
Ensure to analyze and extract information from tables as well, as they may contain crucial details about Education, Experience, Skills, or other sections.
Note: that some sections maybe the title in the same line with its content, i want you to take care with it .
"""

# Document Processing Functions
def doc_with_lines(document: str):
    """Number the lines of `document` and index them.

    The text is split on "\\n"; every resulting line gets a zero-based index.

    Returns:
        A 2-tuple of
        - the text rebuilt with each line rendered as "[<index>] <line>" and
          terminated by a newline, and
        - a dict mapping each index to the original line text.
    """
    line2text = dict(enumerate(document.split("\n")))
    numbered_text = "".join(
        f"[{index}] {text}\n" for index, text in line2text.items()
    )
    return numbered_text, line2text

def get_structured_document(document_with_line_numbers: str) -> StructuredDocument:
    """Ask the patched Cohere client to segment a line-numbered document.

    The module-level `system_prompt` is sent as the system message and
    `document_with_line_numbers` as the user message; the reply is requested
    as a `StructuredDocument` via `response_model`.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": document_with_line_numbers},
    ]
    return client.chat.completions.create(
        model="command-r-plus",
        response_model=StructuredDocument,
        messages=conversation,
    )

def extract_text_from_docx(docx_path: str) -> str:
    """Extract the text of a .docx file, keeping paragraphs and tables in document order.

    Top-level paragraphs contribute one entry each. Every table row
    contributes one entry in which cells are separated by tabs and the
    paragraphs inside a cell are separated by newlines. Entries are joined
    with newlines.

    Args:
        docx_path: path of the .docx file to read.

    Returns:
        The extracted text as a single string.
    """
    doc = Document(docx_path)
    nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    lines = []
    for element in doc.element.body:
        if isinstance(element, CT_Tbl):
            # Use direct-child paths for rows and cells: a descendant search
            # ('.//w:tr', './/w:tc') would also match rows/cells of tables
            # nested inside a cell and emit their text more than once.
            for row in element.findall('w:tr', namespaces=nsmap):
                row_text = []
                for cell in row.findall('w:tc', namespaces=nsmap):
                    # Descendant search on purpose: paragraphs of a nested
                    # table are flattened into the enclosing cell exactly once.
                    paragraphs = cell.findall('.//w:p', namespaces=nsmap)
                    row_text.append("\n".join(p.text or '' for p in paragraphs))
                lines.append("\t".join(row_text))
        elif isinstance(element, CT_P):
            lines.append(element.text or '')
    return "\n".join(lines)

def get_sections_text(structured_doc: StructuredDocument, line2text: dict) -> list:
    """Materialize each section of `structured_doc` as a plain dict with its text.

    The content of a section is built from the lines numbered
    start_index .. end_index - 1 (the end index itself is not included);
    line numbers missing from `line2text` contribute an empty string.

    Returns:
        A list of dicts with keys "title", "content", "start", "end" and
        "category", one per section, in the original section order.
    """
    def _to_segment(sec):
        body_lines = (
            line2text.get(number, '')
            for number in range(sec.start_index, sec.end_index)
        )
        return {
            "title": sec.title,
            "content": "\n".join(body_lines),
            "start": sec.start_index,
            "end": sec.end_index,
            "category": sec.category
        }

    return [_to_segment(sec) for sec in structured_doc.sections]

def get_job_sections_from_user():
    """Interactively collect job sections from standard input.

    First asks how many sections to read, then prompts for a title and a
    content string for each one (numbered from 1 in the prompts).

    Returns:
        A list of dicts with keys 'idx' (zero-based position as a string),
        'title' and 'content'.
    """
    total = int(input("Enter the number of job sections: "))
    collected = []
    position = 0
    while position < total:
        label = position + 1
        section_title = input(f"Enter title for job section {label}: ")
        section_body = input(f"Enter content for job section {label}: ")
        collected.append(
            dict(idx=str(position), title=section_title, content=section_body)
        )
        position += 1
    return collected



# Main Script
# End-to-end run: extract the CV text, segment it with the LLM, collect job
# sections from the user, score CV vs. job sections with three models, then
# compare each model's scores against the ground-truth CSV via MSE.
docx_path = '/files/SanadAhmad-CV.docx'

try:
    document = extract_text_from_docx(docx_path)
    document_with_line_numbers, line2text = doc_with_lines(document)
    structured_doc = get_structured_document(document_with_line_numbers)
    cv_sections = get_sections_text(structured_doc, line2text)

    print("Structured Document:")
    print("\nSegments:")
    for section in cv_sections:
        print(f"Title: {section['title']}")
        print(f"Start: {section['start']}")
        print(f"End: {section['end']}")
        print(f"Category: {section['category']}")
        print("Content:")
        print(section['content'])
        print("-" * 40)

    job_sections = get_job_sections_from_user()

    # Run every model exactly once and keep its result: the same list is used
    # for the printed report and for the MSE computation below, so models are
    # not reloaded and the similarity report is not printed a second time.
    print("Model 1 _________________________________________________________________________________________________")
    sbert_results = compute_similarity_between_sections(cv_sections, job_sections)
    print("Model 2 _________________________________________________________________________________________________")
    tfidf_results = calculate_similarity_TF_IDF(cv_sections, job_sections)
    print("Model 3 _________________________________________________________________________________________________")
    distilbert_results = compute_similarity_between_sections_using_distilbert(cv_sections, job_sections)

    # Calculate MSE for each model
    import pandas as pd

    # Load the dataset
    df = pd.read_csv('/files/Job Datsset.csv')
    print(df)
    # Extract ground truth similarity scores (keyed by DataFrame row index)
    true_similarities = df['Match_Score'].to_dict()

    # NOTE(review): the predicted dicts are keyed by job-section index only, so
    # when there are several CV sections the score of the last one overwrites
    # the earlier ones, and these keys are matched against DataFrame row
    # indices in calculate_mse. Confirm this alignment against the dataset.
    predicted_similarities_tfidf = {
        item['Job_Requirements']: item['Match_Score']
        for item in tfidf_results
    }
    predicted_similarities_sbert = {
        item['Job_Requirements']: item['Match_Score']
        for item in sbert_results
    }
    predicted_similarities_distilbert = {
        item['Job_Requirements']: item['Match_Score']
        for item in distilbert_results
    }

    mse_distilbert = calculate_mse(predicted_similarities_distilbert, true_similarities)
    mse_tfidf = calculate_mse(predicted_similarities_tfidf, true_similarities)
    mse_sbert = calculate_mse(predicted_similarities_sbert, true_similarities)

    print(f"MSE for TF-IDF: {mse_tfidf:.4f}")
    print(f"MSE for Sentence Transformers: {mse_sbert:.4f}")
    print(f"MSE for DistilBERT: {mse_distilbert:.4f}")
except Exception as e:
    # Best-effort run: report the failure instead of aborting the cell.
    print(f"An error occurred: {e}")

Structured Document:

Segments:
Title: Personal Information
Start: 0
End: 5
Category: Personal Information
Content:
Sanad Ahmad      Mobile: 0594 55 80 90                                                                                    
                                           Email: sanad.ahmed2000@gmail.com

			Address: Jaba' – Jenin – Palestine
			Jobs:  IT Project Manager | AI Trainer
----------------------------------------
Title: Academic Qualifications
Start: 8
End: 15
Category: Education
Content:
** Academic Qualifications
Degree	Track	Institution	Year	GPA
Tawjihi	Scientific	Secondary School	2007	3.7
Bachelor	IT: Computer Science	Arab American University	2011	3.76
Master	Computer Science: Machine Learning	Arab American University	2021	4.00
Ph.D. Candidate	Artificial Intelligence:
Natural Language Processing - NLP	Birzeit University	2024	3.7
----------------------------------------
Title: Publications
Start: 16
End: 23
Category: Publications
Content:

** Publications
NLU-STR

KeyboardInterrupt: Interrupted by user