In [None]:
!pip install datasets
!pip install transformers
!pip install pandas
!pip install torch

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the tokenizer and token-classification model for the
# knowledge-extraction checkpoint.
tokenizer_knowledge = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
model_knowledge = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")

# Same tokenizer/model pair for the skill-extraction checkpoint.
# NOTE(review): none of these four objects is referenced by the cells visible
# in this notebook -- the pipelines below are built from the checkpoint names
# directly. Confirm whether they are still needed.
tokenizer_skill = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
model_skill = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")

# Only the first 1000 rows of the "train" split are requested (split slice).
training_dataset = load_dataset("Appz7/jdgen", split="train[:1000]")

In [3]:
from transformers import pipeline
import pandas as pd
from datasets import load_dataset

# Build one pipeline per checkpoint; both are used by ner() below.
# No task name is passed, so the task is presumably inferred from the
# checkpoint's Hub metadata -- TODO confirm.
# aggregation_strategy="first" asks the pipeline to group sub-word tokens into
# word-level entities (see the transformers token-classification pipeline docs).
token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")


In [None]:
import re
import pandas as pd

def aggregate_span(results):
    """Merge consecutive spans that are separated by exactly one character.

    Two neighbouring entries are joined when the later one starts at
    ``previous["end"] + 1``. The merged entry keeps the first entry's fields,
    with ``"word"`` set to the two words joined by a single space and
    ``"end"`` moved to the later entry's end.

    Args:
        results: List of dicts, each with at least ``"word"``, ``"start"``
            and ``"end"`` keys, in the order they appear in the text.

    Returns:
        A new list of merged span dicts. An empty input yields an empty list.
        The input list and its dicts are left unmodified.
    """
    if not results:
        return []

    merged = []
    # Work on shallow copies so the caller's dicts are never mutated.
    current = dict(results[0])

    for span in results[1:]:
        if span["start"] == current["end"] + 1:
            # Directly adjacent (one separator character apart): extend.
            current["word"] += " " + span["word"]
            current["end"] = span["end"]
        else:
            merged.append(current)
            current = dict(span)

    merged.append(current)
    return merged

def ner(text, max_length=510):
    """Extract the skill and knowledge phrases mentioned in ``text``.

    The text is tokenized once with the skill pipeline's tokenizer and cut
    into windows of at most ``max_length`` token ids. Each window is decoded
    back to a string and passed through both ``token_skill_classifier`` and
    ``token_knowledge_classifier``; neighbouring spans in each output are
    merged with ``aggregate_span``.

    Args:
        text: Raw text to scan. ``None`` or an empty string yields an empty set.
        max_length: Maximum number of token ids per window. The default of 510
            presumably leaves room for two special tokens in a 512-token
            model input -- TODO confirm against the checkpoints' limits.

    Returns:
        set[str]: Every ``"word"`` value reported by either classifier.
    """
    if not text:
        return set()

    tokenizer = token_skill_classifier.tokenizer
    # Truncation must be off here: truncating before windowing would discard
    # everything past the tokenizer's maximum length, so the tail of a long
    # text would never reach the classifiers. The windows below keep each
    # model input short instead.
    input_ids = tokenizer.encode(text, truncation=False, add_special_tokens=False)
    input_chunks = [input_ids[i:i + max_length] for i in range(0, len(input_ids), max_length)]

    all_skills = set()
    all_knowledge = set()

    for chunk_ids in input_chunks:
        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
        output_skills = token_skill_classifier(chunk_text)
        output_knowledge = token_knowledge_classifier(chunk_text)

        # Only merge when something was found; an empty output stays as is.
        if len(output_skills) > 0:
            output_skills = aggregate_span(output_skills)
        if len(output_knowledge) > 0:
            output_knowledge = aggregate_span(output_knowledge)

        all_skills.update(res["word"] for res in output_skills)
        all_knowledge.update(res["word"] for res in output_knowledge)

    # A set union already collapses phrases reported by both classifiers.
    return all_skills | all_knowledge

def process_training_dataset(training_dataset):
    """Run ``ner`` over every row and collect the results in a DataFrame.

    Args:
        training_dataset: Iterable of rows, each indexable by the keys
            ``'skills'`` and ``'gpt_response'``.

    Returns:
        pandas.DataFrame with one row per input row and four columns: the two
        original texts ("Skills", "GPT Response") and, for each of them, the
        extracted phrases sorted and joined with ", " ("Cleaned Skills",
        "Cleaned GPT Response Skills").
    """
    def _as_cell(phrases):
        # Sorted so the joined string does not depend on set iteration order.
        return ", ".join(sorted(phrases))

    records = []
    for example in training_dataset:
        skill_phrases = ner(example['skills'])
        response_phrases = ner(example['gpt_response'])
        record = {
            "Skills": example['skills'],
            "GPT Response": example['gpt_response'],
            "Cleaned Skills": _as_cell(skill_phrases),
            "Cleaned GPT Response Skills": _as_cell(response_phrases),
        }
        records.append(record)

    return pd.DataFrame(records)


# Runs both classifiers over the 'skills' and 'gpt_response' text of every
# loaded row (via ner), so this is the expensive step of the notebook.
training_df = process_training_dataset(training_dataset)

def calculate_percentage_and_matched_keywords(df):
    """Add keyword-overlap columns to ``df`` in place.

    For every row, the ", "-separated phrases in 'Cleaned Skills' are compared
    with those in 'Cleaned GPT Response Skills'.

    Args:
        df: DataFrame with string columns 'Cleaned Skills' and
            'Cleaned GPT Response Skills'.

    Returns:
        None. Two columns are written to ``df``:
        'Percentage Match' -- share (0-100) of the row's cleaned skills that
        also appear in the GPT response; 0 when the row has no cleaned skills.
        'Matched Keywords' -- the shared phrases, sorted and joined by ", ".
    """
    def _parse_keywords(cell):
        # "".split(", ") returns [""], so empty fragments must be dropped;
        # otherwise a row with no keywords would look like it contains one
        # (empty) keyword and could report a spurious 100% match.
        return {keyword for keyword in cell.split(", ") if keyword}

    percentages = []
    matched_keywords_list = []

    for skills_cell, response_cell in zip(df['Cleaned Skills'], df['Cleaned GPT Response Skills']):
        cleaned_skills_set = _parse_keywords(skills_cell)
        cleaned_gpt_response_set = _parse_keywords(response_cell)
        matched_keywords = cleaned_skills_set & cleaned_gpt_response_set
        matched_keywords_list.append(", ".join(sorted(matched_keywords)))

        if cleaned_skills_set:
            percentage = (len(matched_keywords) / len(cleaned_skills_set)) * 100
        else:
            # Nothing to match against: define the overlap as 0%.
            percentage = 0.0

        percentages.append(percentage)

    df['Percentage Match'] = percentages
    df['Matched Keywords'] = matched_keywords_list

# Mutates training_df in place: adds the 'Percentage Match' and
# 'Matched Keywords' columns. The call itself returns None.
calculate_percentage_and_matched_keywords(training_df)