In [74]:
import pandas as pd
import spacy
import ast

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

# Load the dataset
def load_dataset(file_path):
    """Read the career dataset CSV and decode its skill columns into lists.

    The 'hard_skill' and 'soft_skill' columns are stored in the CSV as the
    textual form of Python lists (e.g. "['python', 'sql']"); each cell is
    decoded with ``ast.literal_eval``, which only evaluates literals and
    never executes code.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        The loaded DataFrame with 'hard_skill' and 'soft_skill' holding the
        decoded values. Missing or blank cells become an empty list.

    Raises:
        KeyError: If either skill column is absent from the CSV.
        ValueError, SyntaxError: If a non-blank cell is not a valid literal.
    """
    def _parse_skill_cell(cell):
        # pandas reads an empty CSV field as NaN (a float), which
        # literal_eval rejects; treat "no value" as "no skills".
        if not isinstance(cell, str) or not cell.strip():
            return []
        return ast.literal_eval(cell)

    df = pd.read_csv(file_path)
    for column in ('hard_skill', 'soft_skill'):
        df[column] = df[column].apply(_parse_skill_cell)
    return df

# Function to extract skills from user input using NLP
def extract_skills(text, dataset):
    """Find skills from the dataset's vocabulary that are mentioned in text.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. Two kinds of candidates are compared against the dataset's
    skills by exact string equality: each token's lemma, and the text of
    each noun chunk (which is how multi-word skills can be picked up).

    Args:
        text: Free-form user input.
        dataset: DataFrame whose 'hard_skill' and 'soft_skill' columns hold
            an iterable of skill strings per row.

    Returns:
        A ``(hard_skills, soft_skills)`` tuple of lists. Lemma matches come
        first, then noun-chunk matches; each skill appears at most once, in
        first-seen order.
    """
    doc = nlp(text.lower())

    # Vocabulary of every skill listed anywhere in the dataset.
    all_hard_skills = {
        skill for skills in dataset['hard_skill'] for skill in skills
    }
    all_soft_skills = {
        skill for skills in dataset['soft_skill'] for skill in skills
    }

    # Token lemmas first, then noun-chunk texts, mirroring the match order.
    candidates = [token.lemma_ for token in doc]
    candidates.extend(chunk.text for chunk in doc.noun_chunks)

    # A one-word noun chunk can equal its own token's lemma, so the same
    # skill may show up twice among the candidates; dict.fromkeys drops the
    # repeats while keeping first-seen order.
    extracted_hard_skills = list(
        dict.fromkeys(c for c in candidates if c in all_hard_skills)
    )
    extracted_soft_skills = list(
        dict.fromkeys(c for c in candidates if c in all_soft_skills)
    )

    return extracted_hard_skills, extracted_soft_skills

# Function to match skills to a career field
def match_career(hard_skills, soft_skills, dataset):
    """Pick the career field whose listed skills overlap most with the input.

    Each dataset row is scored by the number of distinct hard skills plus
    the number of distinct soft skills it shares with the arguments. Only
    rows whose 'label' equals 0 are eligible.

    Args:
        hard_skills: Iterable of hard-skill strings to match.
        soft_skills: Iterable of soft-skill strings to match.
        dataset: DataFrame with 'candidate_field', 'label', 'hard_skill'
            and 'soft_skill' columns; the skill columns hold an iterable of
            strings per row.

    Returns:
        The 'candidate_field' of the first eligible row with the highest
        score, or the string "No suitable field found" when no eligible
        row shares at least one skill with the input.
    """
    # Loop-invariant: build the lookup sets once rather than per row.
    wanted_hard = set(hard_skills)
    wanted_soft = set(soft_skills)

    best_match = None
    # Start at 0 (not -1) so a row must share at least one skill to win;
    # otherwise the first label-0 row would be returned even with no
    # overlap and the "no suitable field" fallback could never trigger.
    max_score = 0

    rows = zip(
        dataset['candidate_field'],
        dataset['label'],
        dataset['hard_skill'],
        dataset['soft_skill'],
    )
    for candidate_field, label, row_hard, row_soft in rows:
        if label != 0:
            continue

        total_score = (
            len(wanted_hard.intersection(row_hard))
            + len(wanted_soft.intersection(row_soft))
        )

        # Strict comparison keeps the earliest row on ties.
        if total_score > max_score:
            max_score = total_score
            best_match = candidate_field

    return best_match if best_match else "No suitable field found"

# Main function to run the career guidance system
def run_career_guidance(input_text, dataset_path="Career Guidance Expert System.csv"):
    """Run the full pipeline: load data, extract skills, recommend a field.

    Args:
        input_text: Free-form description of the user's skills.
        dataset_path: Location of the dataset CSV passed to ``load_dataset``.

    Returns:
        A dict with the keys 'recommended_field', 'extracted_hard_skills'
        and 'extracted_soft_skills'.
    """
    career_data = load_dataset(dataset_path)

    # The same dataset supplies both the skill vocabulary and the fields.
    found_hard, found_soft = extract_skills(input_text, career_data)
    field = match_career(found_hard, found_soft, career_data)

    return {
        "recommended_field": field,
        "extracted_hard_skills": found_hard,
        "extracted_soft_skills": found_soft,
    }

# Example usage
if __name__ == "__main__":
    # Alternative sample inputs, with the field the author recorded for each:
    #   "I have experience in nursing, registration, and service. I am also
    #    good at written communication."            -> healthcare & medical
    #   "I'm good at business , credit control and e-commerce, and good at
    #    management and planning."                  -> accounting
    input_text = "I'm good with computers, foods, customer service."  # -> retail
    result = run_career_guidance(input_text)

    # Header first, then one line per entry of the result dict.
    print("Career Guidance Recommendation:")
    report_lines = (
        f"Recommended Field: {result['recommended_field']}",
        f"Extracted Hard Skills: {result['extracted_hard_skills']}",
        f"Extracted Soft Skills: {result['extracted_soft_skills']}",
    )
    for line in report_lines:
        print(line)

Career Guidance Recommendation:
Recommended Field: retail & consumer products
Extracted Hard Skills: ['service', 'computers', 'foods']
Extracted Soft Skills: ['customer service']
