In [None]:
%pip install PyPDF2

In [None]:
import os
import json
from pathlib import Path
from PyPDF2 import PdfReader

def extract_chunks_from_pdf(pdf_path, chunk_size=1000, overlap=100):
    """Extract the text of a PDF and split it into overlapping character chunks.

    Args:
        pdf_path: Location of the PDF file, as ``str`` or ``pathlib.Path``.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by two consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of ``{"text": ..., "source": ...}`` dicts, where ``source`` is
        the file name without its extension. Chunks that are empty after
        stripping whitespace are skipped.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # With overlap >= chunk_size the window would never advance (infinite loop).
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    # Accept plain strings too; `.stem` below needs a Path object.
    pdf_path = Path(pdf_path)
    reader = PdfReader(str(pdf_path))

    # Collect page texts and join once instead of growing a string with `+=`.
    page_texts = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            page_texts.append(text + "\n")
    full_text = "".join(page_texts)

    chunks = []
    step = chunk_size - overlap
    start = 0
    while start < len(full_text):
        end = min(start + chunk_size, len(full_text))
        chunk_text = full_text[start:end].strip()
        if chunk_text:
            chunks.append({
                "text": chunk_text,
                "source": pdf_path.stem
            })
        # Once the window reaches the end of the text, stop: another iteration
        # would only re-emit a tail already contained in the chunk just written.
        if end == len(full_text):
            break
        start += step
    return chunks

def process_pdf_folder(input_dir, output_file):
    """Chunk every ``*.pdf`` file in a folder and write the chunks as JSONL.

    Args:
        input_dir: Folder that is searched (non-recursively) for ``*.pdf`` files.
        output_file: Path of the JSONL file to create or overwrite; one JSON
            object per chunk is written per line.

    Raises:
        NotADirectoryError: If ``input_dir`` is not an existing directory.
    """
    input_dir = Path(input_dir)
    # A mistyped folder would otherwise silently produce an empty output file.
    if not input_dir.is_dir():
        raise NotADirectoryError(f"Input folder not found: {input_dir}")

    all_chunks = []

    # glob() yields entries in filesystem order; sorting keeps the output
    # reproducible between runs and machines.
    for pdf_file in sorted(input_dir.glob("*.pdf")):
        chunks = extract_chunks_from_pdf(pdf_file)
        all_chunks.extend(chunks)
        print(f"Processed {pdf_file.name} -> {len(chunks)} chunks")

    # The output is only opened after every PDF was extracted, so a failure
    # during extraction does not leave a truncated file behind.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(chunk, ensure_ascii=False) + "\n" for chunk in all_chunks
        )

    print(f"All PDFs processed and saved to {output_file}")


# Chunk every PDF found in the "RAG data" folder and write the result to pdf_data.jsonl.
process_pdf_folder("RAG data", "pdf_data.jsonl")


In [None]:
%pip install bs4

In [None]:
import csv
import json

from bs4 import BeautifulSoup

def clean_html(raw_html):
    """Strip HTML markup from a string and return the plain text.

    Args:
        raw_html: HTML fragment to clean. ``None`` and the empty string are
            accepted and yield ``""``.

    Returns:
        The text content with a newline between text nodes and with leading
        and trailing whitespace removed.
    """
    # csv.DictReader fills cells missing from a short row with None; treat
    # that (and empty input) as "no text" rather than handing it to the parser.
    if not raw_html:
        return ""
    return BeautifulSoup(raw_html, "html.parser").get_text(separator="\n").strip()


def process_forum_csv(input_csv, output_jsonl, source_name="forum_qa"):
    """Convert a forum question/answer CSV into JSONL records.

    Reads the ``questions_title``, ``questions_body`` and ``answers_body``
    columns of every row, strips their HTML with ``clean_html`` and writes one
    ``{"text": ..., "source": ...}`` JSON object per row.

    Args:
        input_csv: Path of the CSV file to read.
        output_jsonl: Path of the JSONL file to create or overwrite.
        source_name: Value stored in the ``source`` field of every record.
    """
    # utf-8-sig transparently skips a leading byte-order mark; with plain
    # utf-8 the BOM would become part of the first header name and that
    # column would silently come back empty.
    with open(input_csv, newline='', encoding='utf-8-sig') as csvfile, \
         open(output_jsonl, 'w', encoding='utf-8') as jsonlfile:

        reader = csv.DictReader(csvfile)
        count = 0
        for row in reader:
            # `or ""`: DictReader stores None for cells missing from a short
            # row, and the default passed to .get() does not cover that case.
            question_title = clean_html(row.get("questions_title") or "")
            question_body = clean_html(row.get("questions_body") or "")
            answer_body = clean_html(row.get("answers_body") or "")

            text = f"Question: {question_title}\n{question_body}\nAnswer: {answer_body}"

            record = {
                "text": text,
                "source": source_name
            }

            jsonlfile.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1

        print(f"Processed {count} records from {input_csv} into {output_jsonl}")


# Convert the CareerVillage forum export into forum_data.jsonl (default source label "forum_qa").
process_forum_csv("RAG data/CareerVillage_forum_qna_data.csv", "forum_data.jsonl")


In [None]:
import csv
import json

def process_qna_csv(input_csv, output_jsonl, source_name="role_qa"):
    """Convert a role-specific question/answer CSV into JSONL records.

    Reads the ``question``, ``answer`` and ``role`` columns of every row and
    writes one ``{"text": ..., "source": ...}`` JSON object per row.

    Args:
        input_csv: Path of the CSV file to read.
        output_jsonl: Path of the JSONL file to create or overwrite.
        source_name: Value stored in the ``source`` field of every record.
    """

    def _cell(row, column):
        # DictReader stores None for cells missing from a short row, so the
        # default of .get() is not enough to make .strip() safe.
        return (row.get(column) or "").strip()

    # utf-8-sig skips a leading byte-order mark that would otherwise corrupt
    # the first header name.
    with open(input_csv, newline='', encoding='utf-8-sig') as csvfile, \
         open(output_jsonl, 'w', encoding='utf-8') as jsonlfile:

        reader = csv.DictReader(csvfile)
        count = 0
        for row in reader:
            question = _cell(row, "question")
            answer = _cell(row, "answer")
            role = _cell(row, "role")

            text = f"Question: {question}\nAnswer: {answer}\nRelated Specialty: {role}"

            record = {
                "text": text,
                "source": source_name
            }

            jsonlfile.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1

        print(f"Processed {count} records from {input_csv} into {output_jsonl}")


# Convert the role Q&A CSV into role_qa_data.jsonl (default source label "role_qa").
process_qna_csv("RAG data/role_qa_data.csv", "role_qa_data.jsonl")


In [None]:
import csv
import json

def process_big5_as_single_text(input_csv, output_jsonl, source_name="big5_traits_table"):
    """Merge a Big Five traits CSV into a single JSONL record.

    Every row contributes a numbered section built from the ``Attributes``,
    ``Common traits`` and ``List of careers`` columns. All sections are
    concatenated after a fixed introductory sentence and written as one
    ``{"text": ..., "source": ...}`` JSON line.

    Args:
        input_csv: Path of the CSV file to read.
        output_jsonl: Path of the JSONL file to create or overwrite.
        source_name: Value stored in the ``source`` field of the record.
    """

    def _cell(row, column):
        # DictReader stores None for cells missing from a short row, so the
        # default of .get() is not enough to make .strip() safe.
        return (row.get(column) or "").strip()

    # Collect the pieces and join once instead of growing a string in the loop.
    sections = [
        "The Big Five personality traits are associated with common behavioral characteristics and suitable careers:\n\n"
    ]

    # utf-8-sig skips a leading byte-order mark that would otherwise corrupt
    # the first header name.
    with open(input_csv, newline='', encoding='utf-8-sig') as csvfile:
        reader = csv.DictReader(csvfile)
        for i, row in enumerate(reader, 1):
            trait = _cell(row, "Attributes")
            common_traits = _cell(row, "Common traits")
            careers = _cell(row, "List of careers")

            sections.append(
                f"{i}. {trait}\n"
                f"Common traits: {common_traits}\n"
                f"Suggested careers: {careers}\n\n"
            )

    record = {
        "text": "".join(sections).strip(),
        "source": source_name
    }

    with open(output_jsonl, 'w', encoding='utf-8') as jsonlfile:
        jsonlfile.write(json.dumps(record, ensure_ascii=False) + "\n")

    print("Single Big 5 record written to JSONL.")

# Collapse the Big Five CSV into the single record stored in big5_info.jsonl.
process_big5_as_single_text("RAG data/big_5_info.csv", "big5_info.jsonl")

In [None]:
import pandas as pd
import json


# The CSV must provide the columns 'Personality Traits', 'Attributes' and 'List of Careers'.
# Every column is read as text and empty cells are blanked: pandas would otherwise
# return float NaN for them and the .strip() calls below would raise AttributeError.
df = pd.read_csv("RAG data/MBTI_info.csv", dtype=str).fillna("")

# Intro paragraph (written once, as the first record of the output)
intro = {
    "text": (
        "The Myers-Briggs Type Indicator (MBTI) categorizes personalities into 16 distinct types based on four dimensions: "
        "Introversion vs. Extraversion, Sensing vs. Intuition, Thinking vs. Feeling, and Judging vs. Perceiving. "
        "Each personality type reflects unique cognitive preferences and behavioral traits, and can help individuals understand "
        "what kinds of careers or environments may suit them best."
    ),
    "source": "mbti_intro"
}

# Predefined mini-descriptions per type (short and generic), keyed by the four-letter code
descriptions = {
    "INTJ": "Strategic and independent thinkers who value knowledge, structure, and competence. Often seen as long-term planners.",
    "INTP": "Innovative and analytical problem solvers who enjoy exploring abstract theories and building logical systems.",
    "ENTJ": "Natural leaders who are decisive, assertive, and organized. They are driven to take charge and implement effective strategies.",
    "ENTP": "Curious, energetic, and intellectually agile individuals who thrive on new challenges and creative problem-solving.",
    "INFJ": "Idealistic, insightful, and compassionate. They seek deep meaning in relationships, ideas, and personal missions.",
    "INFP": "Empathetic, introspective, and imaginative people who are guided by strong inner values and creativity.",
    "ENFJ": "Charismatic leaders who are attuned to others' needs and dedicated to helping people and causes they care about.",
    "ENFP": "Energetic and optimistic individuals who enjoy inspiring others and exploring possibilities.",
    "ISTJ": "Responsible and detail-oriented, they value structure, tradition, and reliability.",
    "ISFJ": "Warm and conscientious caregivers who strive to protect and serve others quietly and thoughtfully.",
    "ESTJ": "Efficient and practical managers who enjoy organizing people and systems to get results.",
    "ESFJ": "Sociable and nurturing people who value harmony and are deeply loyal to those they care about.",
    "ISTP": "Observant, independent, and resourceful individuals who are often drawn to hands-on problem-solving.",
    "ISFP": "Gentle and adaptable artists who appreciate beauty and live according to their values.",
    "ESTP": "Energetic and action-oriented, they love challenges and excel at thinking on their feet.",
    "ESFP": "Spontaneous and friendly entertainers who bring enthusiasm and fun into their environment."
}

# Output list, starting with the intro record
output = [intro]

# Convert each row into a JSONL-compatible dict
for _, row in df.iterrows():
    code = row["Personality Traits"].strip()
    attributes = row["Attributes"].strip()
    careers = row["List of Careers"].strip()

    # Codes without a predefined description fall back to a generic sentence.
    description = descriptions.get(code, "This personality type has unique characteristics and strengths.")
    text = (
        f"MBTI Type: {code}\n"
        f"Full form: {attributes}\n"
        f"Description: {description}\n"
        f"Suggested careers: {careers}"
    )

    output.append({
        "text": text,
        "source": "mbti_traits_table"
    })

# Save to JSONL: one JSON object per line
with open("mbti_data.jsonl", "w", encoding="utf-8") as f:
    for entry in output:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")

print("MBTI data saved to mbti_data.jsonl")


In [None]:
import csv
import json

def process_career_skill_csv(input_csv, output_jsonl, source_name="career_skill"):
    """Convert a career/skill CSV into JSONL records.

    Reads the ``Career`` and ``Skill`` columns of every row and writes one
    ``{"text": ..., "source": ...}`` JSON object per row.

    Args:
        input_csv: Path of the CSV file to read.
        output_jsonl: Path of the JSONL file to create or overwrite.
        source_name: Value stored in the ``source`` field of every record.
    """

    def _cell(row, column):
        # DictReader stores None for cells missing from a short row, so the
        # default of .get() is not enough to make .strip() safe.
        return (row.get(column) or "").strip()

    # utf-8-sig skips a leading byte-order mark that would otherwise corrupt
    # the first header name.
    with open(input_csv, newline='', encoding='utf-8-sig') as csvfile, \
         open(output_jsonl, 'w', encoding='utf-8') as jsonlfile:

        reader = csv.DictReader(csvfile)
        count = 0
        for row in reader:
            career = _cell(row, "Career")
            skill = _cell(row, "Skill")

            text = f"Career: {career}\nRequired skills: {skill}"

            record = {
                "text": text,
                "source": source_name
            }

            jsonlfile.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1

        print(f"Processed {count} records from {input_csv} into {output_jsonl}")


# Convert the career/skill CSV into career_skill.jsonl (default source label "career_skill").
process_career_skill_csv("RAG data/Career_skill_data.csv", "career_skill.jsonl")


In [None]:
%pip install pandas openpyxl

In [None]:
import pandas as pd
import json

# Spreadsheet header -> label used in the generated text. The insertion order
# of this mapping is also the order in which the fields appear in each record.
column_mapping = {
    "Title EN": "Occupation",
    "Definition": "Definition",
    "Tasks include": "Typical Tasks",
    "Included occupations": "Included Occupations",
    "Excluded occupations": "Excluded Occupations",
    "Notes": "Notes"
}

xlsx_path = "ISCO_Structure_definitions.xlsx"
output_path = "isco_data.jsonl"
source_name = "ISCO_Structure_definitions.xlsx"

# Load the sheet, relabel its columns, keep only rows that have an occupation
# title, then blank out every remaining missing cell.
df = (
    pd.read_excel(xlsx_path, engine="openpyxl")
    .rename(columns=column_mapping)
    .loc[lambda frame: frame["Occupation"].notna()]
    .fillna("")
)

field_labels = list(column_mapping.values())

# One JSON object per occupation, one object per line.
with open(output_path, "w", encoding="utf-8") as jsonl_file:
    for _, row in df.iterrows():
        text = "\n".join(f"{label}: {row[label]}" for label in field_labels)
        record = {"text": text, "source": source_name}
        jsonl_file.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"✅ Done! Saved to {output_path}")
