In [None]:
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import os
import json
from pathlib import Path
from PyPDF2 import PdfReader

def extract_chunks_from_pdf(pdf_path, chunk_size=1000, overlap=100):
    """Extract the text of a PDF and split it into overlapping character chunks.

    Args:
        pdf_path: Location of the PDF file, as a ``str`` or ``pathlib.Path``.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters shared by two consecutive chunks. Must be
            smaller than ``chunk_size`` so the chunk window moves forward.

    Returns:
        A list of ``{"text": ..., "source": ...}`` dicts in document order.
        ``source`` is the file name without its extension. Chunks that are
        empty after stripping surrounding whitespace are left out.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    # A non-positive step would keep the window in place (or move it backwards)
    # and the chunking loop would never finish, so reject it up front.
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "chunk_size must be positive and overlap must be smaller than chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    # Accept plain strings as well: `.stem` below needs a Path object.
    pdf_path = Path(pdf_path)
    reader = PdfReader(str(pdf_path))

    # Collect page texts and join once instead of growing a string page by page.
    page_texts = []
    for page in reader.pages:
        text = page.extract_text()
        if text:  # pages without extractable text are skipped
            page_texts.append(text + "\n")
    full_text = "".join(page_texts)

    source = pdf_path.stem
    chunks = []
    for start in range(0, len(full_text), step):
        # Slicing clamps at the end of the string, so the last chunk may be shorter.
        chunk_text = full_text[start:start + chunk_size].strip()
        if chunk_text:
            chunks.append({
                "text": chunk_text,
                "source": source
            })
    return chunks

def process_pdf_folder(input_dir, output_file):
    """Chunk every ``*.pdf`` file directly inside a folder into one JSONL file.

    Args:
        input_dir: Folder to scan (non-recursive) for ``*.pdf`` files.
        output_file: Path of the JSONL file to write. It is opened in write
            mode, so existing content is replaced.

    Each chunk returned by ``extract_chunks_from_pdf`` becomes one JSON object
    per line. Files are handled in sorted path order so repeated runs over the
    same folder produce the same record order.
    """
    input_dir = Path(input_dir)
    all_chunks = []

    # glob() does not promise any particular order; sort for reproducible output.
    for pdf_file in sorted(input_dir.glob("*.pdf")):
        chunks = extract_chunks_from_pdf(pdf_file)
        all_chunks.extend(chunks)
        print(f"Processed {pdf_file.name} -> {len(chunks)} chunks")

    # Written only after every PDF was processed, so a failure above leaves
    # any previous output file untouched.
    with open(output_file, "w", encoding="utf-8") as f:
        for chunk in all_chunks:
            f.write(json.dumps(chunk, ensure_ascii=False) + "\n")

    print(f"All PDFs processed and saved to {output_file}")


# Chunk the PDFs found in the "pdf" folder and write the result to pdf_data.jsonl.
process_pdf_folder(input_dir="pdf", output_file="pdf_data.jsonl")


Processed Skills4Dev.pdf -> 20 chunks
Processed oecd.pdf -> 62 chunks
Processed Personality-and-Career-Choice.pdf -> 38 chunks
Processed 2024_Report_unisef.pdf -> 152 chunks
Processed About_KZ_universities.pdf -> 34 chunks
Processed PersonalityandCareerChoice.pdf -> 39 chunks
Processed Passion-driver.pdf -> 104 chunks
Processed future_time.pdf -> 12 chunks
Processed FernandezTerrierKIMPersonalityisnostrangertooccupationalchoiceamong.pdf -> 72 chunks
Processed personality-occupation.pdf -> 70 chunks
Processed classification_education_fields.pdf -> 180 chunks
Processed Career Preparations of High School Students  in China Japan Sou.pdf -> 30 chunks
Processed mojes.pdf -> 24 chunks
Processed oecd-2.pdf -> 60 chunks
Processed ISCO.pdf -> 1578 chunks
Processed Mapping_International_Joint_Dual_Degrees.pdf -> 151 chunks
Processed Personality-software.pdf -> 42 chunks
Processed college-majors-academic-area-study-sat-sd.pdf -> 15 chunks
Processed 500_activities_for_studying_abroad.pdf -> 20 chu

In [None]:
%pip install bs4

Collecting bs4
  Using cached bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Using cached bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.3/187.3 kB[0m [31m887.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading soupsieve-2.7-py3-none-any.whl (36 kB)
[33mDEPRECATION: pytorch-lightning 1.7.7 has a non-standard dependency specifier torch>=1.9.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/1206

In [None]:
import csv
import json

from bs4 import BeautifulSoup

def clean_html(raw_html):
    """Return the visible text of an HTML fragment.

    Args:
        raw_html: HTML markup as a string. ``None`` and blank strings are
            accepted and produce ``""``.

    Returns:
        The text content with a newline between adjacent text nodes and
        surrounding whitespace stripped.
    """
    # csv.DictReader stores None for cells missing from a short row; answer
    # such input (and blank strings) directly instead of handing it to the parser.
    if raw_html is None or not raw_html.strip():
        return ""
    return BeautifulSoup(raw_html, "html.parser").get_text(separator="\n").strip()


def process_forum_csv(input_csv, output_jsonl, source_name="forum_qa"):
    """Convert a forum question/answer CSV into JSONL records.

    Args:
        input_csv: CSV file read with ``csv.DictReader``. The columns
            ``questions_title``, ``questions_body`` and ``answers_body`` are used.
        output_jsonl: Destination JSONL file (overwritten).
        source_name: Value stored in each record's ``"source"`` field.

    Every CSV row yields one line of the form
    ``{"text": "Question: <title>\\n<body>\\nAnswer: <answer>", "source": ...}``
    with HTML removed through ``clean_html``. Missing cells count as empty text.
    """
    count = 0
    with open(input_csv, newline='', encoding='utf-8') as csvfile, \
         open(output_jsonl, 'w', encoding='utf-8') as jsonlfile:

        for row in csv.DictReader(csvfile):
            # A row shorter than the header has None (not the .get default) in
            # its missing cells, so normalise to "" before cleaning.
            question_title = clean_html(row.get("questions_title") or "")
            question_body = clean_html(row.get("questions_body") or "")
            answer_body = clean_html(row.get("answers_body") or "")

            text = f"Question: {question_title}\n{question_body}\nAnswer: {answer_body}"

            record = {
                "text": text,
                "source": source_name
            }

            jsonlfile.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1

        print(f"Processed {count} records from {input_csv} into {output_jsonl}")


# Convert the CareerVillage forum export into forum_data.jsonl.
process_forum_csv(
    input_csv="RAG data/CareerVillage_forum_qna_data.csv",
    output_jsonl="forum_data.jsonl",
)


Processed 51123 records from RAG data/CareerVillage_forum_qna_data.csv into forum_data.jsonl


In [None]:
import csv
import json

def process_qna_csv(input_csv, output_jsonl, source_name="role_qa"):
    """Convert a role-specific Q&A CSV into JSONL records.

    Args:
        input_csv: CSV file read with ``csv.DictReader``. The columns
            ``question``, ``answer`` and ``role`` are used.
        output_jsonl: Destination JSONL file (overwritten).
        source_name: Value stored in each record's ``"source"`` field.

    Every CSV row yields one line whose ``"text"`` is
    ``"Question: <q>\\nAnswer: <a>\\nRelated Specialty: <role>"``.
    Missing cells count as empty text.
    """
    count = 0
    with open(input_csv, newline='', encoding='utf-8') as csvfile, \
         open(output_jsonl, 'w', encoding='utf-8') as jsonlfile:

        for row in csv.DictReader(csvfile):
            # DictReader puts None into cells missing from a short row, which
            # would make .strip() raise; treat those cells as empty strings.
            question = (row.get("question") or "").strip()
            answer = (row.get("answer") or "").strip()
            role = (row.get("role") or "").strip()

            text = f"Question: {question}\nAnswer: {answer}\nRelated Specialty: {role}"

            record = {
                "text": text,
                "source": source_name
            }

            jsonlfile.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1

        print(f"Processed {count} records from {input_csv} into {output_jsonl}")


# Convert the role Q&A table into role_qa_data.jsonl.
process_qna_csv(
    input_csv="RAG data/role_qa_data.csv",
    output_jsonl="role_qa_data.jsonl",
)


Processed 1620 records from RAG data/role_qa_data.csv into role_qa_data.jsonl


In [None]:
import csv
import json

def process_big5_as_single_text(input_csv, output_jsonl, source_name="big5_traits_table"):
    """Fold a Big Five traits CSV into a single JSONL record.

    Args:
        input_csv: CSV file read with ``csv.DictReader``. The columns
            ``Attributes``, ``Common traits`` and ``List of careers`` are used.
        output_jsonl: Destination JSONL file (overwritten); receives one line.
        source_name: Value stored in the record's ``"source"`` field.

    The record's ``"text"`` is an introductory sentence followed by one
    numbered section per CSV row. Missing cells count as empty text.
    """
    sections = [
        "The Big Five personality traits are associated with common behavioral characteristics and suitable careers:\n\n"
    ]

    with open(input_csv, newline='', encoding='utf-8') as csvfile:
        for i, row in enumerate(csv.DictReader(csvfile), 1):
            # DictReader puts None into cells missing from a short row, which
            # would make .strip() raise; treat those cells as empty strings.
            trait = (row.get("Attributes") or "").strip()
            common_traits = (row.get("Common traits") or "").strip()
            careers = (row.get("List of careers") or "").strip()

            sections.append(
                f"{i}. {trait}\n"
                f"Common traits: {common_traits}\n"
                f"Suggested careers: {careers}\n\n"
            )

    record = {
        # Joined once at the end; strip() drops the trailing blank line.
        "text": "".join(sections).strip(),
        "source": source_name
    }

    with open(output_jsonl, 'w', encoding='utf-8') as jsonlfile:
        jsonlfile.write(json.dumps(record, ensure_ascii=False) + "\n")

    print("Single Big 5 record written to JSONL.")

# Collapse the Big Five table into the single-record file big5_info.jsonl.
process_big5_as_single_text(
    input_csv="RAG data/big_5_info.csv",
    output_jsonl="big5_info.jsonl",
)

Single Big 5 record written to JSONL.


In [None]:
import pandas as pd
import json


# Make sure the columns are: 'Personality Traits', 'Attributes', 'List of Careers'.
# Every cell is read as text and blanks are replaced by "": an empty CSV cell
# otherwise loads as a float NaN, on which the .strip() calls below would raise.
df = pd.read_csv("RAG data/MBTI_info.csv", dtype=str).fillna("")

# Intro paragraph (only once)
intro = {
    "text": (
        "The Myers-Briggs Type Indicator (MBTI) categorizes personalities into 16 distinct types based on four dimensions: "
        "Introversion vs. Extraversion, Sensing vs. Intuition, Thinking vs. Feeling, and Judging vs. Perceiving. "
        "Each personality type reflects unique cognitive preferences and behavioral traits, and can help individuals understand "
        "what kinds of careers or environments may suit them best."
    ),
    "source": "mbti_intro"
}

# Predefined mini-descriptions per type (for simplicity, short + generic)
descriptions = {
    "INTJ": "Strategic and independent thinkers who value knowledge, structure, and competence. Often seen as long-term planners.",
    "INTP": "Innovative and analytical problem solvers who enjoy exploring abstract theories and building logical systems.",
    "ENTJ": "Natural leaders who are decisive, assertive, and organized. They are driven to take charge and implement effective strategies.",
    "ENTP": "Curious, energetic, and intellectually agile individuals who thrive on new challenges and creative problem-solving.",
    "INFJ": "Idealistic, insightful, and compassionate. They seek deep meaning in relationships, ideas, and personal missions.",
    "INFP": "Empathetic, introspective, and imaginative people who are guided by strong inner values and creativity.",
    "ENFJ": "Charismatic leaders who are attuned to others' needs and dedicated to helping people and causes they care about.",
    "ENFP": "Energetic and optimistic individuals who enjoy inspiring others and exploring possibilities.",
    "ISTJ": "Responsible and detail-oriented, they value structure, tradition, and reliability.",
    "ISFJ": "Warm and conscientious caregivers who strive to protect and serve others quietly and thoughtfully.",
    "ESTJ": "Efficient and practical managers who enjoy organizing people and systems to get results.",
    "ESFJ": "Sociable and nurturing people who value harmony and are deeply loyal to those they care about.",
    "ISTP": "Observant, independent, and resourceful individuals who are often drawn to hands-on problem-solving.",
    "ISFP": "Gentle and adaptable artists who appreciate beauty and live according to their values.",
    "ESTP": "Energetic and action-oriented, they love challenges and excel at thinking on their feet.",
    "ESFP": "Spontaneous and friendly entertainers who bring enthusiasm and fun into their environment."
}

# Output list: the intro record first, then one record per CSV row
output = [intro]

# Convert each row into a JSONL-compatible dict
for _, row in df.iterrows():
    code = row["Personality Traits"].strip()
    attributes = row["Attributes"].strip()
    careers = row["List of Careers"].strip()

    # Codes without a predefined blurb fall back to a generic sentence.
    description = descriptions.get(code, "This personality type has unique characteristics and strengths.")
    text = (
        f"MBTI Type: {code}\n"
        f"Full form: {attributes}\n"
        f"Description: {description}\n"
        f"Suggested careers: {careers}"
    )

    output.append({
        "text": text,
        "source": "mbti_traits_table"
    })

# Save to JSONL (one JSON object per line)
with open("mbti_data.jsonl", "w", encoding="utf-8") as f:
    for entry in output:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")

print("MBTI data saved to mbti_data.jsonl")


MBTI data saved to mbti_data.jsonl


In [None]:
import csv
import json

def process_career_skill_csv(input_csv, output_jsonl, source_name="career_skill"):
    """Convert a career/skill CSV into JSONL records.

    Args:
        input_csv: CSV file read with ``csv.DictReader``. The columns
            ``Career`` and ``Skill`` are used.
        output_jsonl: Destination JSONL file (overwritten).
        source_name: Value stored in each record's ``"source"`` field.

    Every CSV row yields one line whose ``"text"`` is
    ``"Career: <career>\\nRequired skills: <skill>"``.
    Missing cells count as empty text.
    """
    count = 0
    with open(input_csv, newline='', encoding='utf-8') as csvfile, \
         open(output_jsonl, 'w', encoding='utf-8') as jsonlfile:

        for row in csv.DictReader(csvfile):
            # DictReader puts None into cells missing from a short row, which
            # would make .strip() raise; treat those cells as empty strings.
            career = (row.get("Career") or "").strip()
            skill = (row.get("Skill") or "").strip()

            text = f"Career: {career}\nRequired skills: {skill}"

            record = {
                "text": text,
                "source": source_name
            }

            jsonlfile.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1

        print(f"Processed {count} records from {input_csv} into {output_jsonl}")


# Convert the career/skill table into career_skill.jsonl.
process_career_skill_csv(
    input_csv="RAG data/Career_skill_data.csv",
    output_jsonl="career_skill.jsonl",
)


Processed 4076 records from RAG data/Career_skill_data.csv into career_skill.jsonl


In [None]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import json
from pathlib import Path
from tqdm import tqdm

def filter_short_responses(input_path, output_path, min_words=30):
    """Copy JSONL records whose text has at least ``min_words`` words.

    Args:
        input_path: JSONL file to read; each non-blank line must be a JSON object.
        output_path: JSONL file to write (overwritten).
        min_words: Minimum number of whitespace-separated words the record's
            ``"text"`` field must contain for the record to be kept.

    Blank lines are ignored and are not counted. A record whose ``"text"`` is
    missing or ``null`` is treated as having no words.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)
    kept = 0
    total = 0

    with input_path.open(encoding="utf-8") as infile, output_path.open("w", encoding="utf-8") as outfile:
        for line in tqdm(infile, desc="Filtering short responses"):
            # json.loads rejects empty input, so a stray blank line (for
            # example at the end of the file) must be skipped explicitly.
            if not line.strip():
                continue
            total += 1
            record = json.loads(line)
            # `or ""` also covers an explicit null, which .get's default does not.
            text = record.get("text") or ""
            if len(text.split()) >= min_words:
                json.dump(record, outfile, ensure_ascii=False)
                outfile.write("\n")
                kept += 1

    print(f"\n✅ {kept} out of {total} responses kept (min {min_words} words)")

# Keep only forum records whose text has at least 30 words.
filter_short_responses(
    input_path="forum_data.jsonl",
    output_path="long_forum_data.jsonl",
    min_words=30,
)

Filtering short responses: 51123it [00:03, 13254.09it/s]


✅ 50743 out of 51123 responses kept (min 30 words)





In [None]:
import re
import json
from pathlib import Path
from tqdm import tqdm

# Words and phrases that mark a text as subjective. Apostrophes are written in
# their straight form; the text is normalised the same way before matching.
_SUBJECTIVE_PHRASES = (
    # first-person pronouns and personal statements
    "i", "me", "my", "mine", "myself", "we", "us", "ours", "ourselves",
    "i think", "i believe", "i feel", "i guess", "personally", "in my experience",
    "for me", "i remember", "i tried", "i used to", "when i was", "as for me",
    # encouragement
    "good luck", "stay strong", "don't worry", "trust me", "hang in there",
    "cheer up", "no worries", "you got this", "you'll be fine",
    # greetings
    "hello", "hi", "hey", "dear", "greetings",
    # personal likes, dislikes and certainty
    "i love", "i hate", "i enjoyed", "i disliked", "i'm not sure", "i'm sure",
    # courtesy closers
    "hope this helps", "hope that helps", "hope it helps", "let me know", "feel free to ask",
    "thanks for", "thank you for", "glad", "sorry to hear", "i'm sorry",
    # opinion markers
    "just my opinion", "i'd say", "i would say", "in my opinion", "as far as i know",
)

# One alternation compiled once: it matches exactly when at least one phrase
# occurs as a whole word or whole-word sequence.
_SUBJECTIVE_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(phrase) for phrase in _SUBJECTIVE_PHRASES) + r")\b"
)


def contains_subjective_keywords(text):
    """Tell whether ``text`` contains any subjective word or phrase.

    Args:
        text: The string to inspect.

    Returns:
        ``True`` if at least one entry of ``_SUBJECTIVE_PHRASES`` occurs in the
        text on word boundaries, otherwise ``False``.

    Matching is case-insensitive, and the typographic apostrophe (’) is treated
    like the straight one ('), so "you'll be fine" and "you’ll be fine" are
    both recognised.
    """
    # NOTE: lower-casing means the upper-case abbreviation "US" matches the
    # pronoun entry "us".
    normalized = text.lower().replace("’", "'")
    return _SUBJECTIVE_PATTERN.search(normalized) is not None

def filter_subjective_responses(input_path, output_path):
    """Copy JSONL records whose text contains no subjective keyword.

    Args:
        input_path: JSONL file to read; each non-blank line must be a JSON object.
        output_path: JSONL file to write (overwritten).

    A record is dropped when ``contains_subjective_keywords`` returns a true
    value for its ``"text"`` field. Blank lines are ignored and are not
    counted. A missing or ``null`` ``"text"`` is treated as an empty string.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)
    kept = 0
    total = 0

    with input_path.open(encoding="utf-8") as infile, output_path.open("w", encoding="utf-8") as outfile:
        for line in tqdm(infile, desc="Filtering subjective responses"):
            # json.loads rejects empty input, so a stray blank line (for
            # example at the end of the file) must be skipped explicitly.
            if not line.strip():
                continue
            total += 1
            record = json.loads(line)
            # `or ""` also covers an explicit null, which .get's default does not.
            text = record.get("text") or ""
            if contains_subjective_keywords(text):
                continue
            json.dump(record, outfile, ensure_ascii=False)
            outfile.write("\n")
            kept += 1

    print(f"\n✅ {kept} out of {total} responses kept (after subjective keyword filtering)")

# Drop the long forum records that contain subjective keywords.
filter_subjective_responses(
    input_path="long_forum_data.jsonl",
    output_path="filtered_forum_data_keywords.jsonl",
)


Filtering subjective responses: 50743it [00:02, 21384.31it/s]


✅ 1396 out of 50743 responses kept (after subjective keyword filtering)





In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from pathlib import Path
from tqdm import tqdm
import json

# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Weights are requested in float16; device_map="auto" leaves device placement
# to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Keyword arguments intended for model.generate(): nucleus sampling, at most
# 150 new tokens. The end-of-sequence token id doubles as the padding id.
generation_args = dict(
    max_new_tokens=150,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# input_path = Path("filtered_forum_data_keywords.jsonl")
# output_path = Path("responses_from_phi2.jsonl")

In [None]:
# with input_path.open(encoding="utf-8") as infile, output_path.open("w", encoding="utf-8") as outfile:
#     for line in tqdm(infile, total=1396, desc="Answering"):
#         record = json.loads(line)
#         prompt = f"Student: {record['text'].strip()}\nCareer Advisor:"

#         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

#         with torch.no_grad():
#             outputs = model.generate(**inputs, **generation_args)

#         generated = outputs[0][inputs['input_ids'].shape[-1]:]
#         answer = tokenizer.decode(generated, skip_special_tokens=True).strip().split("\n")[0]

#         result = {
#             "question": record["text"],
#             "answer": answer
#         }

#         json.dump(result, outfile, ensure_ascii=False)
#         outfile.write("\n")

#         del inputs, outputs
#         torch.cuda.empty_cache()

Answering:  24%|██▍       | 336/1396 [26:51<1:22:57,  4.70s/it]

In [None]:
!pip install pandas openpyxl

In [None]:
import pandas as pd
import json

# Input workbook, output file, and the "source" label stored in every record.
xlsx_path = "ISCO_Structure_definitions.xlsx"
output_path = "isco_data.jsonl"
source_name = "ISCO_Structure_definitions.xlsx"

# Workbook column header -> label used in the generated text.
column_mapping = {
    "Title EN": "Occupation",
    "Definition": "Definition",
    "Tasks include": "Typical Tasks",
    "Included occupations": "Included Occupations",
    "Excluded occupations": "Excluded Occupations",
    "Notes": "Notes"
}

# Load the sheet, relabel its columns, drop rows that have no occupation title,
# and replace every remaining missing cell with an empty string.
df = (
    pd.read_excel(xlsx_path, engine="openpyxl")
    .rename(columns=column_mapping)
    .dropna(subset=["Occupation"])
    .fillna("")
)

with open(output_path, "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        # One "<label>: <value>" line per mapped column, in mapping order.
        text = "\n".join(
            f"{label}: {row[label]}" for label in column_mapping.values()
        )
        f.write(json.dumps({"text": text, "source": source_name}, ensure_ascii=False) + "\n")

print(f"✅ Done! Saved to {output_path}")


✅ Done! Saved to isco_data.jsonl
