In [None]:
import os
import json

def load_jsonl_files(folder_path):
    """Load every record from the ``.jsonl`` files directly inside a folder.

    Files are processed in sorted filename order so the order of the returned
    records is the same on every run (``os.listdir`` gives no ordering
    guarantee). Blank or whitespace-only lines are skipped silently; lines
    that are not valid JSON are reported with file name and line number and
    then skipped, so one bad line does not abort the whole load.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        list: One parsed JSON value per valid line, across all files.

    Raises:
        OSError: If the folder or one of its files cannot be read.
    """
    all_records = []

    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Only regular files with the expected extension are data files.
        if not filename.endswith(".jsonl") or not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                # An empty line is not a record; json.loads would reject it.
                if not line.strip():
                    continue
                try:
                    all_records.append(json.loads(line))
                except json.JSONDecodeError as exc:
                    print(
                        f"Error decoding line {line_number} in file: "
                        f"{filename} ({exc.msg})"
                    )

    print(f"Loaded {len(all_records)} records from folder: {folder_path}")
    return all_records

# Folder holding the .jsonl knowledge-base files; the path is relative, so it
# is resolved against the notebook's current working directory.
data_folder = "jsonl datafiles"
# Parsed records; later cells read each record's "text" and "source" fields.
rag_data = load_jsonl_files(data_folder)


In [None]:
# %pip install -U "huggingface_hub[cli]"
%conda install -c conda-forge huggingface_hub -y
%conda install -c conda-forge huggingface_hub transformers sentence-transformers -y
%conda install pytorch torchvision torchaudio cpuonly -c pytorch -y
%conda install -c conda-forge transformers -y
%conda install -c conda-forge sentence-transformers -y

In [None]:
# Installs transformers from conda-forge. The dependency-install cell above
# already requests this package, so this cell is only needed when run alone.
%conda install -c conda-forge transformers -y

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never hardcode an access token in the notebook -- anything saved
# in a cell is readable by everyone who can see the file or its history.
# Any token that was previously committed in this cell must be treated as
# leaked and revoked in the Hugging Face account settings.
#
# The token is taken from the HF_TOKEN environment variable; when that is not
# set, the user is prompted for it (getpass does not echo the input).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
if not hf_token:
    raise RuntimeError(
        "No Hugging Face token provided: set the HF_TOKEN environment "
        "variable or enter a token at the prompt."
    )

# Keep HF_TOKEN set for the rest of the session, as the cell did before.
os.environ["HF_TOKEN"] = hf_token
login(token=hf_token)

In [None]:
# from transformers import pipeline

# pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-1B") #8B

In [None]:
import huggingface_hub
# Print the installed huggingface_hub version as a quick environment check.
print(huggingface_hub.__version__)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Single source of truth for the checkpoint: the tokenizer and the model must
# always come from the same repository, so the id is written exactly once.
# Change it here to try another size (the original notes mention 3.1-8B).
MODEL_ID = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

In [None]:
# annoy provides the AnnoyIndex class imported by the retrieval cell; the
# version is pinned so the environment is reproducible.
%pip install annoy==1.17.3

In [None]:
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
import numpy as np

# Fail early with an actionable message: with no records there is nothing to
# embed, and the index dimension below could not be determined.
if not rag_data:
    raise ValueError(
        "rag_data is empty: no records were loaded, so the retrieval index "
        "cannot be built. Check the data folder and its .jsonl files."
    )

# Sentence-embedding model, used here for the documents and later for queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Parallel lists: texts[i] and sources[i] come from the same record, and i is
# also the item id under which that record's embedding is stored in the index.
texts = [item["text"] for item in rag_data]
sources = [item["source"] for item in rag_data]
embeddings = embedder.encode(texts, convert_to_numpy=True)

# Size the index from the embeddings themselves instead of hardcoding it.
dimension = embeddings.shape[1]
annoy_index = AnnoyIndex(dimension, 'angular')

for i, vec in enumerate(embeddings):
    annoy_index.add_item(i, vec)

# Build the index with 10 trees; it is queried by the retrieval function.
annoy_index.build(10)

def retrieve_relevant_documents_annoy(query, k=5):
    """Return the texts of the ``k`` neighbours the index reports for ``query``.

    The query is embedded with the module-level ``embedder`` and looked up in
    the module-level ``annoy_index``; the returned item ids are used as
    positions into the module-level ``texts`` list.

    Args:
        query: Query string to embed.
        k: Number of neighbours to request from the index.

    Returns:
        list: Entries of ``texts`` for the returned ids, in the order the
        index returned them.
    """
    # encode() is given a one-element batch; take the single resulting vector.
    query_vector = embedder.encode([query], convert_to_numpy=True)[0]
    neighbour_ids = annoy_index.get_nns_by_vector(
        query_vector, k, include_distances=False
    )
    return [texts[doc_id] for doc_id in neighbour_ids]

def build_prompt(query, context_docs):
    """Build the career-advisor prompt for one student message.

    The retrieved documents are inserted as a background section placed
    before the student's message. Previously they were joined into a string
    that was never used, so retrieval had no influence on the answer.

    Args:
        query: The student's message, inserted verbatim.
        context_docs: Iterable of retrieved document strings. ``None``, an
            empty iterable, or only blank strings produce a prompt without a
            background section.

    Returns:
        str: The full prompt, ending with the "Career Advisor’s answer:" cue
        so the model continues with the answer.
    """
    # Drop empty / whitespace-only documents so they add no blank noise.
    docs = [doc.strip() for doc in (context_docs or []) if doc and doc.strip()]
    context = "\n\n".join(docs)

    # The section is omitted entirely when nothing was retrieved, which keeps
    # the prompt identical to the context-free version in that case.
    if context:
        context_section = (
            "Background information about careers "
            "(use it only where it is relevant to this student):\n"
            f"{context}\n\n"
        )
    else:
        context_section = ""

    prompt = f"""
You are a career advisor for high school students.

Your only task is to select 3 to 5 career paths that are the best possible match for the student's stated interests, strengths, and dislikes.

Strict instructions:
- Base your suggestions strictly on the student’s message. Do not invent or assume anything not mentioned.
- Recommend only career paths that clearly align with what the student enjoys and is good at, and that avoid what they dislike or find difficult.
- For each suggested path, explain in 1–2 sentences why it fits this student specifically.
- Do not give general advice or list unrelated options "just in case."
- Do not exceed 5 suggestions. Do not use bullet points or numbered lists.
- Keep the total response under 130 words. Be focused and relevant.

{context_section}Student’s message:
{query}

Career Advisor’s answer:
"""

    return prompt


def answer_question(query):
    """Generate the advisor's reply to a single student message.

    Retrieves context documents for the query, builds the prompt, samples a
    continuation from the module-level ``model`` and returns only the newly
    generated text (the prompt tokens are sliced off before decoding).

    Args:
        query: The student's message.

    Returns:
        str: Decoded continuation with special tokens removed.
    """
    retrieved_docs = retrieve_relevant_documents_annoy(query)
    encoded = tokenizer(build_prompt(query, retrieved_docs), return_tensors="pt")

    generation_settings = dict(
        max_new_tokens=200,
        min_new_tokens=80,
        do_sample=True,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.1,
        # The end-of-sequence token id is also used as the padding token id.
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():  # inference only, no gradients needed
        generated = model.generate(**encoded, **generation_settings)

    # Skip the first prompt_length tokens so only the continuation is decoded.
    prompt_length = encoded['input_ids'].shape[-1]
    return tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)


# user_query = "What are good careers for someone who is introverted and likes programming?"
# print(answer_question(user_query))


In [None]:
# Sample student messages used to exercise the full pipeline.
questions = [
    "My name is Ivan, I'm 15 years old. I enjoy math and programming, but I don't like talking to people. What career would suit me?",
    "I'm 16 and I really enjoy solving logic puzzles and building small apps in Python. I also like reading sci-fi. What kind of job should I consider?",
    "I'm a quiet 17-year-old student. I get good grades in physics and computer science, but I find teamwork stressful. I want to work independently. Any career suggestions?",
    "I'm 15 and I love biology and caring for animals, but I can't stand the sight of blood. I want a job that lets me help animals but not as a vet. What do you suggest?",
    "I'm 17 and bilingual in English and Japanese. I love drawing manga, have won local art competitions, and enjoy using AI tools to create comics. I don't want a traditional office job. What creative career paths could fit my interests and skills?",
]

# Answer each question in turn and print it as a numbered Q/A block.
for i, question in enumerate(questions, start=1):
    answer = answer_question(question)
    print(f"\n--- Question {i} ---\nQ: {question}\nA: {answer}")


In [None]:
# Single-query run; the text is the same as the last entry of `questions`.
# Generation samples (do_sample=True), so the answer can differ between runs.
user_query = "I'm 17 and bilingual in English and Japanese. I love drawing manga, have won local art competitions, and enjoy using AI tools to create comics. I don't want a traditional office job. What creative career paths could fit my interests and skills?"
print(answer_question(user_query))