In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; every path used below lives under it.
drive.mount("/content/drive")

Mounted at /content/drive


# 1. 데이터셋 텍스트화

- user.jsonl

In [None]:
import json

# Input: JSONL file of user records read by the loop below.
INPUT_PATH = "/content/drive/MyDrive/user.jsonl"
# Output: JSONL file that receives one textified record per input line.
OUTPUT_PATH = "/content/drive/MyDrive/user_textified.jsonl"

def safe_join(items):
    """Join values into a single comma-separated string.

    Args:
        items: A list of values, a single string, or None.

    Returns:
        "" when ``items`` is empty or None; the string itself when ``items``
        is a single string; otherwise the non-None elements, converted with
        ``str``, joined with ", ".
    """
    if not items:
        return ""
    # A bare string iterates character by character, so joining it directly
    # would yield "P, y, t, h, o, n"; treat it as one ready-made value.
    if isinstance(items, str):
        return items
    # str() keeps numeric entries from raising TypeError inside join().
    return ", ".join(str(item) for item in items if item is not None)

# Stream the user JSONL file and write one textified record per user.
# Each output record keeps the identifying fields and adds four text views:
# u_text (bio + history descriptions), u_skill, u_role and u_interest.
with open(INPUT_PATH, "r", encoding="utf-8") as fin, \
     open(OUTPUT_PATH, "w", encoding="utf-8") as fout:

    for line in fin:
        # Blank lines (e.g. a trailing newline at EOF) are not valid JSON.
        if not line.strip():
            continue

        user = json.loads(line)
        user_id = user.get("user_id")
        student_num = user.get("student_num")
        name = user.get("name")

        # dict.get(key, default) only covers a *missing* key; an explicit JSON
        # null still comes back as None, so fall back with `or`.
        profile = user.get("profile") or {}
        history = user.get("history") or []
        bio = profile.get("bio") or ""
        history_descs = [
            h["desc"]
            for h in history
            if h and h.get("desc")
        ]

        u_text = " ".join([bio] + history_descs).strip()

        skills = profile.get("skills", [])
        u_skill = f"skills: {safe_join(skills)}"

        # Without the fallback a null value would be rendered as the text "None".
        prefer_role = user.get("prefer_role") or ""
        u_role = f"preferred role: {prefer_role}"

        majors = profile.get("major", [])
        interests = profile.get("interests", [])

        u_interest = (
            f"major: {safe_join(majors)} "
            f"interests: {safe_join(interests)}"
        ).strip()

        output = {
            "user_id": user_id,
            "student_num": student_num,
            "name": name,
            "u_text": u_text,
            "u_skill": u_skill,
            "u_role": u_role,
            "u_interest": u_interest
        }

        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        fout.write(json.dumps(output, ensure_ascii=False) + "\n")


- project.jsonl

In [None]:
import json

# Rebinds the INPUT_PATH / OUTPUT_PATH names already used by the user cell above;
# after this cell runs they point at the project files.
# NOTE(review): the index-building cell later loads project_textified.jsonl from
# /content/drive/MyDrive/Contentbased/, not from this OUTPUT_PATH. Confirm the
# file is copied there, or align the two paths.
INPUT_PATH = "/content/drive/MyDrive/project.jsonl"
OUTPUT_PATH = "/content/drive/MyDrive/project_textified.jsonl"


def safe_join(items):
    """Join values into a single comma-separated string.

    Note: this name is also defined in the user-textification cell; this
    definition replaces that one when the cell runs.

    Args:
        items: A list of values, a single string, or None.

    Returns:
        "" when ``items`` is empty or None; the string itself when ``items``
        is a single string; otherwise the non-None elements, converted with
        ``str``, joined with ", ".
    """
    if not items:
        return ""
    # A bare string iterates character by character, so joining it directly
    # would yield "P, y, t, h, o, n"; treat it as one ready-made value.
    if isinstance(items, str):
        return items
    # str() keeps numeric entries from raising TypeError inside join().
    return ", ".join(str(item) for item in items if item is not None)


def normalize_text(x):
    """Collapse a string or a list of strings into one trimmed string.

    Args:
        x: A string, a list of strings, or any other value.

    Returns:
        For a list: the non-empty, whitespace-trimmed string elements joined
        with ". " (non-string elements are ignored). For a string: the string
        with surrounding whitespace removed. For anything else: "".
    """
    if isinstance(x, list):
        # Skip non-string entries (e.g. JSON null) instead of failing on
        # .strip(), and strip each entry only once.
        trimmed = (s.strip() for s in x if isinstance(s, str))
        return ". ".join(s for s in trimmed if s)
    if isinstance(x, str):
        return x.strip()
    return ""


# Stream the project JSONL file and write one textified record per project.
# Each output record keeps project_id/deadline and adds four text views:
# p_text (title + description), p_skill, p_role and p_field.
with open(INPUT_PATH, "r", encoding="utf-8") as fin, \
     open(OUTPUT_PATH, "w", encoding="utf-8") as fout:

    for line in fin:
        # Blank lines (e.g. a trailing newline at EOF) are not valid JSON.
        if not line.strip():
            continue

        project = json.loads(line)

        project_id = project.get("project_id")
        deadline = project.get("deadline")

        title = normalize_text(project.get("p_title"))
        desc = normalize_text(project.get("p_dis"))

        # Join only the parts that are present, so a missing title no longer
        # produces a leading ". " in front of the description.
        p_text = ". ".join(part for part in (title, desc) if part)

        skills = project.get("p_skill", [])
        p_skill = f"required skills: {safe_join(skills)}"

        roles = project.get("p_role", [])
        p_role = f"required roles: {safe_join(roles)}"

        fields = project.get("p_field", [])
        p_field = f"project fields: {safe_join(fields)}"

        output = {
            "project_id": project_id,
            "p_text": p_text,
            "p_skill": p_skill,
            "p_role": p_role,
            "p_field": p_field,
            "deadline": deadline
        }

        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        fout.write(json.dumps(output, ensure_ascii=False) + "\n")



# 2. 데이터셋 구축

- 기본 환경 설정

In [None]:
pip install openai faiss-cpu numpy tqdm

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [None]:
import json
import os
from getpass import getpass

import faiss
import numpy as np
from openai import OpenAI
from tqdm import tqdm

In [None]:
# Take the API key from the OPENAI_API_KEY environment variable; prompt for it
# (without echo) only when the variable is unset, so the secret is never
# stored in the notebook source or its saved outputs.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
)

- 함수 정의

In [None]:
# JSONL loading helper
def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed objects.

    Args:
        path: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (commonly a trailing newline at EOF) are not valid
            # JSON and would make json.loads raise; skip them.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

In [None]:
# OpenAI embedding helper (batched)
def embed_texts(texts, model="text-embedding-3-small", batch_size=100):
    """Embed a sequence of strings with the OpenAI embeddings endpoint.

    Uses the module-level ``client`` and sends ``texts`` in consecutive
    slices of at most ``batch_size`` items.

    Args:
        texts: Sequence of strings to embed (must support ``len`` and slicing).
        model: Embedding model name passed to the API.
        batch_size: Maximum number of texts sent per request.

    Returns:
        A float32 array with one row per input text, in input order. For an
        empty ``texts`` a 2-D array with zero rows is returned, so callers that
        use ``axis=1`` or ``shape[1]`` do not fail on a 1-D result.

    NOTE(review): the embeddings API is documented to reject empty-string
    inputs; confirm every text passed in is non-empty.
    """
    if len(texts) == 0:
        # np.array([]) would be 1-D; keep the (rows, dim) contract instead.
        return np.empty((0, 0), dtype="float32")

    embeddings = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        response = client.embeddings.create(
            model=model,
            input=batch
        )

        embeddings.extend(item.embedding for item in response.data)

    return np.array(embeddings, dtype="float32")

- L2 정규화

In [None]:
def l2_normalize(vectors):
    """Scale each row of a 2-D array to unit L2 norm.

    Args:
        vectors: 2-D array with one vector per row.

    Returns:
        A new array of the same shape whose non-zero rows have L2 norm 1.
        All-zero rows are returned unchanged (still all zeros).
    """
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # A zero row has norm 0; dividing by it would produce NaN and poison any
    # similarity computed from the result. Divide such rows by 1 instead.
    norms[norms == 0] = 1.0
    return vectors / norms

- FAISS index 생성, 저장

In [None]:
def build_faiss_index(vectors, index_path):
    """Build a flat inner-product FAISS index and write it to disk.

    Inner product equals cosine similarity only when the rows of ``vectors``
    are already L2-normalized; this function does not normalize them.

    Args:
        vectors: 2-D array with one vector per row; its second dimension
            sets the index dimensionality.
        index_path: File path the index is written to.

    Returns:
        The populated ``faiss.IndexFlatIP`` instance.
    """
    flat_ip = faiss.IndexFlatIP(vectors.shape[1])
    flat_ip.add(vectors)
    faiss.write_index(flat_ip, index_path)
    return flat_ip

### project_textified → index 생성

In [None]:
# Build one FAISS index per textual "view" of the projects and save each to Drive.
# NOTE(review): the project textification cell writes project_textified.jsonl to
# /content/drive/MyDrive/, while this loads it from .../Contentbased/. Confirm the
# file was copied there, or align the two paths.
project_data = load_jsonl("/content/drive/MyDrive/Contentbased/project_textified.jsonl")

# view name → field of the textified project record that is embedded for that view
views = {
    "text": "p_text",
    "skill": "p_skill",
    "role": "p_role",
    "field": "p_field"
}

# Same order as project_data, i.e. the order in which vectors are added to every index.
project_ids = [p["project_id"] for p in project_data]

# NOTE(review): id_map is only kept in memory in this cell; nothing here writes it
# to disk next to the saved indexes. Confirm it is persisted or rebuilt elsewhere.
id_map = {}   # view → list of project_ids; position i matches the i-th vector added
indexes = {}  # view → faiss index

for view, col in views.items():
    # One text per project for this view, in project_data order.
    texts = [p[col] for p in project_data]

    embeddings = embed_texts(texts)
    # Normalize rows so the inner-product index behaves as cosine similarity.
    embeddings = l2_normalize(embeddings)

    index_path = f"/content/drive/MyDrive/Contentbased/project_{view}.index"
    index = build_faiss_index(embeddings, index_path)

    indexes[view] = index
    # Every view shares the same project_ids list object (not a copy).
    id_map[view] = project_ids

    print(f"[DONE] {view} index saved → {index_path}")

[DONE] text index saved → /content/drive/MyDrive/Contentbased/project_text.index
[DONE] skill index saved → /content/drive/MyDrive/Contentbased/project_skill.index
[DONE] role index saved → /content/drive/MyDrive/Contentbased/project_role.index
[DONE] field index saved → /content/drive/MyDrive/Contentbased/project_field.index
