In [None]:
# =========================
# 0) 설치 & 임포트
# =========================
!pip -q install implicit

import json
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

# (Optional) reproducibility: seeds NumPy's global RNG.
# NOTE(review): the ALS models in this notebook are seeded through their own
# random_state argument — confirm whether this global seed is still needed.
np.random.seed(42)


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m61.4/70.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for implicit (pyproject.toml) ... [?25l[?25hdone


In [None]:
# =========================
# 1) Load the JSONL interaction log into a DataFrame
# =========================
JSONL_PATH = "/content/interaction.jsonl"  # ✅ change this to your own file path

# One JSON object per line; lines that are empty after stripping are skipped.
with open(JSONL_PATH, "r", encoding="utf-8") as f:
    stripped_lines = (raw.strip() for raw in f)
    rows = [json.loads(text) for text in stripped_lines if text]

df = pd.DataFrame(rows)

# Inspect the available columns and the first few records.
print(df.columns)
df.head()


Index(['log_id', 'user_id', 'project_id', 'event_type', 'dwell_time'], dtype='object')


Unnamed: 0,log_id,user_id,project_id,event_type,dwell_time
0,l_00001,U00001,p_0628,view,152.0
1,l_00002,U00001,p_0628,apply,
2,l_00003,U00001,p_1089,view,143.0
3,l_00004,U00002,p_2351,view,83.0
4,l_00005,U00002,p_2351,apply,


In [None]:
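# =========================
# 2) Rule A: derive a 1/2/3 preference per log row, then take the max per
#    (user, project) pair
# - apply => 3
# - view  => bucketed by dwell_time (truncated to an integer):
#     missing or < 80 -> 1   (expected range for this bucket: 50~79)
#     80~119          -> 2
#     >= 120          -> 3   (expected range for this bucket: 120~180)
# =========================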

def view_dwell_to_pref(dt: float) -> int:
    """Convert a view's dwell time into a preference level of 1, 2 or 3.

    Args:
        dt: Dwell time of a view event. May be missing (None/NaN).

    Returns:
        1 when ``dt`` is missing or its integer part is below 80,
        2 when the integer part is in 80..119,
        3 when the integer part is 120 or more.
    """
    # A missing dwell time is treated as the weakest signal.
    if pd.isna(dt):
        return 1

    # Fractions are truncated before bucketing, so 79.9 still counts as 79.
    seconds = int(dt)
    if seconds >= 120:
        return 3
    return 2 if seconds >= 80 else 1

def to_preference(row) -> int:
    """Return the preference level (1/2/3) for a single log row.

    Args:
        row: A mapping-like log record (e.g. a DataFrame row) with an
            ``event_type`` entry and, optionally, a ``dwell_time`` entry.

    Returns:
        3 for an ``apply`` event; for every other event type the level that
        ``view_dwell_to_pref`` assigns to the row's dwell time.
    """
    if row["event_type"] != "apply":
        # Anything that is not an apply is scored like a view.
        return view_dwell_to_pref(row.get("dwell_time", np.nan))
    return 3

# Score every log row with the preference rule.
df["preference"] = df.apply(to_preference, axis=1)

# Collapse to one row per (user, project), keeping the strongest preference.
pair_max = df.groupby(["user_id", "project_id"])["preference"].max()
agg = pair_max.reset_index()

print(f"raw logs: {len(df)} -> aggregated user-item: {len(agg)}")
agg.head()


raw logs: 17934 -> aggregated user-item: 12498


Unnamed: 0,user_id,project_id,preference
0,U00001,p_0628,3
1,U00001,p_1089,3
2,U00001,p_1248,3
3,U00001,p_1689,1
4,U00002,p_0029,3


In [None]:
# =========================
# 3) Map user_id / project_id to contiguous integer indices
# =========================
# Distinct ids, in order of first appearance in `agg`.
user_ids = agg["user_id"].unique()
item_ids = agg["project_id"].unique()
n_users = len(user_ids)
n_items = len(item_ids)

# Forward (id -> index) and reverse (index -> id) lookup tables.
user2idx = dict(zip(user_ids, range(n_users)))
item2idx = dict(zip(item_ids, range(n_items)))
idx2user = dict(enumerate(user_ids))
idx2item = dict(enumerate(item_ids))

agg["user_idx"] = agg["user_id"].map(user2idx)
agg["item_idx"] = agg["project_id"].map(item2idx)

print("n_users:", n_users, "n_items:", n_items)


n_users: 4000 n_items: 1668


In [None]:
# =========================
# 4) Build the sparse interaction matrix
# - This cell lays the matrix out as item-user (rows = items, columns = users):
#   => item_user = (item_idx, user_idx, confidence_or_score)
# - NOTE(review): whether fit() wants item-user or user-item depends on the
#   installed implicit version — confirm before training on this matrix.
#
# preference (1/2/3) is used as the interaction strength, and ALPHA scales it
# into a confidence value.
# =========================
ALPHA = 15.0  # ✅ tuning knob (10~40 is a commonly used range)

# confidence = 1 + ALPHA * preference (recommended scaling)
pref_strength = agg["preference"].astype(np.float32).values
values = 1.0 + ALPHA * pref_strength

row_idx = agg["item_idx"].values
col_idx = agg["user_idx"].values
item_user = csr_matrix((values, (row_idx, col_idx)), shape=(n_items, n_users))

item_user


<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 12498 stored elements and shape (1668, 4000)>

In [None]:
# =========================
# 5) Train implicit ALS
# =========================
model = AlternatingLeastSquares(
    factors=64,
    regularization=0.05,
    iterations=20,
    random_state=42
)

# implicit works on float32 sparse data internally.
item_user = item_user.astype(np.float32)

# implicit >= 0.5 changed fit() to take a user-item matrix (rows = users).
# item_user is laid out items x users, so transpose it before fitting; passing
# it as-is would make the model learn its user and item factors swapped.
model.fit(item_user.T.tocsr())
print("trained")


  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

trained


In [None]:
import json, numpy as np, pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

JSONL_PATH = "/content/interaction.jsonl"

# 1) load: one JSON object per line. Blank lines (e.g. an empty line at the
#    end of the file) are skipped, because json.loads() raises on them.
rows = []
with open(JSONL_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        rows.append(json.loads(line))
df = pd.DataFrame(rows)

# 2) preference rule (A안)
def view_dwell_to_pref(dt):
    """Bucket a view's dwell time into a preference level of 1, 2 or 3.

    A missing value (None/NaN) maps to 1. Otherwise the value is truncated to
    an integer and mapped as: below 80 -> 1, 80..119 -> 2, 120 or more -> 3.
    """
    if pd.isna(dt):
        return 1
    whole_seconds = int(dt)
    # Start at level 1 and add one level for each threshold reached.
    return 1 + (whole_seconds >= 80) + (whole_seconds >= 120)

def to_preference(r):
    """Return the preference level (1/2/3) for one log record.

    An ``apply`` event always scores 3. Any other event type is scored from
    its ``dwell_time`` (treated as missing when the record has none) via
    ``view_dwell_to_pref``.
    """
    is_apply = r["event_type"] == "apply"
    return 3 if is_apply else view_dwell_to_pref(r.get("dwell_time", np.nan))

df["preference"] = df.apply(to_preference, axis=1)

# 3) Relevance label: a row is relevant if it is an apply, or a view whose
#    dwell_time is at least 120.
#    The rule can be widened or narrowed here if needed.
applied = df["event_type"] == "apply"
long_view = (df["event_type"] == "view") & (df["dwell_time"] >= 120)
df["is_relevant"] = applied | long_view

# 4) Aggregate to one row per (user, project):
#    - preference:  strongest preference seen for the pair
#    - last_log_id: largest log_id among ALL of the pair's logs, kept as a
#                   recency proxy for the time-ordered split
#    - relevant:    True if any of the pair's logs is relevant
#    NOTE(review): last_log_id is not necessarily the log where the max
#    preference occurred, and "max" on log_id looks like a string comparison —
#    confirm log_ids are zero-padded to a fixed width.
agg = df.groupby(["user_id", "project_id"], as_index=False).agg(
    preference=("preference", "max"),
    last_log_id=("log_id", "max"),
    relevant=("is_relevant", "max"),
)

# 5) Leave-one-out split: for each user, hold out the relevant pair with the
#    largest last_log_id as that user's single test item.
#    Users without any relevant pair get no test row, so they are not evaluated.
relevant_pairs = agg[agg["relevant"] == True].copy()
agg_rel = relevant_pairs.sort_values(["user_id", "last_log_id"])

# After the sort, the last row of each user group is the held-out pair.
held_out = agg_rel.groupby("user_id").tail(1)
test_idx = held_out.index
test = held_out[["user_id", "project_id"]].copy()

# train = every aggregated pair except the held-out (user, project) pairs.
# The merge indicator marks rows that have no match in `test`.
joined = agg.merge(test, on=["user_id", "project_id"], how="left", indicator=True)
train = joined[joined["_merge"] == "left_only"].drop(columns=["_merge"])

n_test_users = test["user_id"].nunique()
print(f"train interactions: {len(train)} test interactions: {len(test)} test users: {n_test_users}")

# 6) Build the id -> index mappings from TRAIN only (important), so that the
#    indices line up with the rows/columns of the training matrix built below.
user_list = np.sort(train["user_id"].unique())
item_list = np.sort(train["project_id"].unique())
n_users, n_items = len(user_list), len(item_list)

user2idx = dict(zip(user_list, range(n_users)))
item2idx = dict(zip(item_list, range(n_items)))
idx2item_list = list(item_list)  # position i holds the project_id of item index i

# Work on a copy of train that carries the integer indices.
train2 = train.assign(
    user_idx=train["user_id"].map(user2idx),
    item_idx=train["project_id"].map(item2idx),
)

# confidence = 1 + ALPHA * preference
ALPHA = 15.0
vals = 1.0 + ALPHA * train2["preference"].astype(np.float32).values

# users x items confidence matrix
user_items = csr_matrix(
    (vals, (train2["user_idx"].values, train2["item_idx"].values)),
    shape=(n_users, n_items),
    dtype=np.float32,
)

# 7) Train ALS on the users x items training matrix
model = AlternatingLeastSquares(
    factors=64,
    regularization=0.05,
    iterations=20,
    random_state=42,
)
model.fit(user_items)

# 8) Evaluate Precision@K, Recall@K and HitRate@K.
#    With leave-one-out every user has exactly one relevant test item, so
#    Recall@K equals the hit rate.
K = 30
# A test pair whose user or project does not appear in train has no index in
# the mappings and cannot be scored, so it is left out of the evaluation.
test_eval = test[test["user_id"].isin(user2idx) & test["project_id"].isin(item2idx)].copy()

hits = []
precisions = []
recalls = []

for u, group in test_eval.groupby("user_id"):
    uidx = user2idx[u]
    # The user's own training row lets recommend() filter out seen items.
    user_row = user_items[uidx]
    rec_item_idx, _ = model.recommend(uidx, user_row, N=K, filter_already_liked_items=True)

    rec_set = {int(i) for i in rec_item_idx}
    rel_items = {item2idx[p] for p in group["project_id"]}
    hit = len(rec_set & rel_items)

    precisions.append(hit / K)
    recalls.append(hit / len(rel_items))
    hits.append(1 if hit > 0 else 0)

print(f"Users evaluated: {test_eval['user_id'].nunique()}")
if precisions:
    print(f"Precision@{K}: {np.mean(precisions):.4f}")
    print(f"Recall@{K}: {np.mean(recalls):.4f}")
    print(f"HitRate@{K}: {np.mean(hits):.4f}")
else:
    # np.mean([]) would only yield nan plus a RuntimeWarning; say so explicitly.
    print("No evaluable test users: Precision/Recall/HitRate are undefined.")


FileNotFoundError: [Errno 2] No such file or directory: '/content/interaction.jsonl'

민서야 아래 코드 실행하면 모델/매핑(가중치)/user_items 저장되거든? 여기서 생긴 파일 그대로 가져다쓰면 돼

In [None]:
import os, pickle, numpy as np
from scipy.sparse import save_npz

EXPORT_DIR = "/content/"
os.makedirs(EXPORT_DIR, exist_ok=True)

# =========================
# 1) Save the ALS latent factors
# =========================
np.savez_compressed(
    os.path.join(EXPORT_DIR, "als_model.npz"),
    user_factors=model.user_factors.astype(np.float32),
    item_factors=model.item_factors.astype(np.float32),
)

# =========================
# 2) Save the id mappings + metadata
# =========================
# Hyperparameters are read from the trained model so the exported metadata
# cannot drift from what was actually used; the literals are only a fallback
# equal to the values passed in the training cell.
meta = {
    "evaluation": {
        "K": K,
        "split": "leave-one-out",
        "relevant_rule": "apply OR (view & dwell_time>=120)"
    },
    "training": {
        "alpha": float(ALPHA),
        "factors": int(model.user_factors.shape[1]),
        "regularization": float(getattr(model, "regularization", 0.05)),
        "iterations": int(getattr(model, "iterations", 20)),
    },
    "data": {
        "n_users": int(n_users),
        "n_items": int(n_items),
    },
    "preference_rule": {
        "apply": 3,
        "view_bins_seconds": [
            {"range": "50-79", "pref": 1},
            {"range": "80-119", "pref": 2},
            {"range": ">=120", "pref": 3},
        ],
        "aggregation": "max",
    }
}

with open(os.path.join(EXPORT_DIR, "mappings.pkl"), "wb") as f:
    pickle.dump(
        {
            "user2idx": user2idx,
            "item2idx": item2idx,
            "idx2item_list": idx2item_list,
            "meta": meta,
        },
        f,
        protocol=pickle.HIGHEST_PROTOCOL
    )

# =========================
# 3) Save user_items (needed for filter_already_liked_items at serving time)
# =========================
save_npz(
    os.path.join(EXPORT_DIR, "user_items.npz"),
    user_items
)

print("✅ Export completed")
print("Saved files:", os.listdir(EXPORT_DIR))
print("Export path:", EXPORT_DIR)


✅ Export completed
Saved files: ['.config', 'mappings.pkl', 'als_model.npz', 'user_items.npz', '.ipynb_checkpoints', 'sample_data']
Export path: /content
