# Import lib

In [1]:
import pandas as pd
import numpy as np
import pickle

# Load dataset dan Matrix

In [2]:
# Load the cleaned table. The recommender functions further down use matrix row
# positions to look up rows of this frame, so both artifacts must stay aligned.
df = pd.read_csv("../dataset/cooked/cleaned_data.csv")

# SECURITY: pickle.load can execute arbitrary code while deserializing. Only
# load a similarity_matrix.pkl that was produced by this project's own pipeline.
with open("../dataset/cooked/similarity_matrix.pkl", "rb") as f:
    similarity_matrix = pickle.load(f)

print("Data shape:", df.shape)
print("Similarity matrix shape:", similarity_matrix.shape)

# Fail fast when the two files are out of sync; otherwise recommendations would
# silently point at the wrong rows (or fail much later with a lookup error).
n_rows = df.shape[0]
if tuple(similarity_matrix.shape) != (n_rows, n_rows):
    raise ValueError(
        f"similarity_matrix shape {tuple(similarity_matrix.shape)} does not "
        f"match the {n_rows} rows of cleaned_data.csv"
    )


Data shape: (39537, 19)
Similarity matrix shape: (39537, 39537)


# Preview

In [3]:
# Show the first rows, restricted to the columns that later cells work with.
df.loc[:, [
    "mbrTitle", "companyName", "posTitle",
    "country", "avgMemberPosDuration", "avgCompanyPosDuration",
]].head()


Unnamed: 0,mbrTitle,companyName,posTitle,country,avgMemberPosDuration,avgCompanyPosDuration
0,portfolio executive at commonwealth bank,commonwealth bank,portfolio executive,au,760.5,989.9361
1,portfolio executive at commonwealth bank,commonwealth bank,solution delivery executive,au,760.5,989.9361
2,portfolio executive at commonwealth bank,commsec,project manager,au,760.5,747.2308
3,portfolio executive at commonwealth bank,commonwealth bank,project manager,au,760.5,989.9361
4,"senior marketing manager, paypal",paypal,senior marketing manager,au,395.2857,683.3496


# Recommender Function

In [25]:
def recommend_jobs(
    profile_index: int,
    top_n: int = 5,
    min_similarity: float = 0.1
) -> pd.DataFrame:
    """Return the rows of ``df`` most similar to one source profile.

    Reads the module-level ``df`` and ``similarity_matrix``. Row
    ``profile_index`` of the matrix is taken as the similarity of that profile
    to every row of ``df``, matched by row position.

    Parameters
    ----------
    profile_index : int
        Zero-based row position of the source profile.
    top_n : int, default 5
        Maximum number of recommendations to return.
    min_similarity : float, default 0.1
        Candidates scoring below this value are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, highest similarity first (equal scores
        keep ascending row order). Columns: ``similarity_score`` (rounded to
        4 decimals), ``recommended_posTitle``, ``companyName``, ``country``,
        ``avgMemberPosDuration``, ``avgCompanyPosDuration``. When nothing
        qualifies the frame has zero rows but still carries these columns.

    Raises
    ------
    ValueError
        If ``profile_index`` is negative or not smaller than ``len(df)``.
    """
    if profile_index < 0:
        # A negative index would wrap around to the end of the matrix and the
        # self-match filter below would no longer recognise the source row.
        raise ValueError("profile_index must be non-negative")
    if profile_index >= df.shape[0]:
        raise ValueError("profile_index melebihi jumlah data")

    # Fetch the similarity row; sparse rows are densified first.
    row = similarity_matrix[profile_index]
    if hasattr(row, "toarray"):
        row = row.toarray()
    # float64 keeps the threshold comparison identical to comparing Python floats.
    scores = np.asarray(row, dtype=float).ravel()

    # Filter first (threshold, then the profile itself), then rank only the
    # survivors. The stable sort keeps ascending row order among equal scores.
    candidates = np.flatnonzero(scores >= min_similarity)
    candidates = candidates[candidates != profile_index]
    ranking = np.argsort(-scores[candidates], kind="stable")
    top_idx = candidates[ranking][:top_n]

    # Output column -> source column in df.
    field_map = {
        "recommended_posTitle": "posTitle",
        "companyName": "companyName",
        "country": "country",
        "avgMemberPosDuration": "avgMemberPosDuration",
        "avgCompanyPosDuration": "avgCompanyPosDuration",
    }

    recommendations = []
    for idx in top_idx:
        record = {"similarity_score": round(float(scores[idx]), 4)}
        for out_name, src_name in field_map.items():
            # Matrix indices are positions, so look up by position (.iloc);
            # label lookup would break on any non-default index.
            record[out_name] = df[src_name].iloc[idx]
        recommendations.append(record)

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(
        recommendations,
        columns=["similarity_score", *field_map],
    )


In [26]:
# Print the container type of the whole matrix and of one indexed row; the
# recommender branches on whether a row exposes ``toarray``.
for _probe in (similarity_matrix, similarity_matrix[0]):
    print(type(_probe))

<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>


# Uji 1 Profil (Sanity Check)

In [30]:
sample_index = 10
origin_columns = ["mbrTitle", "posTitle", "companyName", "country"]

# Source profile first, so the recommendations below can be judged against it.
print("Profil Asal:")
display(df.loc[sample_index, origin_columns])

# The bare call on the last line is rendered as this cell's output.
print("\nRekomendasi:")
recommend_jobs(sample_index, top_n=5)


Profil Asal:


mbrTitle                   senior marketing manager, paypal
posTitle       internal communications and field enablement
companyName                                             ibm
country                                                  au
Name: 10, dtype: object


Rekomendasi:


Unnamed: 0,similarity_score,recommended_posTitle,companyName,country,avgMemberPosDuration,avgCompanyPosDuration
0,0.7535,brand system graduate,ibm,au,395.2857,1188.7339
1,0.7138,brand expression lead,ibm,au,395.2857,1188.7339
2,0.7129,brand strategist,ibm,au,395.2857,1188.7339
3,0.7076,social brand marketing,ibm,au,395.2857,1188.7339
4,0.6079,anz service management manager,ibm,au,707.5,1188.7339


# Uji Beberapa Profil Sekaligus

In [24]:
test_indices = [5, 25, 100]

# For each probe profile: a separator, the source position/company, then its
# three best recommendations.
for idx in test_indices:
    print("=" * 60)
    print(f"Profil Index: {idx}")
    origin = df.loc[idx]
    print("Asal:", origin["posTitle"], "-", origin["companyName"])

    recs = recommend_jobs(idx, top_n=3)
    display(recs)


Profil Index: 5
Asal: digital and social engagement leader, optus business - optus


Unnamed: 0,similarity_score,recommended_posTitle,companyName,country,avgMemberPosDuration,avgCompanyPosDuration
0,0.6318,data scientist,optus,au,1001.0,977.6553
1,0.5955,application support analyst,optus,au,471.875,977.6553
2,0.5936,knowledgebase manager - optus@home,optus,au,1164.875,977.6553


Profil Index: 25
Asal: manager - organisational strategy - transport for nsw


Unnamed: 0,similarity_score,recommended_posTitle,companyName,country,avgMemberPosDuration,avgCompanyPosDuration
0,0.6309,"senior manager, change practice",transport for nsw,au,1145.7857,574.1019
1,0.6308,director digital engineering,transport for nsw,au,867.0,574.1019
2,0.6288,program change manager,transport for nsw,au,765.3077,574.1019


Profil Index: 100
Asal: judge - corporate counsel awards - lawyers weekly


Unnamed: 0,similarity_score,recommended_posTitle,companyName,country,avgMemberPosDuration,avgCompanyPosDuration
0,0.8022,judge - australian law awards 2016,lawyers weekly,au,1204.1429,313.2857
1,0.8006,member of the judging panel: 2019 corporate co...,lawyers weekly,au,1204.1429,313.2857
2,0.7951,member of judging panel: 2018 australian law a...,lawyers weekly,au,1204.1429,313.2857


# Validasi Logis (Manual Reasoning)

In [14]:
def validate_recommendation(profile_index):
    """Print a source profile next to its top-5 recommendations.

    Looks the profile up in the module-level ``df`` by label, asks
    ``recommend_jobs`` for five matches, and writes a plain-text summary to
    stdout for manual inspection. Returns nothing.
    """
    source = df.loc[profile_index]
    matches = recommend_jobs(profile_index, top_n=5)

    # Source profile: label text paired with the df column it reports.
    print("PROFIL ASAL")
    for label, column in (
        ("- Posisi :", "posTitle"),
        ("- Lokasi :", "country"),
        ("- Rata durasi posisi :", "avgMemberPosDuration"),
    ):
        print(label, source[column])

    # One line per recommendation: title | company | score.
    print("\nREKOMENDASI")
    for match in matches.itertuples(index=False):
        print(
            f"- {match.recommended_posTitle} | "
            f"{match.companyName} | "
            f"Similarity: {match.similarity_score}"
        )

# Profile 10 is the same index used by the sanity-check cell earlier in the
# notebook, so the two outputs can be compared directly.
validate_recommendation(10)


PROFIL ASAL
- Posisi : internal communications and field enablement
- Lokasi : au
- Rata durasi posisi : 395.2857

REKOMENDASI
- brand system graduate | ibm | Similarity: 0.7535
- brand expression lead | ibm | Similarity: 0.7138
- brand strategist | ibm | Similarity: 0.7129
- social brand marketing | ibm | Similarity: 0.7076
- anz service management manager | ibm | Similarity: 0.6079


Berdasarkan pengujian pada beberapa indeks profil, sistem rekomendasi mampu menghasilkan posisi pekerjaan yang relevan secara semantik dan kontekstual. Rekomendasi cenderung berada pada domain pekerjaan, level jabatan, dan organisasi yang serupa dengan profil asal, sehingga menunjukkan bahwa similarity matrix berhasil menangkap kemiripan pola karier secara logis.

# Fungsi Precision@K

In [31]:
def precision_at_k(profile_index, k=5):
    """Share of the top-``k`` recommendations counted as relevant.

    A recommendation is counted as relevant when its ``companyName`` equals
    the source profile's company OR its ``country`` equals the source
    profile's country. The hit count is divided by ``k`` itself, not by the
    number of rows actually returned. Returns ``0.0`` when ``recommend_jobs``
    yields nothing.

    NOTE(review): a country match alone is sufficient, so a dataset dominated
    by one country would presumably score close to 1.0 regardless of
    recommendation quality -- confirm this is the intended criterion.
    """
    recs = recommend_jobs(profile_index, top_n=k)
    if recs is None or not len(recs):
        return 0.0

    target_company = df.loc[profile_index, "companyName"]
    target_country = df.loc[profile_index, "country"]

    # Count rows that match on either attribute.
    relevant = sum(
        1
        for company, country in zip(recs["companyName"], recs["country"])
        if company == target_company or country == target_country
    )
    return relevant / k


# Evaluasi Beberapa Sampel

In [33]:
test_indices = [5, 10, 25, 50, 100]

# Score every probe profile first, then report per-profile values and the mean.
scores = [precision_at_k(idx, k=5) for idx in test_indices]
for idx, p_at_5 in zip(test_indices, scores):
    print(f"Index {idx} → Precision 5 = {p_at_5:.2f}")

print("\nRata-rata Precision 5:", round(sum(scores) / len(scores), 2))


Index 5 → Precision 5 = 1.00
Index 10 → Precision 5 = 1.00
Index 25 → Precision 5 = 1.00
Index 50 → Precision 5 = 1.00
Index 100 → Precision 5 = 1.00

Rata-rata Precision 5: 1.0


# recommend_jobs versi tuning

In [40]:
def recommend_jobs_tuned(
    profile_index,
    top_n=10,
    min_similarity=0.6,
    max_per_company=3
):
    """Recommend similar rows of ``df`` with a per-company cap.

    Reads the module-level ``df`` and ``similarity_matrix``. Candidates are
    visited from highest to lowest similarity (equal scores in ascending row
    order); a candidate is skipped once ``max_per_company`` rows of its
    company have already been accepted.

    Parameters
    ----------
    profile_index : int
        Zero-based row position of the source profile.
    top_n : int, default 10
        Maximum number of recommendations; values <= 0 give an empty result.
    min_similarity : float, default 0.6
        Candidates scoring below this value are dropped.
    max_per_company : int, default 3
        Maximum number of accepted recommendations per ``companyName``.

    Returns
    -------
    pandas.DataFrame
        Columns ``similarity_score`` (rounded to 4 decimals),
        ``recommended_posTitle``, ``companyName``, ``country``,
        ``avgMemberPosDuration``, ``avgCompanyPosDuration``. The columns are
        present even when no row qualifies.

    Raises
    ------
    ValueError
        If ``profile_index`` is negative or not smaller than ``len(df)``.
    """
    if profile_index < 0:
        # A negative index would wrap around to the end of the matrix and the
        # self-match filter below would no longer recognise the source row.
        raise ValueError("profile_index must be non-negative")
    if profile_index >= df.shape[0]:
        raise ValueError("profile_index melebihi jumlah data")

    row = similarity_matrix[profile_index]
    if hasattr(row, "toarray"):
        row = row.toarray()
    # float64 so the threshold is compared at full precision.
    scores = np.asarray(row, dtype=float).ravel()

    # Filter first (threshold, then the profile itself), then rank only the
    # survivors instead of sorting every row of the matrix.
    candidates = np.flatnonzero(scores >= min_similarity)
    candidates = candidates[candidates != profile_index]
    ranked = candidates[np.argsort(-scores[candidates], kind="stable")]

    columns = [
        "similarity_score",
        "recommended_posTitle",
        "companyName",
        "country",
        "avgMemberPosDuration",
        "avgCompanyPosDuration",
    ]

    accepted_per_company = {}
    results = []

    for idx in ranked:
        # Checked before accepting anything, so top_n <= 0 yields no rows.
        if len(results) >= top_n:
            break

        # Matrix indices are positions, hence .iloc rather than label lookup.
        company = df["companyName"].iloc[idx]
        accepted = accepted_per_company.get(company, 0)
        if accepted >= max_per_company:
            continue
        accepted_per_company[company] = accepted + 1

        results.append({
            "similarity_score": round(float(scores[idx]), 4),
            "recommended_posTitle": df["posTitle"].iloc[idx],
            "companyName": company,
            "country": df["country"].iloc[idx],
            "avgMemberPosDuration": df["avgMemberPosDuration"].iloc[idx],
            "avgCompanyPosDuration": df["avgCompanyPosDuration"].iloc[idx],
        })

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(results, columns=columns)


In [43]:
idx = 10

# Run both recommenders on the same profile with the same top_n so the tables
# can be compared side by side.
for banner, recommender in (
    ("=== TANPA TUNING ===", recommend_jobs),
    ("\n=== DENGAN TUNING ===", recommend_jobs_tuned),
):
    print(banner)
    display(recommender(idx, top_n=5))


=== TANPA TUNING ===


Unnamed: 0,similarity_score,recommended_posTitle,companyName,country,avgMemberPosDuration,avgCompanyPosDuration
0,0.7535,brand system graduate,ibm,au,395.2857,1188.7339
1,0.7138,brand expression lead,ibm,au,395.2857,1188.7339
2,0.7129,brand strategist,ibm,au,395.2857,1188.7339
3,0.7076,social brand marketing,ibm,au,395.2857,1188.7339
4,0.6079,anz service management manager,ibm,au,707.5,1188.7339



=== DENGAN TUNING ===


Unnamed: 0,similarity_score,recommended_posTitle,companyName,country,avgMemberPosDuration,avgCompanyPosDuration
0,0.7535,brand system graduate,ibm,au,395.2857,1188.7339
1,0.7138,brand expression lead,ibm,au,395.2857,1188.7339
2,0.7129,brand strategist,ibm,au,395.2857,1188.7339


Hasil pengujian menunjukkan bahwa penerapan tuning berupa similarity threshold dan pembatasan jumlah rekomendasi per perusahaan mampu mengurangi redundansi rekomendasi. Tanpa tuning, sistem cenderung menghasilkan rekomendasi yang homogen dari satu perusahaan yang sama. Setelah tuning diterapkan, dominasi satu perusahaan dibatasi sehingga rekomendasi berpotensi menjadi lebih beragam, meskipun jumlah item yang direkomendasikan dapat berkurang: pada contoh indeks 10, hasil turun dari 5 menjadi 3 rekomendasi (seluruhnya tetap dari ibm) karena batas maksimal 3 rekomendasi per perusahaan. Hal ini menunjukkan adanya trade-off antara relevansi dan diversitas.