In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse

In [2]:
# Location of the pickled feature matrix produced by 03_feature_engineering.ipynb.
input_file = '../dataset/cooked/tfidf_matrix.pkl'

print(f"Loading feature matrix from {input_file}...")

try:
    with open(input_file, 'rb') as f:
        # The previous notebook dumped the final_features variable directly,
        # so the pickle payload is the matrix itself (no wrapper dict/tuple).
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # files that were produced by this project's own pipeline.
        feature_matrix = pickle.load(f)
except FileNotFoundError:
    print(f"Error: File {input_file} tidak ditemukan. Pastikan Anda sudah menjalankan 03_feature_engineering.ipynb")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() ends
    # the kernel session rather than just failing this cell, which discards all
    # state and hides the traceback. Raising stops "Run All" at this cell.
    raise

# Only reached when the load succeeded; kept outside the try so that the
# handler above is scoped strictly to the file-open/load step.
print("Feature matrix loaded successfully.")
print(f"Matrix type: {type(feature_matrix)}")
print(f"Matrix shape: {feature_matrix.shape}")

Loading feature matrix from ../dataset/cooked/tfidf_matrix.pkl...
Feature matrix loaded successfully.
Matrix type: <class 'scipy.sparse._coo.coo_matrix'>
Matrix shape: (39537, 124253)


In [3]:
print("Menghitung Cosine Similarity...")

# Option A: with plenty of RAM (>16 GB), compute the full pairwise matrix in one call.
try:
    # dense_output=False keeps the result sparse when the input is sparse,
    # which saves memory compared to a dense n x n array.
    similarity_matrix = cosine_similarity(feature_matrix, dense_output=False)
except MemoryError:
    print("Memory Error! RAM tidak cukup untuk menghitung seluruh matriks sekaligus.")
    print("Saran: Gunakan teknik chunking atau library 'Annoy' / 'Faiss' untuk data besar.")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() ends
    # the kernel session rather than just failing this cell. Raising halts
    # execution here and keeps the traceback and the already-loaded
    # feature_matrix available for a retry with a different strategy.
    raise

# Only reached when the computation succeeded.
print("Perhitungan selesai.")
print(f"Similarity matrix shape: {similarity_matrix.shape}")

Menghitung Cosine Similarity...
Perhitungan selesai.
Similarity matrix shape: (39537, 39537)


In [4]:
# Destination path for the serialized similarity matrix.
output_file = '../dataset/cooked/similarity_matrix.pkl'

print(f"Menyimpan Similarity Matrix ke {output_file}...")

# Write the matrix through an explicit Pickler bound to the binary file handle
# (default protocol, same as a plain pickle.dump call).
with open(output_file, mode='wb') as f:
    writer = pickle.Pickler(f)
    writer.dump(similarity_matrix)

print("Berhasil disimpan!")

Menyimpan Similarity Matrix ke ../dataset/cooked/similarity_matrix.pkl...
Berhasil disimpan!
