In [1]:
import pandas as pd
import numpy as np
import psycopg2
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, Flatten, Concatenate, Input
from tensorflow.keras.optimizers import Adam
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle
import os
import hashlib
from datetime import datetime
import logging

In [2]:
# Notebook-wide logger; INFO and above are routed through the root handler.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [29]:
class PostgreSQLContentBasedRecommender:
    def __init__(self, db_config, model_path='models/'):
        """
        Set up an untrained recommender bound to a PostgreSQL database.

        Args:
            db_config (dict): Keyword arguments forwarded to ``psycopg2.connect``.
            model_path (str): Directory used to save/load model artifacts;
                it is created when it does not exist yet.
        """
        # Artifact location on disk (created eagerly so later saves cannot fail on it).
        self.model_path = model_path
        os.makedirs(self.model_path, exist_ok=True)

        # Connection settings, used by get_db_connection().
        self.db_config = db_config

        # Learned state: filled in by fit() or restored by load_model().
        self.model = None
        self.tfidf_vectorizer = None
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.similarity_matrix = None
        self.articles_data = None
        self.data_hash = None
        
    def get_db_connection(self):
        """Open a new psycopg2 connection using ``self.db_config``.

        Returns:
            The connection object returned by ``psycopg2.connect``.

        Raises:
            Exception: Any connection error is logged and then re-raised unchanged.
        """
        try:
            return psycopg2.connect(**self.db_config)
        except Exception as e:
            logger.error(f"Database connection failed: {e}")
            raise
    
    def load_data_from_db(self):
        """Fetch active articles plus all likes and comments from PostgreSQL.

        Returns:
            tuple: ``(articles_df, likes_df, comments_df)`` as pandas DataFrames.

        Raises:
            Exception: Connection or query errors are logged and re-raised.
        """
        # Only active articles, newest first.
        articles_query = """
            SELECT id, title, slug, province, city, active, user_id, created_at, updated_at
            FROM "Articles" as articles 
            WHERE active = true
            ORDER BY created_at DESC
            """
        likes_query = """
            SELECT id, article_id, user_id, created_at
            FROM "Article_likes" as article_likes
            """
        comments_query = """
            SELECT id, article_id, user_id, created_at
            from "Article_comments" aS  article_comments
            """

        conn = self.get_db_connection()
        try:
            # Run the three queries in order on the same connection.
            articles_df, likes_df, comments_df = (
                pd.read_sql(query, conn)
                for query in (articles_query, likes_query, comments_query)
            )
            logger.info(f"Loaded {len(articles_df)} articles, {len(likes_df)} likes, {len(comments_df)} comments")
            return articles_df, likes_df, comments_df
        except Exception as e:
            logger.error(f"Error loading data from database: {e}")
            raise
        finally:
            # Always release the connection, whether the queries succeeded or not.
            conn.close()
    
    def calculate_data_hash(self, articles_df, likes_df, comments_df):
        """Calculate hash of the data to detect changes"""
        data_string = (
            str(articles_df.shape) + 
            str(likes_df.shape) + 
            str(comments_df.shape) +
            str(articles_df['updated_at'].max()) +
            str(likes_df['created_at'].max() if not likes_df.empty else '') +
            str(comments_df['created_at'].max() if not comments_df.empty else '')
        )
        return hashlib.md5(data_string.encode()).hexdigest()
    
    def preprocess_data_old(self, articles_df, likes_df, comments_df):
        """Preprocess the loaded data"""
        # Ensure all IDs are strings
        articles_df['id'] = articles_df['id'].astype(str)
        likes_df['article_id'] = likes_df['article_id'].astype(str)
        comments_df['article_id'] = comments_df['article_id'].astype(str)
        
        # Calculate likes per article
        article_likes = likes_df.groupby('article_id').size().reset_index(name='likes_count')
        
        # Calculate comments per article
        article_comments = comments_df.groupby('article_id').size().reset_index(name='comments_count')
        
        # Combine data with articles
        articles_enriched = articles_df.copy()
        
        # Add likes count
        articles_enriched = articles_enriched.merge(
            article_likes, left_on='id', right_on='article_id', how='left'
        )
        articles_enriched['likes_count'] = articles_enriched['likes_count'].fillna(0)
        
        # Add comments count
        articles_enriched = articles_enriched.merge(
            article_comments, left_on='id', right_on='article_id', how='left'
        )
        articles_enriched['comments_count'] = articles_enriched['comments_count'].fillna(0)
        
        # Clean up merge columns
        articles_enriched = articles_enriched.drop(['article_id_x', 'article_id_y'], axis=1, errors='ignore')
        
        # Create text features
        articles_enriched['text_features'] = (
            articles_enriched['title'].fillna('') + ' ' + 
            articles_enriched['province'].fillna('') + ' ' + 
            articles_enriched['city'].fillna('')
        )
        
        # Calculate engagement score
        articles_enriched['engagement_score'] = (
            articles_enriched['likes_count'] + (2 * articles_enriched['comments_count'])
        )
        
        # Add time-based features
        articles_enriched['created_at'] = pd.to_datetime(articles_enriched['created_at'])
        articles_enriched['days_since_creation'] = (
            (datetime.now() - articles_enriched['created_at']).dt.days
        )
        
        # Calculate recency score (newer articles get higher scores)
        max_days = articles_enriched['days_since_creation'].max()
        articles_enriched['recency_score'] = 1 - (articles_enriched['days_since_creation'] / max_days)
        
        return articles_enriched
    def preprocess_data(self, articles_df, likes_df, comments_df):
        """Preprocess the loaded data"""
        # Ensure all IDs are strings
        articles_df['id'] = articles_df['id'].astype(str)
        likes_df['article_id'] = likes_df['article_id'].astype(str)
        comments_df['article_id'] = comments_df['article_id'].astype(str)
        
        # Calculate likes per article
        article_likes = likes_df.groupby('article_id').size().reset_index(name='likes_count')
        
        # Calculate comments per article
        article_comments = comments_df.groupby('article_id').size().reset_index(name='comments_count')
        
        # Combine data with articles
        articles_enriched = articles_df.copy()
        
        # Add likes count
        articles_enriched = articles_enriched.merge(
            article_likes, left_on='id', right_on='article_id', how='left'
        )
        articles_enriched['likes_count'] = articles_enriched['likes_count'].fillna(0)
        
        # Add comments count
        articles_enriched = articles_enriched.merge(
            article_comments, left_on='id', right_on='article_id', how='left'
        )
        articles_enriched['comments_count'] = articles_enriched['comments_count'].fillna(0)
        
        # Clean up merge columns
        articles_enriched = articles_enriched.drop(['article_id_x', 'article_id_y'], axis=1, errors='ignore')
        
        # Create text features
        articles_enriched['text_features'] = (
            articles_enriched['title'].fillna('') + ' ' + 
            articles_enriched['province'].fillna('') + ' ' + 
            articles_enriched['city'].fillna('')
        )
        
        # Calculate engagement score
        articles_enriched['engagement_score'] = (
            articles_enriched['likes_count'] + (2 * articles_enriched['comments_count'])
        )
        
        print(articles_enriched.head(5))
        print(articles_enriched.info())
        # Add time-based features - FIX FOR TIMEZONE ISSUE
        articles_enriched['created_at'] = pd.to_datetime(articles_enriched['created_at'])
        
        # Handle timezone-aware datetime comparison
        if articles_enriched['created_at'].dt.tz is not None:
            # Data is timezone-aware (UTC), so use UTC for current time
            from datetime import timezone
            now = datetime.now(timezone.utc)
        else:
            # Data is timezone-naive, use naive datetime
            now = datetime.now()
        
        articles_enriched['days_since_creation'] = (
            (now - articles_enriched['created_at']).dt.days
        )
        
        # Calculate recency score (newer articles get higher scores)
        max_days = articles_enriched['days_since_creation'].max()
        if max_days > 0:
            articles_enriched['recency_score'] = 1 - (articles_enriched['days_since_creation'] / max_days)
        else:
            # If all articles are from today, give them all the same recency score
            articles_enriched['recency_score'] = 1.0
        
        return articles_enriched
    def prepare_features(self, articles_enriched):
        """Turn the enriched articles frame into the three model input groups.

        Preprocessors (TF-IDF vectorizer, label encoders, scaler) are fitted on
        first use and only applied on later calls. ``articles_enriched`` itself
        is never modified.

        Args:
            articles_enriched (pd.DataFrame): Output of ``preprocess_data``;
                needs ``text_features``, ``province``, ``city``, ``likes_count``,
                ``comments_count``, ``engagement_score`` and ``recency_score``.

        Returns:
            tuple: ``(tfidf_features, categorical_features, numerical_features)``
            where ``categorical_features`` is a list with one ``(n, 1)`` integer
            array per categorical column (province, city).

        Raises:
            ValueError: If an already-fitted encoder meets an unseen category
                and has no ``'unknown'`` class to fall back on.
        """
        text = articles_enriched['text_features']
        if self.tfidf_vectorizer is None:
            # NOTE(review): stop_words='english' is kept from the original; the
            # sample titles in this notebook look Indonesian - confirm intent.
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=1000,
                stop_words='english',
                ngram_range=(1, 2)
            )
            tfidf_features = self.tfidf_vectorizer.fit_transform(text)
        else:
            tfidf_features = self.tfidf_vectorizer.transform(text)

        categorical_features = []
        for col in ['province', 'city']:
            # fillna returns a new Series, so the caller's column stays intact.
            values = articles_enriched[col].fillna('unknown')

            if col not in self.label_encoders:
                encoder = LabelEncoder()
                # Always reserve an 'unknown' class so that categories first
                # seen at transform time have a valid code to map to.
                encoder.fit(pd.concat([values, pd.Series(['unknown'])], ignore_index=True))
                self.label_encoders[col] = encoder
            else:
                encoder = self.label_encoders[col]
                known_classes = set(encoder.classes_)
                unseen = ~values.isin(known_classes)
                if unseen.any():
                    if 'unknown' not in known_classes:
                        # Encoders fitted before the 'unknown' class was
                        # reserved cannot represent new categories.
                        raise ValueError(
                            f"Column '{col}' contains categories unseen during fitting "
                            "and the fitted encoder has no 'unknown' class; retrain the model"
                        )
                    values = values.where(~unseen, 'unknown')

            encoded = encoder.transform(values)
            categorical_features.append(encoded.reshape(-1, 1))

        # Numerical features, standardized with a scaler fitted on first use.
        numerical_cols = ['likes_count', 'comments_count', 'engagement_score', 'recency_score']
        numerical_features = articles_enriched[numerical_cols].values

        if not getattr(self, 'scaler_fitted', False):
            numerical_features = self.scaler.fit_transform(numerical_features)
            self.scaler_fitted = True
        else:
            numerical_features = self.scaler.transform(numerical_features)

        return tfidf_features, categorical_features, numerical_features
    
    def build_model(self, tfidf_dim, categorical_dims, numerical_dim):
        """Assemble and compile the multi-input Keras model.

        Args:
            tfidf_dim (int): Width of the TF-IDF input vector.
            categorical_dims (list[int]): Number of classes per categorical
                input; each gets its own embedding.
            numerical_dim (int): Width of the numerical input vector.

        Returns:
            tf.keras.Model: Compiled model (Adam, MSE loss, MAE metric) whose
            output layer ``content_embedding`` has 32 linear units. Inputs are
            ordered text, categorical inputs, numerical.
        """
        # Text branch: TF-IDF vector -> 128 -> 64
        text_input = Input(shape=(tfidf_dim,), name='text_input')
        text_branch = text_input
        for units in (128, 64):
            text_branch = Dense(units, activation='relu')(text_branch)

        # Categorical branches: one embedding (at most 50 wide) per column
        categorical_inputs, categorical_embeddings = [], []
        for i, dim in enumerate(categorical_dims):
            cat_input = Input(shape=(1,), name=f'cat_input_{i}')
            embedded = Embedding(dim, min(50, dim//2 + 1))(cat_input)
            flattened = Flatten()(embedded)
            categorical_inputs.append(cat_input)
            categorical_embeddings.append(flattened)

        # Numerical branch
        num_input = Input(shape=(numerical_dim,), name='num_input')
        num_branch = Dense(32, activation='relu')(num_input)

        # Merge every branch; with no categorical columns this is just text + numeric
        merged = Concatenate()([text_branch, *categorical_embeddings, num_branch])

        # Shared head ending in the 32-d content embedding
        for units in (128, 64):
            merged = Dense(units, activation='relu')(merged)
        embedding_output = Dense(32, activation='linear', name='content_embedding')(merged)

        model = tf.keras.Model(
            inputs=[text_input, *categorical_inputs, num_input],
            outputs=embedding_output,
        )
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
        return model
    
    def should_retrain_old(self, current_hash):
        """Check if model should be retrained"""
        if self.data_hash is None or self.data_hash != current_hash:
            return True
        
        model_files = [
            os.path.join(self.model_path, 'content_model.h5'),
            os.path.join(self.model_path, 'tfidf_vectorizer.pkl'),
            os.path.join(self.model_path, 'scaler.pkl'),
            os.path.join(self.model_path, 'label_encoders.pkl')
        ]
        
        return not all(os.path.exists(f) for f in model_files)
    def should_retrain(self, current_hash):
        """Check if model should be retrained"""
        if self.data_hash is None or self.data_hash != current_hash:
            return True
        
        # Check for both new and old model formats
        model_files = [
            os.path.join(self.model_path, 'tfidf_vectorizer.pkl'),
            os.path.join(self.model_path, 'scaler.pkl'),
            os.path.join(self.model_path, 'label_encoders.pkl')
        ]
        
        # Check if either model format exists
        keras_model_exists = os.path.exists(os.path.join(self.model_path, 'content_model.keras'))
        h5_model_exists = os.path.exists(os.path.join(self.model_path, 'content_model.h5'))
        
        model_exists = keras_model_exists or h5_model_exists
        other_files_exist = all(os.path.exists(f) for f in model_files)
        
        return not (model_exists and other_files_exist)
    
    def fit(self, force_retrain=False):
        """Train the content model, or reuse saved artifacts when data is unchanged.

        Steps: load the tables, hash them, and either restore the saved model
        (hash unchanged and artifacts present) or retrain from scratch, rebuild
        the article-to-article cosine similarity matrix and save everything.

        Args:
            force_retrain (bool): Retrain even when ``should_retrain`` says the
                saved artifacts are still valid.
        """
        # Load data from database
        articles_df, likes_df, comments_df = self.load_data_from_db()

        # Calculate data hash
        current_hash = self.calculate_data_hash(articles_df, likes_df, comments_df)

        # Check if retraining is needed
        if not force_retrain and not self.should_retrain(current_hash):
            logger.info("No changes detected in data. Loading existing model...")
            self.load_model()
            return

        logger.info("Training model with new/updated data...")

        # A retrain must refit the preprocessors as well. Without this reset,
        # prepare_features() would reuse the vectorizer/encoders/scaler fitted
        # on the previous snapshot and the new model would be trained on
        # features that ignore new vocabulary and categories.
        self.tfidf_vectorizer = None
        self.label_encoders = {}
        self.scaler = StandardScaler()
        vars(self).pop('scaler_fitted', None)

        # Preprocess data
        self.articles_data = self.preprocess_data(articles_df, likes_df, comments_df)

        # Prepare features
        tfidf_features, categorical_features, numerical_features = self.prepare_features(self.articles_data)

        # Build model
        tfidf_dim = tfidf_features.shape[1]
        categorical_dims = [len(encoder.classes_) for encoder in self.label_encoders.values()]
        numerical_dim = numerical_features.shape[1]

        self.model = self.build_model(tfidf_dim, categorical_dims, numerical_dim)

        # Prepare training data
        X_text = tfidf_features.toarray()
        X_categorical = [cat_feat.flatten() for cat_feat in categorical_features]
        X_numerical = numerical_features

        # Autoencoder-like target: the first 32 TF-IDF columns followed by the
        # scaled numeric features, truncated or zero-padded to exactly 32
        # columns to match the model's output width.
        y = np.concatenate([X_text[:, :32], X_numerical], axis=1)
        if y.shape[1] > 32:
            y = y[:, :32]
        elif y.shape[1] < 32:
            y = np.pad(y, ((0, 0), (0, 32 - y.shape[1])), mode='constant')

        # Train model (input order must match build_model: text, categoricals, numeric)
        X_train = [X_text] + X_categorical + [X_numerical]

        self.model.fit(
            X_train, y,
            epochs=50,
            batch_size=32,
            validation_split=0.2,
            verbose=1
        )

        # Calculate similarity matrix from the learned 32-d embeddings
        embeddings = self.model.predict(X_train)
        self.similarity_matrix = cosine_similarity(embeddings)

        # Update data hash
        self.data_hash = current_hash

        # Save model
        self.save_model()

        logger.info("Model training completed and saved!")
    
    def recommend(self, article_id, top_n=5):
        """Generate recommendations for a given article"""
        if self.model is None or self.articles_data is None:
            self.fit()
        
        # Find article index
        article_idx = self.articles_data[self.articles_data['id'] == str(article_id)].index
        
        if len(article_idx) == 0:
            logger.warning(f"Article ID {article_id} not found")
            return pd.DataFrame()
        
        article_idx = article_idx[0]
        
        # Get similarity scores
        sim_scores = list(enumerate(self.similarity_matrix[article_idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        # Get top recommendations (excluding the article itself)
        sim_scores = sim_scores[1:top_n+1]
        article_indices = [i[0] for i in sim_scores]
        
        # Return recommended articles
        recommendations = self.articles_data.iloc[article_indices].copy()
        recommendations['similarity_score'] = [score[1] for score in sim_scores]
        
        return recommendations[['id', 'title', 'province', 'city', 'engagement_score', 'similarity_score']]
    
    def save_model_old(self):
        """Save the trained model and preprocessors"""
        # Save TensorFlow model
        self.model.save(os.path.join(self.model_path, 'content_model.h5'))
        
        # Save preprocessors
        with open(os.path.join(self.model_path, 'tfidf_vectorizer.pkl'), 'wb') as f:
            pickle.dump(self.tfidf_vectorizer, f)
        
        with open(os.path.join(self.model_path, 'scaler.pkl'), 'wb') as f:
            pickle.dump(self.scaler, f)
        
        with open(os.path.join(self.model_path, 'label_encoders.pkl'), 'wb') as f:
            pickle.dump(self.label_encoders, f)
        
        with open(os.path.join(self.model_path, 'similarity_matrix.pkl'), 'wb') as f:
            pickle.dump(self.similarity_matrix, f)
        
        with open(os.path.join(self.model_path, 'articles_data.pkl'), 'wb') as f:
            pickle.dump(self.articles_data, f)
        
        with open(os.path.join(self.model_path, 'data_hash.pkl'), 'wb') as f:
            pickle.dump(self.data_hash, f)
        
        logger.info("Model and preprocessors saved successfully!")
    def save_model(self):
        """Save the trained model and preprocessors"""
        # Save TensorFlow model in native Keras format (recommended)
        self.model.save(os.path.join(self.model_path, 'content_model.keras'))
        
        # Save preprocessors
        with open(os.path.join(self.model_path, 'tfidf_vectorizer.pkl'), 'wb') as f:
            pickle.dump(self.tfidf_vectorizer, f)
        
        with open(os.path.join(self.model_path, 'scaler.pkl'), 'wb') as f:
            pickle.dump(self.scaler, f)
        
        with open(os.path.join(self.model_path, 'label_encoders.pkl'), 'wb') as f:
            pickle.dump(self.label_encoders, f)
        
        with open(os.path.join(self.model_path, 'similarity_matrix.pkl'), 'wb') as f:
            pickle.dump(self.similarity_matrix, f)
        
        with open(os.path.join(self.model_path, 'articles_data.pkl'), 'wb') as f:
            pickle.dump(self.articles_data, f)
        
        with open(os.path.join(self.model_path, 'data_hash.pkl'), 'wb') as f:
            pickle.dump(self.data_hash, f)
        
        logger.info("Model and preprocessors saved successfully!")    
    
    def load_model_old(self):
        """Restore the legacy HDF5 model and all pickled preprocessors.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            # Keras model from the legacy .h5 container
            self.model = load_model(os.path.join(self.model_path, 'content_model.h5'))

            # Each attribute is restored from the pickle named after it.
            # NOTE: pickle executes code on load - only use trusted model directories.
            for attribute in (
                'tfidf_vectorizer',
                'scaler',
                'label_encoders',
                'similarity_matrix',
                'articles_data',
                'data_hash',
            ):
                with open(os.path.join(self.model_path, attribute + '.pkl'), 'rb') as f:
                    setattr(self, attribute, pickle.load(f))
                if attribute == 'scaler':
                    # Tells prepare_features() to transform instead of refit.
                    self.scaler_fitted = True

            logger.info("Model and preprocessors loaded successfully!")

        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise
    def load_model(self):
        """Load the trained model and preprocessors"""
        try:
            # Try to load new format first, then fallback to old format
            keras_model_path = os.path.join(self.model_path, 'content_model.keras')
            h5_model_path = os.path.join(self.model_path, 'content_model.h5')
            
            if os.path.exists(keras_model_path):
                # Load TensorFlow model (new format)
                self.model = load_model(keras_model_path)
            elif os.path.exists(h5_model_path):
                # Load TensorFlow model (old format)
                self.model = load_model(h5_model_path)
                logger.warning("Loaded model from legacy HDF5 format. Consider retraining to save in new format.")
            else:
                raise FileNotFoundError("No model file found (neither .keras nor .h5)")
            
            # Load preprocessors
            with open(os.path.join(self.model_path, 'tfidf_vectorizer.pkl'), 'rb') as f:
                self.tfidf_vectorizer = pickle.load(f)
            
            with open(os.path.join(self.model_path, 'scaler.pkl'), 'rb') as f:
                self.scaler = pickle.load(f)
                self.scaler_fitted = True
            
            with open(os.path.join(self.model_path, 'label_encoders.pkl'), 'rb') as f:
                self.label_encoders = pickle.load(f)
            
            with open(os.path.join(self.model_path, 'similarity_matrix.pkl'), 'rb') as f:
                self.similarity_matrix = pickle.load(f)
            
            with open(os.path.join(self.model_path, 'articles_data.pkl'), 'rb') as f:
                self.articles_data = pickle.load(f)
            
            with open(os.path.join(self.model_path, 'data_hash.pkl'), 'rb') as f:
                self.data_hash = pickle.load(f)
            
            logger.info("Model and preprocessors loaded successfully!")
            
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise       

In [30]:
 # Database configuration
# Connection settings are read from the environment so that no secret is stored
# in the notebook (the variable names match the .env-based cell further down).
# NOTE: the password that used to be hard-coded here has been exposed to anyone
# with access to this notebook and should be rotated.
db_config = {
    'host': os.environ.get('DB_HOST'),
    'database': os.environ.get('DB_NAME'),
    'user': os.environ.get('DB_USER'),
    'password': os.environ.get('DB_PASSWORD'),
    'port': int(os.environ.get('DB_PORT', 5432)),  # PostgreSQL default port
}

In [44]:
import os
from dotenv import load_dotenv

# Copy the settings from a local .env file into the process environment.
load_dotenv()

# Database connection parameters, one environment variable per field.
db_config = dict(
    host=os.getenv('DB_HOST'),
    database=os.getenv('DB_NAME'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    port=int(os.getenv('DB_PORT', 5432)),  # defaults to 5432
)


In [45]:
# Create the recommender from the database settings in db_config
recommender = PostgreSQLContentBasedRecommender(db_config=db_config)

In [46]:
# Train the model; fit() itself decides whether retraining is actually needed
recommender.fit(force_retrain=False)

  articles_df = pd.read_sql(articles_query, conn)
  likes_df = pd.read_sql(likes_query, conn)
  comments_df = pd.read_sql(comments_query, conn)
INFO:__main__:Loaded 48 articles, 4 likes, 1 comments
INFO:__main__:Training model with new/updated data...


                                     id  \
0  8959dc75-d87a-4c13-9fc7-feead6b13b45   
1  6b1de6e6-d2ac-4094-ae81-1d9c39551e61   
2  d287687c-a50a-42d0-93db-2bcb2fb43ec8   
3  026b1b59-e6d1-42d3-a4ff-42967b55a526   
4  0fb0bc6c-2d97-4b18-a6cd-ef5fd3f1414f   

                                             title  \
0  Cagar Alam Kawah Ijen di Banyuwangi, Jawa Timur   
1      Cagar Alam Maninjau di Agam, Sumatera Barat   
2                                       Seren Raun   
3     Tradisi Pasola di Sumba, Nusa Tenggara Timur   
4                Suku Asmat dan Seni Ukir di Papua   

                                             slug             province  \
0  cagar-alam-kawah-ijen-di-banyuwangi-jawa-timur           Jawa Timur   
1      cagar-alam-maninjau-di-agam-sumatera-barat       Sumatera Barat   
2                                      seren-raun               Banten   
3     tradisi-pasola-di-sumba-nusa-tenggara-timur  Nusa Tenggara Timur   
4               suku-asmat-dan-seni-ukir-di-pa

INFO:__main__:Model and preprocessors saved successfully!
INFO:__main__:Model training completed and saved!


In [36]:
# Ask for the ten articles most similar to a known article
article_id = '8959dc75-d87a-4c13-9fc7-feead6b13b45'
recommendations = recommender.recommend(article_id=article_id, top_n=10)

In [51]:
# Print the id and the title of every recommendation with its similarity score
print(f"Rekomendasi untuk artikel ID '{article_id}':")
for row in recommendations.to_dict('records'):
    print(f"- {row['id']} (Skor: {row['similarity_score']:.4f})")
    print(f"- {row['title']} (Skor: {row['similarity_score']:.4f})")

Rekomendasi untuk artikel ID '8959dc75-d87a-4c13-9fc7-feead6b13b45':
- 73294f35-0382-4af7-8930-476919f550d9 (Skor: 0.3888)
- Cagar Alam Lembah Harau: Keindahan Alam yang Menakjubkan (Skor: 0.3888)
- 026b1b59-e6d1-42d3-a4ff-42967b55a526 (Skor: 0.2204)
-  Tradisi Pasola di Sumba, Nusa Tenggara Timur (Skor: 0.2204)
- 27bbfb6f-5fd1-4a54-8317-f95f481ecfd7 (Skor: 0.1833)
- Taman Nasional Ujung Kulon (Skor: 0.1833)
- aa6a9d8c-797d-4c3b-a1af-59e2594f421e (Skor: 0.1672)
- Upacara Ngaben di Bali (Skor: 0.1672)
- 6b1de6e6-d2ac-4094-ae81-1d9c39551e61 (Skor: 0.1547)
- Cagar Alam Maninjau di Agam, Sumatera Barat (Skor: 0.1547)
- d764ffa1-9d2d-4d2e-98f5-142c120df86e (Skor: 0.1505)
- Bunaken (Skor: 0.1505)
- 58feb5ad-f9ba-487f-bb7f-8bde07fa8733 (Skor: 0.1417)
- Jelajah Keraton Surakarta (Skor: 0.1417)
- 4399b0cb-a105-4207-a257-970a199d1d6a (Skor: 0.1262)
- Tugu JOGJA (Skor: 0.1262)
- 8474948f-5892-4035-9e26-9841e1286cda (Skor: 0.1047)
- Keindahan Pantai Carocok: Surga Bahari di Pesisir Selatan (Skor: 

In [38]:
# Force a retrain: force_retrain=True makes fit() skip the saved-artifact
# check and train again on the data currently in the database
recommender.fit(force_retrain=True)

  articles_df = pd.read_sql(articles_query, conn)
  likes_df = pd.read_sql(likes_query, conn)
  comments_df = pd.read_sql(comments_query, conn)
INFO:__main__:Loaded 48 articles, 4 likes, 1 comments
INFO:__main__:Training model with new/updated data...


                                     id  \
0  8959dc75-d87a-4c13-9fc7-feead6b13b45   
1  6b1de6e6-d2ac-4094-ae81-1d9c39551e61   
2  d287687c-a50a-42d0-93db-2bcb2fb43ec8   
3  026b1b59-e6d1-42d3-a4ff-42967b55a526   
4  0fb0bc6c-2d97-4b18-a6cd-ef5fd3f1414f   

                                             title  \
0  Cagar Alam Kawah Ijen di Banyuwangi, Jawa Timur   
1      Cagar Alam Maninjau di Agam, Sumatera Barat   
2                                       Seren Raun   
3     Tradisi Pasola di Sumba, Nusa Tenggara Timur   
4                Suku Asmat dan Seni Ukir di Papua   

                                             slug             province  \
0  cagar-alam-kawah-ijen-di-banyuwangi-jawa-timur           Jawa Timur   
1      cagar-alam-maninjau-di-agam-sumatera-barat       Sumatera Barat   
2                                      seren-raun               Banten   
3     tradisi-pasola-di-sumba-nusa-tenggara-timur  Nusa Tenggara Timur   
4               suku-asmat-dan-seni-ukir-di-pa

INFO:__main__:Model and preprocessors saved successfully!
INFO:__main__:Model training completed and saved!


In [42]:
# Query the retrained model for the same article again
article_id = '8959dc75-d87a-4c13-9fc7-feead6b13b45'
recommendations = recommender.recommend(article_id=article_id, top_n=10)

In [43]:
# Print the title of every recommendation with its similarity score
print(f"Rekomendasi untuk artikel ID '{article_id}':")
for row in recommendations.to_dict('records'):
    print(f"- {row['title']} (Skor: {row['similarity_score']:.4f})")

Rekomendasi untuk artikel ID '8959dc75-d87a-4c13-9fc7-feead6b13b45':
- Cagar Alam Lembah Harau: Keindahan Alam yang Menakjubkan (Skor: 0.3888)
-  Tradisi Pasola di Sumba, Nusa Tenggara Timur (Skor: 0.2204)
- Taman Nasional Ujung Kulon (Skor: 0.1833)
- Upacara Ngaben di Bali (Skor: 0.1672)
- Cagar Alam Maninjau di Agam, Sumatera Barat (Skor: 0.1547)
- Bunaken (Skor: 0.1505)
- Jelajah Keraton Surakarta (Skor: 0.1417)
- Tugu JOGJA (Skor: 0.1262)
- Keindahan Pantai Carocok: Surga Bahari di Pesisir Selatan (Skor: 0.1047)
- Tari Barong di Desa Batubulan (Skor: 0.0999)
