In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import json
from collections import Counter
from math import log


In [13]:
# 1. LOAD & PREPROCESSING DATA
def load_data():
    # Nanti Akan Load Data dari Database
    articles_df = pd.read_csv('data/articles_202505071725.csv')
    likes_df = pd.read_csv('data/article_likes_202505071726.csv')
    comments_df = pd.read_csv('data/article_comments_202505071727.csv')
    return articles_df, likes_df, comments_df


In [51]:
def preprocess_data(articles_df, likes_df, comments_df):
    
      # Ensure all IDs are strings
    articles_df['id'] = articles_df['id'].astype(str)
    likes_df['article_id'] = likes_df['article_id'].astype(str)
    comments_df['article_id'] = comments_df['article_id'].astype(str)
    
    # Calculate likes per article
    article_likes = likes_df.groupby('article_id').size().reset_index(name='likes_count')
    
    # Calculate comments per article
    article_comments = comments_df.groupby('article_id').size().reset_index(name='comments_count')
    
    # Combine data with articles
    articles_enriched = articles_df.copy()
    articles_enriched['id'] = articles_enriched['id'].astype(str)
    
    # Add likes count
    articles_enriched = articles_enriched.merge(article_likes, left_on='id', right_on='article_id', how='left')
    articles_enriched['likes_count'] = articles_enriched['likes_count'].fillna(0)
    
    # Add comments count
    articles_enriched = articles_enriched.merge(article_comments, left_on='id', right_on='article_id', how='left')
    articles_enriched['comments_count'] = articles_enriched['comments_count'].fillna(0)
    
    # Clean up
    articles_enriched = articles_enriched.drop(['article_id_x', 'article_id_y'], axis=1, errors='ignore')
    
    # Extract features from article title using Text Processing
    # Combine title, province, and city for text-based features
    articles_enriched['text_features'] = articles_enriched['title'] + ' ' + articles_enriched['province'] + ' ' + articles_enriched['city']
    
    # Add engagement score
    articles_enriched['engagement_score'] = articles_enriched['likes_count'] + (2 * articles_enriched['comments_count'])
    
    return articles_enriched

In [57]:
class TFContentBasedRecommender:
    def __init__(self):
        self.vocab = {}
        self.idf = {}
        self.article_embeddings = None
        self.articles_df = None
        self.model = None
        self.model_path = 'models/tf_content_recommender'

    def _tokenize(self, text):
        return text.lower().split()

    def _build_vocab(self, texts):
        vocab_counter = Counter()
        for text in texts:
            tokens = self._tokenize(text)
            vocab_counter.update(tokens)
        self.vocab = {word: idx for idx, (word, _) in enumerate(vocab_counter.items())}
    
    def _compute_idf(self, texts):
        N = len(texts)
        df = Counter()
        for text in texts:
            tokens = set(self._tokenize(text))
            df.update(tokens)
        self.idf = {word: log(N / (df[word] + 1)) for word in self.vocab}

    def _tfidf_vector(self, text):
        tokens = self._tokenize(text)
        tf = Counter(tokens)
        vector = np.zeros(len(self.vocab))
        for token in tokens:
            if token in self.vocab:
                tf_val = tf[token] / len(tokens)
                idf_val = self.idf.get(token, 0)
                vector[self.vocab[token]] = tf_val * idf_val
        return vector

    def _build_model(self, input_dim):
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(128, activation='relu', input_shape=(input_dim,)),
            tf.keras.layers.Dense(64, activation='relu')
        ])
        return model

    def fit(self, articles_df):
        self.articles_df = articles_df.copy()
        texts = self.articles_df['text_features'].tolist()

        self._build_vocab(texts)
        self._compute_idf(texts)

        tfidf_matrix = np.array([self._tfidf_vector(text) for text in texts])
        self.model = self._build_model(input_dim=tfidf_matrix.shape[1])
        self.article_embeddings = self.model.predict(tfidf_matrix)

        return self

    def recommend(self, article_id, top_n=5):
        idx = self.articles_df[self.articles_df['id'] == article_id].index[0]
        query_embedding = self.article_embeddings[idx].reshape(1, -1)

        similarity = np.dot(self.article_embeddings, query_embedding.T).flatten()
        norms = np.linalg.norm(self.article_embeddings, axis=1) * np.linalg.norm(query_embedding)
        similarity = similarity / norms

        sim_indices = np.argsort(similarity)[::-1][1:top_n+1]
        recommended_articles = self.articles_df.iloc[sim_indices][['id', 'title', 'province', 'city']].copy()
        recommended_articles['similarity_score'] = similarity[sim_indices]

        return recommended_articles

    def save_model(self):
        os.makedirs(self.model_path, exist_ok=True)
        np.save(os.path.join(self.model_path, 'article_embeddings.npy'), self.article_embeddings)
        self.articles_df.to_pickle(os.path.join(self.model_path, 'articles_df.pkl'))

        with open(os.path.join(self.model_path, 'vocab.json'), 'w') as f:
            json.dump(self.vocab, f)
        with open(os.path.join(self.model_path, 'idf.json'), 'w') as f:
            json.dump(self.idf, f)

    def load_model(self):
        if os.path.exists(self.model_path):
            self.article_embeddings = np.load(os.path.join(self.model_path, 'article_embeddings.npy'))
            self.articles_df = pd.read_pickle(os.path.join(self.model_path, 'articles_df.pkl'))

            with open(os.path.join(self.model_path, 'vocab.json'), 'r') as f:
                self.vocab = json.load(f)
            with open(os.path.join(self.model_path, 'idf.json'), 'r') as f:
                self.idf = json.load(f)

            return True
        return False

    def update_model(self, new_articles_df):
        if self.articles_df is not None:
            existing_uuids = set(self.articles_df['id'].values)
            new_articles = new_articles_df[~new_articles_df['id'].isin(existing_uuids)]

            if len(new_articles) > 0:
                self.articles_df = pd.concat([self.articles_df, new_articles]).reset_index(drop=True)
                self.fit(self.articles_df)
                self.save_model()
                return True, f"Model updated with {len(new_articles)} new articles"
            return False, "No new articles to update"
        else:
            self.fit(new_articles_df)
            self.save_model()
            return True, f"Initial model created with {len(new_articles_df)} articles"


In [14]:
# Memuat data
articles, likes, comments = load_data()

In [40]:
# Ambil kolom-kolom yang diinginkan dari DataFrame
articles_df = articles[['id', 'title', 'slug', 'province', 'city', 'active', 'user_id']]


In [41]:
articles_df

Unnamed: 0,id,title,slug,province,city,active,user_id
0,1,Might group board positive campaign per partic...,arm-happy-book-win,North Carolina,Estradaborough,True,d50e8d4f-871d-4ef9-9a7f-d360180588a3
1,2,Build someone around eight past chance.,nothing-same-hand,Idaho,Reidberg,True,ba2372f3-5250-402a-ab7d-4012b924c020
2,3,Realize strong simply attorney.,data-skin-from,Tennessee,North Ronaldberg,True,d4b08900-100d-426f-8dff-5f8ddc134485
3,4,Allow million school.,study-view-wish,Oregon,West Brandonview,True,0f9cbb49-8e26-4f8b-9306-d810710feb87
4,5,Push stage need officer process.,notice-attack-kid,Delaware,West Cynthiaview,True,755c75c2-2642-471a-93d1-13d7920d5530
...,...,...,...,...,...,...,...
3995,3996,Again above box.,american-commercial,Utah,North Barbarastad,True,57480c0a-91c2-4067-9e5f-bac77e2fc557
3996,3997,Likely bit they into once.,student-site-line,Michigan,Jackport,True,28634318-d553-4ca7-8d85-c76012a9ea1c
3997,3998,Condition cut too somebody back couple approac...,old-response-late,Oklahoma,Matthewburgh,True,947aeccc-feca-4c40-af8c-683d019f7539
3998,3999,Trial evening indicate follow put yourself spe...,force-name-husband,Pennsylvania,Port Shelleyport,True,52f865af-89ed-450b-8c25-8396a628dfd3


In [43]:
# Ambil kolom-kolom yang diinginkan dari DataFrame
comments_df = comments[['id', 'article_id', 'user_id']]

In [44]:
comments_df

Unnamed: 0,id,article_id,user_id
0,51f47a74-5963-4578-b457-caac6e2a13e9,1,1
1,a79d17b3-17fd-4102-ae4a-6b74c8b343e2,2,2
2,8fb9249a-eb00-47be-8581-27b64a974e4a,3,1
3,c222b802-a46c-4c61-817b-3f7213824af4,4,2
4,c2566f5d-2016-485e-a7a8-2dc8a2f5862e,5,1
...,...,...,...
3995,d7bb2185-93f6-4a61-abae-877f53fa3b8e,3996,1
3996,2c02e141-1d36-403d-b9ed-9ea86e2d2d84,3997,3
3997,5b56536d-b693-46ba-8b62-f1b2e832abc5,3998,3
3998,472514e4-26bd-4f18-ac17-fea3e025cfc4,3999,4


In [45]:
# Ambil kolom-kolom yang diinginkan dari DataFrame
likes_df = likes[['id', 'article_id', 'user_id']]

In [47]:
likes_df

Unnamed: 0,id,article_id,user_id
0,f2d39235-dc3f-45f6-aa28-e9a5b743a2c9,1,2
1,80aa4790-8a72-4498-bb39-d8fd3f590b02,2,2
2,24b8ace1-1dd1-4d4c-a507-ce6fbe1cfefa,3,2
3,8b588322-5b48-4546-8afe-61344e1f9bff,4,2
4,8b12918c-099d-4f33-a9bb-8dec217e1779,5,2
...,...,...,...
7995,f4fcb644-2246-41d2-b677-464566a095f8,3996,1
7996,2bd1173e-a1f3-41d3-a150-20f0afa59d1d,3997,3
7997,eff3ae86-7d90-41e0-a52e-0a278f36dfb4,3998,3
7998,1b85c544-04cb-4ad4-8eea-cac9bb5a4eca,3999,4


In [52]:
# Proses data
articles_enriched = preprocess_data(articles_df, likes_df, comments_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_df['id'] = articles_df['id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  likes_df['article_id'] = likes_df['article_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comments_df['article_id'] = comments_df['article_id'].astype(str)


In [54]:
articles_enriched.head()

Unnamed: 0,id,title,slug,province,city,active,user_id,likes_count,comments_count,text_features,engagement_score
0,1,Might group board positive campaign per partic...,arm-happy-book-win,North Carolina,Estradaborough,True,d50e8d4f-871d-4ef9-9a7f-d360180588a3,2,1,Might group board positive campaign per partic...,4
1,2,Build someone around eight past chance.,nothing-same-hand,Idaho,Reidberg,True,ba2372f3-5250-402a-ab7d-4012b924c020,2,1,Build someone around eight past chance. Idaho ...,4
2,3,Realize strong simply attorney.,data-skin-from,Tennessee,North Ronaldberg,True,d4b08900-100d-426f-8dff-5f8ddc134485,2,1,Realize strong simply attorney. Tennessee Nort...,4
3,4,Allow million school.,study-view-wish,Oregon,West Brandonview,True,0f9cbb49-8e26-4f8b-9306-d810710feb87,2,1,Allow million school. Oregon West Brandonview,4
4,5,Push stage need officer process.,notice-attack-kid,Delaware,West Cynthiaview,True,755c75c2-2642-471a-93d1-13d7920d5530,2,1,Push stage need officer process. Delaware West...,4


In [58]:
# Inisialisasi model Content-Based Recommender
recommender = TFContentBasedRecommender()
recommender.fit(articles_enriched)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


<__main__.TFContentBasedRecommender at 0x1846121bc80>

In [63]:
# Mendapatkan rekomendasi
article_id = '2'  # ID artikel referensi
recommendations = recommender.recommend(article_id, top_n=3)
    
print(f"Rekomendasi untuk artikel '{articles_df[articles_df['id'] == article_id]['title'].values[0]}':")
for _, row in recommendations.iterrows():
    print(f"- {row['title']} (Skor: {row['similarity_score']:.4f})")

Rekomendasi untuk artikel 'Build someone around eight past chance.':
- Sea bar around indeed drop positive. (Skor: 0.8268)
- Local event peace parent leg head. (Skor: 0.8154)
- Throughout four tax chance might. (Skor: 0.8080)


In [71]:
# Memberikan rekomendasi artikel berdasarkan artikel dengan UUID tertentu
article_id = '1'
recommended_articles = recommender.recommend(article_id=article_id, top_n=5)
print("Artikel yang direkomendasikan:")
recommended_articles

Artikel yang direkomendasikan:


Unnamed: 0,id,title,province,city,similarity_score
3439,3440,Player reach eye situation treat report art.,Alabama,South Tamara,0.832967
3033,3034,Participant head voice identify attention turn.,Hawaii,North Carla,0.804139
643,643,Trade popular commercial collection hand.,Tennessee,Lake Tylermouth,0.800131
3292,3293,Network sport offer likely usually.,Missouri,North Denise,0.789957
273,273,Simple series daughter play indeed country ahe...,North Carolina,West Christian,0.784626


In [72]:
# Menyimpan model
recommender.save_model()  