In [4]:
import pandas as pd
import os
import re

# Konfigurasi file
FILE_LAMA_JSON = 'News_Category_Dataset_v3.json'       
FILE_BARU_CSV = 'final_merge_dataset.csv'
OUTPUT_FILE = 'processed_news_data.csv'

def clean_text_artifacts(text):
    """
    Membersihkan karakter encoding yang rusak (Mojibake) dan whitespace berlebih.
    """
    if pd.isna(text) or text == "":
        return ""
    
    text = str(text)
    
    #Perbaikan Karakter Aneh
    replacements = {
        'â€“': '-', 'â€”': '-', 
        'â€œ': '"', 'â€': '"',  
        'â€™': "'", 'â€˜': "'", 
        'Â': '', 'â': '',       
        '\xa0': ' '             
    }
    for bad, good in replacements.items():
        text = text.replace(bad, good)
        
    #Hapus Intro Berita
    text = re.sub(r'^[A-Z\s,]+(\.|kompas\.com|detik\.com)\s*(--|-)\s*', '', text, flags=re.IGNORECASE)
    
    #Hapus pola
    text = re.sub(r'\[Gambas:.*?\]', '', text)
    
    #Rapikan Spasi
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def combine_tags(row):
    """Menggabungkan tag1 s/d tag5 menjadi satu string kategori"""
    tags = []
    for i in range(1, 6):
        col = f'tag{i}'
        if col in row and pd.notna(row[col]) and str(row[col]).strip() != 'nan':
            tags.append(str(row[col]).strip())

    if not tags:
        if 'source' in row and pd.notna(row['source']):
            return str(row['source'])
        return "General"
        
    return ", ".join(tags)

def load_and_process():
    dfs = []

    #Dataset Lama (JSON)
    print(f"1. Memuat dataset JSON: {FILE_LAMA_JSON}...")
    if os.path.exists(FILE_LAMA_JSON):
        try:
            df_old = pd.read_json(FILE_LAMA_JSON, lines=True)
            df_old = df_old[['headline', 'short_description', 'link', 'category']]
            dfs.append(df_old)
            print(f"   -> JSON OK: {len(df_old)} baris.")
        except Exception as e:
            print(f"   -> Error JSON: {e}")
    else:
        print(f"   -> File JSON tidak ditemukan (Dilewati).")

    #Dataset Baru (CSV)
    print(f"2. Memuat dataset CSV: {FILE_BARU_CSV}...")
    if os.path.exists(FILE_BARU_CSV):
        try:
            try:
                df_new = pd.read_csv(FILE_BARU_CSV, encoding='utf-8')
            except UnicodeDecodeError:
                df_new = pd.read_csv(FILE_BARU_CSV, encoding='latin-1')

            print(f"   -> Kolom Asli: {list(df_new.columns)}")

            #Gabungkan Tags menjadi Category
            print("   -> Menggabungkan Tags...")
            df_new['category'] = df_new.apply(combine_tags, axis=1)

            #Mapping Nama Kolom agar sama dengan JSON
            rename_map = {
                'Judul': 'headline',
                'Content': 'short_description',
                'Link': 'link',
            }
            df_new = df_new.rename(columns=rename_map)

            #Bersihkan Teks (Cleaning)
            print("   -> Membersihkan artefak teks...")
            df_new['headline'] = df_new['headline'].apply(clean_text_artifacts)
            df_new['short_description'] = df_new['short_description'].apply(clean_text_artifacts)

            #Seleksi Kolom
            df_new = df_new[['headline', 'short_description', 'link', 'category']]
            
            dfs.append(df_new)
            print(f"   -> CSV OK: {len(df_new)} baris.")

        except Exception as e:
            print(f"   -> Error CSV: {e}")
    else:
        print(f"   -> File CSV tidak ditemukan (Dilewati).")

    #Penggabungan 
    if not dfs:
        print("❌ Tidak ada data untuk diproses.")
        return

    print("3. Menggabungkan & Memproses Data Akhir...")
    df_final = pd.concat(dfs, ignore_index=True)

    #Hapus Duplikat & Kosong
    df_final.drop_duplicates(subset=['headline'], inplace=True)
    df_final = df_final.dropna(subset=['headline', 'short_description'])
    df_final = df_final[df_final['short_description'].str.len() > 10]

    #Buat Combined Text untuk RAG
    def format_rag_text(row):
        
        content = str(row['short_description'])
        if len(content) > 1200:
            content = content[:1200] + "..."
        
        return f"Kategori: {row['category']}\nJudul: {row['headline']}\nIsi: {content}"

    df_final['combined_text'] = df_final.apply(format_rag_text, axis=1)

    #Buat Doc ID Baru
    df_final['doc_id'] = [f"doc_{i}" for i in range(len(df_final))]

    #Simpan
    print(f"4. Menyimpan {len(df_final)} data ke '{OUTPUT_FILE}'...")
    df_final.to_csv(OUTPUT_FILE, index=False)
    
    #Preview Data
    print("\n--- CONTOH DATA HASIL CLEANING ---")
    print(df_final.iloc[-1]['combined_text']) 
    print("----------------------------------")

if __name__ == "__main__":
    load_and_process()

1. Memuat dataset JSON: News_Category_Dataset_v3.json...
   -> JSON OK: 209527 baris.
2. Memuat dataset CSV: final_merge_dataset.csv...
   -> Kolom Asli: ['Judul', 'Waktu', 'Link', 'Content', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5', 'source']
   -> Menggabungkan Tags...
   -> Membersihkan artefak teks...
   -> CSV OK: 80472 baris.
3. Menggabungkan & Memproses Data Akhir...
4. Menyimpan 254032 data ke 'processed_news_data.csv'...

--- CONTOH DATA HASIL CLEANING ---
Kategori: israel, gaza, global-sumud-flotilla, kapal, flotilla
Judul: Protes Meletus di Seluruh Eropa Usai Serangan Israel terhadap Kapal Flotilla ke Gaza
Isi: Baca berita dengan sedikit iklan, klik di sini Aktivis pro-Palestina menggelar protes pada Rabu malam di seluruh Eropa seperti dilaporkan Anadolu menyusul serangan pasukan Israel terhadap armada Global Sumud Flotilla, armada kapal yang membawa bantuan kemanusiaan ke Gaza, Palestina. Baca berita dengan sedikit iklan, klik di sini Di Roma, ratusan demonstran, termasuk mah

In [5]:
import pandas as pd
import os
import sys
import shutil

try:
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS
    from langchain_core.documents import Document
except ImportError as e:
    print(f"Error Import: {e}")
    sys.exit(1)

#Konfigurasi
DATA_PATH = 'processed_news_data.csv' 
INDEX_PATH = 'faiss_index_news'       
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

def main():
    #Cek File CSV
    if not os.path.exists(DATA_PATH):
        print(f"❌ ERROR: File '{DATA_PATH}' tidak ditemukan.")
        print("   Pastikan Anda sudah menjalankan 'preprocess_gabungan.py'.")
        return

    print("1. Memuat data CSV...")
    try:
        df = pd.read_csv(DATA_PATH)
        df = df.dropna(subset=['combined_text'])

        print(f"   Total dokumen: {len(df)}")
    except Exception as e:
        print(f"❌ Gagal membaca CSV: {e}")
        return

    print("2. Menyiapkan dokumen...")
    documents = []
    for _, row in df.iterrows():
        doc = Document(
            page_content=row['combined_text'],
            metadata={"doc_id": row['doc_id'], "category": row.get('category', 'unknown')}
        )
        documents.append(doc)

    print(f"3. Memuat Model Embedding ({MODEL_NAME})...")
    embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)

    print("4. Membuat Index FAISS Baru (Tunggu sebentar)...")
    if os.path.exists(INDEX_PATH):
        try:
            shutil.rmtree(INDEX_PATH)
        except:
            pass
        
    vectorstore = FAISS.from_documents(documents, embeddings)
    
    print(f"5. Menyimpan Index ke '{INDEX_PATH}'...")
    vectorstore.save_local(INDEX_PATH)
    print("\n✅ SUKSES! Index baru berhasil dibuat.")

if __name__ == "__main__":
    main()

1. Memuat data CSV...
   Total dokumen: 254032
2. Menyiapkan dokumen...
3. Memuat Model Embedding (sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)...


  from .autonotebook import tqdm as notebook_tqdm


4. Membuat Index FAISS Baru (Tunggu sebentar)...
5. Menyimpan Index ke 'faiss_index_news'...

✅ SUKSES! Index baru berhasil dibuat.


In [2]:
# Install versi stabil yang kita gunakan (LangChain 0.1.0 & Pydantic 1.x)
%pip install langchain==0.1.0 langchain-community==0.0.10 langchain-core==0.1.10 faiss-cpu sentence-transformers torch pandas ipykernel "pydantic<2.0.0"

Collecting langchain==0.1.0
  Using cached langchain-0.1.0-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community==0.0.10
  Using cached langchain_community-0.0.10-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain-core==0.1.10
  Using cached langchain_core-0.1.10-py3-none-any.whl.metadata (4.0 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.13.0-cp311-cp311-win_amd64.whl.metadata (7.7 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting torch
  Using cached torch-2.9.1-cp311-cp311-win_amd64.whl.metadata (30 kB)
Collecting pandas
  Using cached pandas-2.3.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pydantic<2.0.0
  Using cached pydantic-1.10.24-cp311-cp311-win_amd64.whl.metadata (156 kB)
Collecting PyYAML>=5.3 (from langchain==0.1.0)
  Using cached pyyaml-6.0.3-cp311-cp311-win_amd64.whl.metadata (2.4 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain==0.1.0)
  Using cached sqlalchemy-2


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import langchain
import langchain_community
import pydantic
import pandas

print(f"LangChain Ver  : {langchain.__version__}")
print(f"Community Ver  : {langchain_community.__version__}")
print(f"Pydantic Ver   : {pydantic.VERSION}")
print(f"Pandas Ver     : {pandas.__version__}")

LangChain Ver  : 0.1.0
Community Ver  : 0.0.10
Pydantic Ver   : 1.10.24
Pandas Ver     : 2.3.3
