## Storage

In [1]:
# import redis
# redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True)

# try:
#     r = redis.Redis(host='localhost', port=6379, decode_responses=True)
#     r.ping()
#     print("✅ Redis terkoneksi!")
# except redis.ConnectionError as e:
#     print("❌ Redis gagal terkoneksi:", e)

In [None]:

# hybrid_instance.py

import mysql.connector
from sentence_transformers import SentenceTransformer
from functools import lru_cache

chat_memory = {}

def load_faq():
    conn = mysql.connector.connect(
        host="localhost",
        user="root",
        password="",
        database="chatbot-humas"
    )
    cursor = conn.cursor()

    cursor.execute("SELECT * FROM pertanyaan")
    faq = cursor.fetchall()

    cursor.close()
    conn.close()

    return faq

@lru_cache(maxsize=1)
def get_model(model_name='intfloat/multilingual-e5-small'):
    return SentenceTransformer(model_name)

def load_cache(session_id):
    return chat_memory.get(session_id, [])

def load_context(session_id):
    history = load_cache(session_id)
    return " ".join([entry["query"] if entry["query"] else "" for entry in history])

def save_cache(session_id, query, results):
    if session_id not in chat_memory:
        chat_memory[session_id] = []
    chat_memory[session_id].append({
        "query": query,
        "results": results
    })

def clear_cache():
    global chat_memory
    chat_memory = {}
    print('chat_memory has been cleared.')

  from .autonotebook import tqdm as notebook_tqdm


## Class Definition

In [None]:
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cos_sim
# import redis

class HybridSearch:
    def __init__(self, model_name='intfloat/multilingual-e5-small', top_k=1, redis_host='localhost', redis_port=6379):
        self.model = get_model(model_name)
        self.top_k = top_k
        self.ttl = 28800

        # # Redis
        # self.redis = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)

    def build_index(self, faq):
        self.faq_questions = [item[1] for item in faq]
        self.faq_answers = [item[2] for item in faq]

        self.tfidf_vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.faq_questions)

        self.semantic_embeddings = self.model.encode(self.faq_questions, normalize_embeddings=True)

    def search(self, query, session_id=None, use_cache=False):
        # # Redis
        # if session_id and self.redis_is_on:
        #     cached = self.redis.get(f"faq:{session_id}:{query}")
        #     if cached:
        #         return json.loads(cached)

        # dict
        if session_id:
            cached = load_context(session_id) if use_cache is True else ""

        # Step 1: TF-IDF filter
        tfidf_queries = self.preprocess_text(query)  # hanya query baru

        best_tfidf_score = 0
        best_query = None
        best_tfidf_query = None

        # Precompute TF-IDF dari cached (memori lama)
        cached_score = None
        if cached:
            cached_tfidf = self.tfidf_vectorizer.transform([cached])
            cached_score = cos_sim(cached_tfidf, self.tfidf_matrix)[0]
            cached_score *= 0.3  # Bobot 30% dari konteks lama

        for paragraph in tfidf_queries:
            tfidf_query = self.tfidf_vectorizer.transform([paragraph])
            tfidf_scores = cos_sim(tfidf_query, self.tfidf_matrix)[0]

            # Bobot tambahan jika ada tanda tanya
            tfidf_scores *= 1.2 if self.is_question(paragraph) else tfidf_scores

            # Gabungkan skor baru dengan skor lama jika ada
            if cached_score is not None:
                combined_score = tfidf_scores * 0.7 + cached_score
            else:
                combined_score = tfidf_scores

            highest_score = max(combined_score)
            if highest_score > best_tfidf_score:
                best_query = paragraph
                best_tfidf_score = highest_score
                best_tfidf_query = combined_score
                print(f"question: {paragraph}")
                print(f"related question: {self.faq_questions[np.argmax(combined_score)]}")
                print(f"score: {highest_score}")

        if best_tfidf_query is None or best_tfidf_score < 0.5:
            results = [(best_query, "", 0.0)]
        else:
            top_k_indices = np.argsort(best_tfidf_query)[::-1][:self.top_k]
            top_k_faqs = [self.faq_questions[i] for i in top_k_indices]
            top_k_answer = [self.faq_answers[i] for i in top_k_indices]

            # Step 2: Semantic reranking
            query_embedding = self.model.encode([query], normalize_embeddings=True)
            selected_embeddings = self.semantic_embeddings[top_k_indices]
            semantic_scores = np.dot(query_embedding, selected_embeddings.T)[0]

            # Sort top-k by semantic similarity
            final_rank = np.argsort(semantic_scores)[::-1]
            results = [(top_k_faqs[i], top_k_answer[i], float(semantic_scores[i])) for i in final_rank]

        # # Redis
        # if session_id and self.redis_is_on:
        #     self.redis.setex(f"faq:{session_id}:{query}", self.ttl, json.dumps(results))

        # dict
        if session_id and use_cache is True:
            save_cache(session_id, best_query, results)

        return results

    def preprocess_text(self, text, cached=None):
        text = self.cleaning_tanda_baca_berulang(text)
        texts = self.spliting_paragraph(text, cached)
        processed_texts = []

        for text in texts:
            text = text.lower()
            text = self.append_titik(text)
            processed_texts.append(text)

        return processed_texts

    def cleaning_tanda_baca_berulang(self, text):
        return re.sub(r'([!?.;,:\-\n])\1+', r'\1', text)

    def append_titik(self, text):
        if not text.endswith(('.', '?', '!')):
            return text + '.'
        return text

    def spliting_paragraph(self, text, cached=None):
        paragraph = text.split('\n')
        return paragraph

    def is_question(self, text):
        if '?' in text:
            return True
        return False

In [4]:
faq = load_faq()

engine = HybridSearch()
engine.build_index(faq)

## Testing

In [14]:
chat_memory = {}

list_pertanyaan = [
    "Selamat siang, apakah ini benar dengan humas uns?",
    "Saya merupakan mahasiswa asing yang diterima di UNS melalui beasiswa internasional. Kemudian saya diminta untuk menghubungi international office uns.",
    # "Gimana ya caranya?",
    "APakah saya bisa meminta no mereka?",
    # "Kalau saya mengundurkan diri begitu, apakah ada pengembalian dana ya Kak? Terima kasih",
    # "Terima kasih kak atas jawabannya"
]

session_id = '1'
for pertanyaan in list_pertanyaan:
    results = engine.search(query=pertanyaan, session_id=session_id, use_cache=True)

    # print("\nHasil pencarian:")
    # for question, answer, score in results:
    #     print(f"{question} —> {answer} — score: {score:.4f}")

    print(f"\njawaban: {results[0][1]}\n")

question: selamat siang, apakah ini benar dengan humas uns?
related question: Saya ingin bertanya berkaitan dengan website siakad uns yang mengalami masalah
score: 0.2739801383071332

jawaban: 

cached: selamat siang, apakah ini benar dengan humas uns?
question: saya merupakan mahasiswa asing yang diterima di uns melalui beasiswa internasional. kemudian saya diminta untuk menghubungi international office uns.
related question: kontak international office uns
score: 0.1684099775963998

jawaban: 

cached: selamat siang, apakah ini benar dengan humas uns? saya merupakan mahasiswa asing yang diterima di uns melalui beasiswa internasional. kemudian saya diminta untuk menghubungi international office uns.
question: apakah saya bisa meminta no mereka?
related question: pembayaran registrasi calon mahasiswa bisa menghubungi no kontak berapa
score: 0.37785646769497655

jawaban: 



In [15]:
print(chat_memory)

{'1': [{'query': 'selamat siang, apakah ini benar dengan humas uns?', 'results': [('selamat siang, apakah ini benar dengan humas uns?', '', 0.0)]}, {'query': 'saya merupakan mahasiswa asing yang diterima di uns melalui beasiswa internasional. kemudian saya diminta untuk menghubungi international office uns.', 'results': [('saya merupakan mahasiswa asing yang diterima di uns melalui beasiswa internasional. kemudian saya diminta untuk menghubungi international office uns.', '', 0.0)]}, {'query': 'apakah saya bisa meminta no mereka?', 'results': [('apakah saya bisa meminta no mereka?', '', 0.0)]}]}


In [16]:
print(load_context(session_id=session_id))

selamat siang, apakah ini benar dengan humas uns? saya merupakan mahasiswa asing yang diterima di uns melalui beasiswa internasional. kemudian saya diminta untuk menghubungi international office uns. apakah saya bisa meminta no mereka?
