In [2]:
import pandas as pd

# Load the JSON-Lines file (one JSON record per line) into a DataFrame.
df = pd.read_json(
    "combined_dataset.json",
    lines=True,
)

# Keep a CSV copy of the raw data; the row index is not written.
df.to_csv(
    "Dataset.csv",
    index=False,
)

In [3]:
# Count missing values per column *before* de-duplicating. The result is kept
# in a variable because only the last expression of a cell is displayed; as a
# bare statement in the middle of the cell it would be silently discarded.
null_counts = df.isnull().sum()

# Drop exact duplicate rows. Reassigning (instead of inplace=True) keeps the
# step explicit; the original index labels are preserved, so they are no
# longer contiguous afterwards.
df = df.drop_duplicates()

# Last expression of the cell -> rendered as the cell output.
null_counts

In [4]:
# Print a concise summary of df: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2752 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   2752 non-null   object
 1   Response  2752 non-null   object
dtypes: object(2)
memory usage: 64.5+ KB


In [None]:
import re

# Matches one of . , ? ! that is glued to the following word character.
# The digit-punctuation-digit case is excluded so that decimals and thousands
# separators such as "3.5" or "1,000" are left untouched.
_PUNCT_MISSING_SPACE = re.compile(r'([.,?!])(?:(?<!\d[.,?!])(?=\w)|(?=[^\W\d]))')


def _NormalizeText(text):
    """Trim, collapse whitespace and fix spacing after punctuation in one text."""
    # Strip the ends, then collapse every whitespace run (spaces, tabs,
    # newlines) into a single space; this also covers newline replacement.
    text = re.sub(r'\s+', ' ', text.strip())

    # Insert the missing space after punctuation ("Hi,there" -> "Hi, there").
    return _PUNCT_MISSING_SPACE.sub(r'\1 ', text)


def ProcessAndCombineText(text1, text2):
    """Clean two texts and join them into one labelled two-line string.

    Both inputs get the same normalisation: leading/trailing whitespace is
    removed, whitespace runs (including newlines) become a single space, and a
    space is inserted after '.', ',', '?' or '!' when it is directly followed
    by a word character. Punctuation sitting between two digits is left as is,
    so numbers like "3.5" and "1,000" are not split apart.

    Args:
        text1: Text of the first speaker; emitted with the "Client: " label.
        text2: Text of the second speaker; emitted with the "Assistant: " label.

    Returns:
        "Client: <text1>\\nAssistant: <text2>" built from the cleaned texts.

    Raises:
        AttributeError: If an argument has no ``strip`` method (e.g. NaN/None).
    """
    client_line = "Client: " + _NormalizeText(text1)
    assistant_line = "Assistant: " + _NormalizeText(text2)

    # One speaker per line.
    return client_line + "\n" + assistant_line

# Build one cleaned "Client/Assistant" text per row from the two source columns.
df['combined'] = df.apply(
    lambda row: ProcessAndCombineText(row['Context'], row['Response']),
    axis=1,
)

In [7]:
# Sanity check: show the combined text of the second row (positional index 1).
print(df['combined'].iloc[1])

Client: I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here. I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it. How can I change my feeling of being worthless to everyone?
Assistant: Hello, and thank you for your question and seeking advice on this. Feelings of worthlessness is unfortunately common. In fact, most people, if not all, have felt this to some degree at some point in their life. You are not alone. Changing our feelings is like changing our thoughts - it's hard to do. Our minds are so amazing that the minute you change your thought another one can be right there to take it's place. Without your permission, another thought can just pop in there. The new thought may feel worse than the last one! My guess is that you have tried several things to improve this on your own even before reaching out on here. People often try t

In [None]:
from openai import OpenAI

# Single OpenAI client shared by every embedding request in this notebook.
# No key is passed here, so credentials presumably come from the environment.
client = OpenAI()


def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of one text.

    Args:
        text: The string to embed; sent as a one-element batch.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Embed every combined text; this issues one embedding request per row.
df['embedding'] = df['combined'].apply(get_embedding)

# Persist the frame, embeddings included, as CSV (row index not written).
df.to_csv(
    'Dataset_with_embeddings.csv',
    index=False,
)

In [None]:
import faiss
import numpy as np

# Function description
# We can choose either cosine similarity (inner product on L2-normalized vectors) or similarity_score (Euclidean distance).
# Cosine similarity seems to be the better choice - Domo

def build_faiss_index(embeddings, mode):
    """Build a flat FAISS index holding all of the given embeddings.

    Args:
        embeddings: Sequence of equal-length vectors; converted to a 2-D
            float32 array before indexing.
        mode: 'cosine' for an inner-product index over L2-normalized vectors,
            or 'similarity_score' for a Euclidean (L2) index.

    Returns:
        The populated FAISS index.

    Raises:
        ValueError: If ``mode`` is neither 'cosine' nor 'similarity_score'.
    """
    # Work on a fresh float32 copy so the caller's data is never modified.
    vectors = np.array(embeddings).astype(np.float32)
    n_dims = vectors.shape[1]

    if mode == 'cosine':
        # Normalized in place; inner product of unit vectors is their cosine.
        faiss.normalize_L2(vectors)
        faiss_index = faiss.IndexFlatIP(n_dims)
    elif mode == 'similarity_score':
        faiss_index = faiss.IndexFlatL2(n_dims)
    else:
        raise ValueError("mode must be either 'cosine' or 'similarity_score'")

    faiss_index.add(vectors)
    return faiss_index

In [None]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
# word_tokenize needs the Punkt tokenizer data. Newer NLTK releases load it
# from the 'punkt_tab' package while older ones use 'punkt'; fetching only
# 'punkt' makes word_tokenize raise LookupError on a current NLTK install.
# Requesting both keeps the notebook working on either version.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource, quiet=True)

# Tokenize the combined text for BM25 (Optional: can try to use TF-IDF).
# `documents` keeps the row order of df, so position i in every score array
# produced from it refers to documents[i].
documents = df['combined'].to_list()
tokenized_documents = [word_tokenize(doc.lower()) for doc in documents]

# Build the BM25 model over the lower-cased, tokenized corpus.
bm25 = BM25Okapi(tokenized_documents)

def GetBM25Score(query):
    """Score every document in the BM25 corpus against ``query``.

    The query is lower-cased and tokenized the same way the corpus was.

    Args:
        query: Free-text search string.

    Returns:
        The scores produced by ``bm25.get_scores`` for the tokenized query.
    """
    query_tokens = word_tokenize(query.lower())
    scores = bm25.get_scores(query_tokens)
    return scores

In [None]:
def GetSemanticScores(query, index, mode='cosine'):
    """Return one semantic similarity score per indexed document, in document order.

    The query is embedded, searched against every vector in ``index`` and the
    resulting scores are placed at the position of the document they belong
    to. FAISS returns search results sorted by rank, so the returned ids must
    be used to restore document order; otherwise the scores cannot be combined
    element-wise with other per-document scores.

    Args:
        query: Free-text search string.
        index: FAISS index to search; expected to match ``mode``.
        mode: 'cosine' (query is L2-normalized, score is the inner product)
            or 'similarity_score' (score is 1 / (distance + 1e-6)).

    Returns:
        1-D float32 array of length ``index.ntotal`` where entry i is the
        score of the i-th vector in the index (higher is better).

    Raises:
        ValueError: If ``mode`` is neither 'cosine' nor 'similarity_score'.
    """
    # Validate first so a bad mode does not cost an embedding request.
    if mode not in ('cosine', 'similarity_score'):
        raise ValueError("mode must be 'cosine' or 'similarity_score'")

    # Nothing to score against; searching with k=0 is not meaningful.
    if index.ntotal == 0:
        return np.zeros(0, dtype=np.float32)

    # Embed the query as a (1, dim) float32 matrix.
    embedded_query = np.array([get_embedding(query)]).astype(np.float32)

    if mode == 'cosine':
        # Normalize query for cosine similarity
        faiss.normalize_L2(embedded_query)

    # Ask for every vector; results come back ordered by rank, with `ids`
    # telling which indexed vector each value belongs to.
    distances, ids = index.search(embedded_query, k=index.ntotal)
    distances, ids = distances[0], ids[0]

    if mode == 'cosine':
        ranked_scores = distances  # Cosine similarity (higher is better)
    else:
        # Convert distance into a similarity score (smaller distance -> higher).
        ranked_scores = 1 / (distances + 1e-6)

    # FAISS marks missing results with id -1; ignore those slots.
    found = ids >= 0
    ranked_scores, ids = ranked_scores[found], ids[found]

    # Scatter the ranked scores back to document order. Any document that was
    # not returned gets the lowest observed score.
    floor = ranked_scores.min() if ranked_scores.size else 0.0
    scores = np.full(index.ntotal, floor, dtype=np.float32)
    scores[ids] = ranked_scores
    return scores

In [None]:
import cohere
from sklearn.preprocessing import minmax_scale

def GetTopKDocumentsBasedOnQuery(query, index, mode, n, k):
    """Retrieve the k most relevant documents for ``query`` with hybrid search + rerank.

    BM25 and semantic scores are min-max scaled, fused with fixed weights
    (0.8 semantic, 0.2 lexical), the n best-fused documents are selected and
    then reranked by Cohere, which returns the final top k.

    Args:
        query: Free-text search string.
        index: FAISS index passed through to ``GetSemanticScores``.
        mode: 'cosine' or 'similarity_score', passed to ``GetSemanticScores``.
        n: Number of fused candidates sent to the reranker (>= 1).
        k: Number of documents returned after reranking (>= 1); capped at the
            number of candidates.

    Returns:
        List of document strings in the order given by the reranker.

    Raises:
        ValueError: If ``n`` or ``k`` is smaller than 1.
        RuntimeError: If the CO_API_KEY environment variable is not set.
    """
    import os  # local import: only needed to read the API key

    # n == 0 would turn the `[-n:]` slice below into "take everything".
    if n < 1 or k < 1:
        raise ValueError("n and k must both be >= 1")

    # Read the key from the environment instead of hard-coding it, and fail
    # before any paid embedding/rerank request is made.
    api_key = os.environ.get("CO_API_KEY")
    if not api_key:
        raise RuntimeError("Set the CO_API_KEY environment variable to use the Cohere reranker")

    bm25_scores = GetBM25Score(query)
    semantic_scores = GetSemanticScores(query, index=index, mode=mode)
    # NOTE(review): the element-wise fusion below is only meaningful if both
    # arrays are in `documents` order - confirm GetSemanticScores returns
    # its scores that way.

    # Normalize each score because each has its own scale (Optional: can try to use Z-normalize)
    bm25_scores_normalize = minmax_scale(bm25_scores)
    semantic_scores_normalize = minmax_scale(semantic_scores)

    # Set the weight of each score (semantic is weighted higher for the mental health chatbot)
    semantic_weight = 0.8
    lexicon_weight = 0.2

    # Combine each score
    fused_scores = semantic_scores_normalize * semantic_weight + lexicon_weight * bm25_scores_normalize

    # Indices of the n highest fused scores, best first.
    top_n = min(n, len(fused_scores))
    top_indices = np.argsort(fused_scores)[-top_n:][::-1]

    # Get the top n documents that have the highest fused_scores
    top_docs = [documents[i] for i in top_indices]

    co = cohere.ClientV2(api_key=api_key)

    # Rerank the top n documents and keep the best k (never more than available).
    response = co.rerank(
        model="rerank-v3.5",
        query=query,
        documents=top_docs,
        top_n=min(k, len(top_docs)),
    )

    # Each result carries the index of the document within top_docs.
    return [top_docs[r.index] for r in response.results]

## Run this only once (it may take some time). The code will:
### 1. Select the mode of the FAISS DB index
### 2. Turn the df['embedding'] into a list
### 3. Build the FAISS DB with the appropriate index mode
### 4. Insert the embedding documents into the FAISS DB
### 5. Save the FAISS DB into a file

In [None]:
# Index flavour: "cosine" or "similarity_score".
MODE = "cosine"

# Pull the per-row embedding vectors out of the DataFrame as a plain list.
embeddings_documents = df['embedding'].tolist()

# Build the FAISS index for the chosen mode.
index = build_faiss_index(
    embeddings=embeddings_documents,
    mode=MODE,
)

# Write the index to disk so later runs can load it instead of rebuilding.
faiss.write_index(index, "sanity_llm_index.faiss")

In [None]:
# Load the saved FAISS index from disk rather than rebuilding it each time.
index = faiss.read_index("sanity_llm_index.faiss")

query = "Test"

# How many of the best fused-score documents are handed to the reranker.
TopN = 20

# How many documents come back after reranking; must be smaller than TopN.
TopK = 10

ranked_documents = GetTopKDocumentsBasedOnQuery(
    query=query,
    index=index,
    mode=MODE,
    n=TopN,
    k=TopK,
)