In [7]:
# ✅ Install (if you haven’t already)
!pip install -q pandas sentence-transformers faiss-cpu

# ✅ Imports
import json
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m83.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
# Load old JSONL file
# Each non-blank line is parsed as JSON; a line may hold one QA object or a list of them.
old_data = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open("/content/genshin_dataset_cleaned.jsonl", "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) are not records; json.loads("") would raise.
            continue
        try:
            item = json.loads(line)
        except json.JSONDecodeError as exc:
            # Re-raise the same exception type, but say which line of the file is broken.
            raise json.JSONDecodeError(
                f"line {line_no} of the old dataset: {exc.msg}", exc.doc, exc.pos
            ) from exc
        if isinstance(item, list):
            old_data.extend(item)
        else:
            old_data.append(item)

print(f"✅ Loaded {len(old_data)} old QA pairs")


✅ Loaded 89 old QA pairs


In [10]:
# Load new Kaggle CSV
# NOTE(review): this path is the JSONL QA file that the "Load old JSONL file" cell reads,
# not a CSV. In the recorded run pandas parsed it into an empty DataFrame, so 0 Kaggle pairs
# were produced. Point KAGGLE_CSV_PATH at the actual Kaggle character CSV — TODO confirm
# the real filename.
KAGGLE_CSV_PATH = "/content/genshin_dataset_cleaned.jsonl"
df = pd.read_csv(KAGGLE_CSV_PATH)
print(df.head())

# Columns the row-to-QA conversion below reads.
EXPECTED_COLUMNS = {
    "Name", "name", "Vision", "Weapon", "Nation", "Affiliation", "Rarity", "Constellation",
}


def _row_value(row, *keys, default=""):
    """Return the first usable value stored under any of `keys` in `row`, else `default`.

    Missing keys, None, NaN and other falsy values count as unusable. The explicit NaN
    check matters because NaN is truthy: `row.get(key) or default` would let it through
    and it would be rendered as "nan" in the generated answer.
    """
    for key in keys:
        value = row.get(key)
        if value is None or pd.isna(value) or not value:
            continue
        return value
    return default


# Transform Kaggle CSV rows into the same {"question": ..., "answer": ...} format
new_data = []
if df.empty or EXPECTED_COLUMNS.isdisjoint(df.columns):
    # Surface the problem instead of silently reporting a successful load of 0 pairs.
    print(
        f"⚠️ {KAGGLE_CSV_PATH} has no rows or none of the expected columns "
        f"{sorted(EXPECTED_COLUMNS)}; no Kaggle QA pairs were built"
    )
else:
    for _, row in df.iterrows():
        name = _row_value(row, "Name", "name", default="Unknown")
        vision = _row_value(row, "Vision")
        weapon = _row_value(row, "Weapon")
        nation = _row_value(row, "Nation")
        affiliation = _row_value(row, "Affiliation")
        rarity = _row_value(row, "Rarity")
        constellation = _row_value(row, "Constellation")

        # A numeric column containing gaps is read as float; show 5.0 as "5".
        if isinstance(rarity, float) and rarity.is_integer():
            rarity = int(rarity)

        answer = f"{name} is a {rarity}-star {vision} character from {nation}, affiliated with {affiliation}. Uses a {weapon}. Constellation: {constellation}."

        new_data.append({"question": f"Tell me about {name}", "answer": answer})

print(f"✅ Loaded {len(new_data)} Kaggle QA pairs")


Empty DataFrame
Columns: [[{"question": "Tell me about Yelan",  "answer": "Yelan is a 5-star Hydro bow user in Genshin Impact who excels as a sub-DPS or off-field DPS,  making her perfect for applying Hydro and triggering powerful elemental reactions like Vaporize,  Electro-Charged,  or Freeze while supporting your main damage dealer. Her playstyle revolves around dealing consistent Hydro damage while off-field,  enabling other characters to shine as the main DPS. This makes her extremely flexible and valuable in a wide variety of team compositions. Her main source of damage comes from her Elemental Burst,  Depth-Clarion Dice,  which summons a Hydro-infused die that follows your active character and deals damage whenever they perform a Normal Attack. This effect continues even after you switch characters,  which means Yelan can support your team even while she\u2019s not on the field. Her Elemental Skill,  Lingering Lifeline,  lets her dash around,  marking enemies in her path and deal

In [11]:
# Merge both QA sources into a single list: old pairs first, then the Kaggle pairs.
combined_data = [*old_data, *new_data]
print(f"✅ Total combined QA pairs: {len(combined_data)}")


✅ Total combined QA pairs: 89


In [12]:
# Split every answer into overlapping pieces (chunk_size=400, chunk_overlap=100) and
# tag each piece with the question it came from.
splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)

chunks = [
    {"text": piece, "source": qa["question"]}
    for qa in combined_data
    for piece in splitter.split_text(qa["answer"])
]

print(f"✅ Created {len(chunks)} chunks")


✅ Created 1434 chunks


In [13]:
# Encode the text of every chunk with the all-MiniLM-L6-v2 sentence-transformer.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
texts = [item["text"] for item in chunks]
embeddings = embedder.encode(texts)

print(f"✅ Embeddings shape: {embeddings.shape}")

# Save embeddings + chunks (optional)
np.save("embeddings.npy", embeddings)
with open("chunks.json", "w") as chunks_file:
    chunks_file.write(json.dumps(chunks))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embeddings shape: (1434, 384)


In [14]:
# Build a flat L2 index whose dimensionality is the embedding width, then persist it.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(np.array(embeddings))
faiss.write_index(index, "genshin_index.faiss")

print("✅ New FAISS index saved as genshin_index.faiss")


✅ New FAISS index saved as genshin_index.faiss


In [17]:
# Example test function
def retrieve_context(query, top_k=4):
    """Return the text of the chunks nearest to `query`, joined by blank lines.

    Args:
        query: Question or search string; it is embedded with the module-level `embedder`.
        top_k: Maximum number of chunks to return. Values larger than the number of
            indexed vectors are capped.

    Returns:
        The matching chunk texts separated by a blank line, in the order the index
        returns them. An empty string when the index is empty or `top_k` is not positive.
    """
    # Asking FAISS for more neighbours than it holds pads the result with -1, and
    # chunks[-1] would then silently return the last chunk as if it were a match.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return ""

    query_vec = embedder.encode([query])
    # FAISS expects float32 input; the distances are not needed here.
    _, neighbour_ids = index.search(np.asarray(query_vec, dtype=np.float32), k)
    return "\n\n".join(chunks[idx]["text"] for idx in neighbour_ids[0] if idx >= 0)

print("📄 Retrieved Context:\n", retrieve_context("Nahida's best weapon"))


📄 Retrieved Context:
 brings top-tier utility and damage support to virtually any Dendro-based team. Overall, Nahida is a powerful, easy-to-use character that fits into a wide variety of team comps and enables some of the strongest elemental reactions in the game. Whether you’re building for Bloom or looking to enhance Quicken or Hyperbloom teams, Nahida will quickly become one of the cornerstones of your roster.

Quicken or Hyperbloom teams, Nahida will quickly become one of the cornerstones of your roster. She's not only strong, but her playstyle is satisfying, with smart targeting mechanics and broad synergy with many characters, making her a top-tier investment for new and veteran players alike.

Gilded Dreams for a huge Elemental Mastery boost. Her best-in-slot weapon is *A Thousand Floating Dreams*, which provides a hefty EM boost and passive team buffs. However, she works extremely well with 4-star weapons like *The Widsith* or even *Sacrificial Fragments*, especially for player