In [18]:
# Install required ML, NLP, and search libraries
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install -q sentence-transformers faiss-cpu transformers pandas langdetect requests

In [19]:
# Upload FAQ knowledge base (CSV file)
from google.colab import files
uploaded = files.upload()


Saving kb_1000_faqs.csv to kb_1000_faqs (1).csv


In [20]:
# Load FAQ data into DataFrame
import pandas as pd

df = pd.read_csv("kb_1000_faqs.csv")
df.head()

Unnamed: 0,question,answer
0,How can I reset my password? (#1),Click on Forgot Password on the login page and...
1,How do I track my order? (#2),Go to My Orders and click Track Order.
2,What payment methods do you accept? (#3),"We accept credit cards, debit cards, UPI, wall..."
3,How do I return a product? (#4),Go to My Orders and select Return Item.
4,Can I cancel my order? (#5),Orders can be canceled before shipping.


In [21]:
import json
import os # Import os module to check for file existence

# TODO: Update this path to the correct filename of your notebook.
# You can check the files in your Colab environment using `!ls` in a new cell.
path = "/content/Multilinual_GenAi.ipynb"  # change filename - THIS FILE WAS NOT FOUND

if not os.path.exists(path):
    print(f"Error: The file '{path}' was not found. Please update the 'path' variable to an existing notebook file.")
else:
    with open(path, "r") as f:
        nb = json.load(f)

    nb["metadata"].pop("widgets", None)

    with open(path, "w") as f:
        json.dump(nb, f)

    print("Widgets metadata removed")

Widgets metadata removed


In [22]:
# Load multilingual sentence embedding model
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [23]:
# Create FAISS index for semantic search
import faiss
import numpy as np

questions = df["question"].tolist()

question_embeddings = embed_model.encode(
    questions,
    convert_to_numpy=True,
    normalize_embeddings=True
)

dimension = question_embeddings.shape[1]

index = faiss.IndexFlatIP(dimension)
index.add(question_embeddings)

print("FAISS index ready")


FAISS index ready


In [24]:
# Load translation models for multilingual support
from transformers import MarianMTModel, MarianTokenizer

def load_translator(model_name):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

# Any language → English
tokenizer_to_en, model_to_en = load_translator("Helsinki-NLP/opus-mt-mul-en")

# English → Other languages
language_models = {
    "en": None,
    "hi": "Helsinki-NLP/opus-mt-en-hi",
    "es": "Helsinki-NLP/opus-mt-en-es",
    "fr": "Helsinki-NLP/opus-mt-en-fr",
    "de": "Helsinki-NLP/opus-mt-en-de"
}

translators = {}

for lang, model_name in language_models.items():
    if model_name:
        translators[lang] = load_translator(model_name)



Loading weights:   0%|          | 0/258 [00:00<?, ?it/s]



Loading weights:   0%|          | 0/258 [00:00<?, ?it/s]



Loading weights:   0%|          | 0/258 [00:00<?, ?it/s]



Loading weights:   0%|          | 0/258 [00:00<?, ?it/s]



Loading weights:   0%|          | 0/258 [00:00<?, ?it/s]



In [25]:
# Translation helper function
def translate(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [26]:
# ScaleDown API integration for query compression
import requests
import json

SCALEDOWN_API_KEY = "cjpWxC3K6A5gUBm1wrPcw4HsAPpTteGzIENqgpj2"
SCALEDOWN_URL = "https://api.scaledown.xyz/compress/raw/"

def scaledown_compress(context, prompt):
    headers = {
        "x-api-key": SCALEDOWN_API_KEY,
        "Content-Type": "application/json"
    }

    payload = {
        "context": context,
        "prompt": prompt,
        "scaledown": {
            "rate": "auto"
        }
    }

    try:
        response = requests.post(
            SCALEDOWN_URL,
            headers=headers,
            data=json.dumps(payload),
            timeout=10
        )

        result = response.json()

        # NEW API FORMAT
        if result.get("results") and result["results"].get("compressed_prompt"):
            return result["results"]["compressed_prompt"]

        # OLD / FLAT FORMAT
        if result.get("compressed_prompt"):
            return result["compressed_prompt"]

        print("ScaleDown response (no compressed prompt found):", result)
        return prompt

    except Exception as e:
        print("ScaleDown error:", str(e))
        return prompt


In [27]:
# Semantic search and answer retrieval
def find_best_answer(query):

    compressed_query = scaledown_compress(
        context="FAQ-based question answering system",
        prompt=query
    )

    q_embed = embed_model.encode(
        [compressed_query],
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    scores, indices = index.search(q_embed, k=1)

    print("Matched Question:", df.iloc[indices[0][0]]["question"])

    return df.iloc[indices[0][0]]["answer"]

In [28]:
# End-to-end multilingual question answering pipeline
def multilingual_support(user_text, output_lang):

    english_query = translate(user_text, tokenizer_to_en, model_to_en)

    answer = find_best_answer(english_query)

    if output_lang == "en":
        return answer
    else:
        tokenizer, model = translators[output_lang]
        return translate(answer, tokenizer, model)

In [29]:
# Interactive user query loop with repeat option

while True:
    print("\nChoose Output Language:")
    print("en = English")
    print("hi = Hindi")
    print("es = Spanish")
    print("fr = French")
    print("de = German")

    lang = input("Enter language code: ").strip()

    user = input("Ask your question: ").strip()
    print("Answer:", multilingual_support(user, lang))

    again = input("\nDo you want to ask another question? (yes/no): ").strip().lower()
    if again != "yes":
        print("Exiting...")
        break


Choose Output Language:
en = English
hi = Hindi
es = Spanish
fr = French
de = German
Enter language code: en
Ask your question: How can I reset my password?
Matched Question: How do I track my order? (#202)
Answer: Go to My Orders and click Track Order.

Do you want to ask another question? (yes/no): no
Exiting...
