### Installing Required Libraries
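We need `transformers`, `datasets`, and `sentence-transformers`, plus FAISS for vector search. The first cell below installs these libraries, including both the GPU and CPU builds of FAISS; the second cell installs `streamlit` for the optional app at the end of the notebook.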


In [None]:
# Install the model, dataset, and embedding libraries used throughout the notebook.
!pip install transformers datasets sentence-transformers
# NOTE(review): both the GPU and the CPU build of FAISS are installed here; they
# presumably provide the same `faiss` module, so only one is likely needed —
# confirm which build this environment should use.
!pip install faiss-gpu
!pip install faiss-cpu


In [None]:
# Streamlit is only used by the optional app written out at the end of the notebook.
!pip install streamlit

### Data Loading and Preprocessing
We'll start by loading the IMDb top-1000 dataset and preprocessing it to keep only the relevant columns (title, genre, overview, director, and stars). Sentence embeddings for the movie overviews are generated with `SentenceTransformer` in the "Preparing RAG Data and FAISS Index" section below.


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Location of the IMDb top-1000 CSV that load_data() reads.
data_path = "/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv"

def load_data(data_path):
    """Read the CSV file at ``data_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(data_path)

def preprocess_data(df):
    """Return an independent frame holding only the columns the search needs.

    Args:
        df: DataFrame containing at least the columns ``Series_Title``,
            ``Genre``, ``Overview``, ``Director`` and ``Star1``..``Star4``.

    Returns:
        A new DataFrame with exactly those eight columns, in that order.

    Raises:
        KeyError: if any of the required columns is missing from ``df``.
    """
    relevant_columns = [
        "Series_Title",
        "Genre",
        "Overview",
        "Director",
        "Star1",
        "Star2",
        "Star3",
        "Star4",
    ]
    # Later cells add an embeddings column to the returned frame. Returning an
    # explicit copy keeps that assignment from targeting a slice of the raw
    # frame (SettingWithCopyWarning) and leaves the raw frame untouched.
    return df[relevant_columns].copy()




# Read the raw CSV, then keep only the columns used for search and display.
df = load_data(data_path)
df_selected = preprocess_data(df)


In [None]:
def build_faiss_index(embeddings, nlist=30):
    """Build, train, and populate a FAISS IVF-Flat index using L2 distance.

    Args:
        embeddings: Iterable of equal-length 1-D vectors (for example the
            ``Overview_Embeddings`` column) or a 2-D array of shape
            ``(n_vectors, dimensions)``.
        nlist: Requested number of inverted lists (clusters). It is capped at
            the number of vectors so training never asks for more clusters
            than there are points.

    Returns:
        A trained ``faiss.IndexIVFFlat`` containing every input vector.

    Raises:
        ValueError: if ``embeddings`` is empty, ragged, or not made of 1-D
            vectors.
    """
    # Iterate over values rather than using embeddings[0]: on a pandas Series
    # [0] is a label lookup and fails when the index does not contain 0.
    vectors = [np.asarray(vector, dtype=np.float32) for vector in embeddings]
    if not vectors:
        raise ValueError("embeddings must contain at least one vector")

    # FAISS expects a C-contiguous float32 matrix; np.stack raises ValueError
    # when the vectors do not all share the same shape.
    embeddings_array = np.ascontiguousarray(np.stack(vectors))
    if embeddings_array.ndim != 2:
        raise ValueError("embeddings must be a collection of 1-D vectors")

    # Imported here so the function does not rely on an `import faiss` that
    # only happens in a later cell.
    import faiss

    n_vectors, dimensions = embeddings_array.shape
    nlist = max(1, min(int(nlist), n_vectors))

    quantizer = faiss.IndexFlatL2(dimensions)
    index = faiss.IndexIVFFlat(quantizer, dimensions, nlist, faiss.METRIC_L2)

    # An IVF index must learn its cluster centroids before vectors are added.
    assert not index.is_trained
    index.train(embeddings_array)
    assert index.is_trained

    index.add(embeddings_array)
    return index
# The "Overview_Embeddings" column is only created by generate_embeddings, which
# runs in a later cell. Guard the build so a fresh Restart & Run All does not
# stop here with a KeyError.
if "Overview_Embeddings" in df_selected.columns:
    index = build_faiss_index(df_selected["Overview_Embeddings"])
else:
    print(
        "Overview_Embeddings not available yet; skipping the FAISS index build "
        "until the embeddings have been generated."
    )

### Preparing RAG Data and FAISS Index
This step involves converting the preprocessed data into a format suitable for RAG and saving it, along with creating and saving a FAISS index of the embeddings.


In [None]:
from datasets import Dataset
import faiss


# Where prepare_rag_data() saves the RAG dataset and its FAISS index file.
dataset_path = "/kaggle/working/rag_dataset"
index_path = "/kaggle/working/rag_index/faiss_index"


def generate_embeddings(df, model_name="all-mpnet-base-v2"):
    """Embed every movie overview with a SentenceTransformer model.

    The embeddings are written to ``df["Overview_Embeddings"]`` in place, one
    1-D float32 vector per row; other cells rely on that in-place update.

    Args:
        df: DataFrame with an ``Overview`` text column.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        Tuple ``(df, model)``: the same frame that was passed in (now with the
        embeddings column) and the loaded model, for encoding queries later.
    """
    model = SentenceTransformer(model_name)

    # Missing overviews become empty strings so encode() never receives NaN.
    overviews = df["Overview"].fillna("").astype(str).tolist()

    # One batched encode() call instead of a separate model call per row.
    embeddings = np.asarray(model.encode(overviews), dtype=np.float32)

    # Store one vector per row, aligned on the frame's own index.
    df["Overview_Embeddings"] = pd.Series(list(embeddings), index=df.index)
    return df, model


def prepare_rag_data(df, dataset_path, index_path):
    """Save the movies as a RAG-style dataset plus a FAISS index file.

    The dataset has the columns ``title``, ``text`` and ``embeddings``. The
    input frame is not modified.

    Args:
        df: DataFrame with ``Series_Title``, ``Overview`` and
            ``Overview_Embeddings`` columns.
        dataset_path: Directory the dataset is saved to.
        index_path: File the FAISS index is saved to; its parent directory is
            created when missing.

    Returns:
        The in-memory dataset with the ``embeddings`` FAISS index attached.
    """
    import os

    # Build a fresh frame instead of rewriting the caller's embeddings column.
    df_rag = pd.DataFrame(
        {
            "title": df["Series_Title"],
            "text": df["Overview"],
            "embeddings": df["Overview_Embeddings"].apply(
                lambda vector: np.array(vector, dtype=np.float32)
            ),
        }
    )

    # preserve_index=False keeps the pandas index out of the dataset columns.
    rag_dataset = Dataset.from_pandas(df_rag, preserve_index=False)

    # The dataset is saved before the index is attached; the index is written
    # to its own file below.
    rag_dataset.save_to_disk(dataset_path)

    rag_dataset.add_faiss_index(column="embeddings", index_name="embeddings")

    # Make sure the target directory exists before writing the index file.
    index_dir = os.path.dirname(index_path)
    if index_dir:
        os.makedirs(index_dir, exist_ok=True)
    rag_dataset.get_index("embeddings").save(index_path)

    return rag_dataset


# generate_embeddings adds "Overview_Embeddings" to df_selected in place and
# returns that same frame, so `df` and `df_selected` refer to one object.
df, model = generate_embeddings(df_selected)
rag_dataset = prepare_rag_data(df, dataset_path, index_path)

# Build the IVF search index here, where the embeddings are guaranteed to
# exist: the earlier index-building cell runs before this column has been
# created, and the search cell below needs `index`.
index = build_faiss_index(df["Overview_Embeddings"])

### Initializing the RAG Model
We'll now initialize the RAG model, using the FAISS index and dataset we prepared. This model will be used for retrieval-augmented generation.


In [None]:
from datasets import load_from_disk
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
import faiss

def initialize_rag_model(
    dataset_path="/kaggle/working/rag_dataset",
    index_path="/kaggle/working/rag_index/faiss_index",
):
    """Load the saved dataset and index and build the RAG components.

    Args:
        dataset_path: Directory the RAG dataset was saved to. Defaults to the
            location used by the rest of the notebook.
        index_path: File holding the FAISS index for the ``embeddings``
            column. Defaults to the location used by the rest of the notebook.

    Returns:
        Tuple ``(rag_tokenizer, rag_retriever, rag_model)`` loaded from the
        ``facebook/rag-token-nq`` checkpoint, with the retriever backed by the
        loaded dataset.
    """
    dataset = load_from_disk(dataset_path)
    # The index was written to its own file by prepare_rag_data, so reattach
    # it to the dataset under the name "embeddings".
    dataset.load_faiss_index("embeddings", index_path)

    rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
    rag_retriever = RagRetriever.from_pretrained(
        "facebook/rag-token-nq",
        index_name="embeddings",
        indexed_dataset=dataset,
    )
    rag_model = RagTokenForGeneration.from_pretrained(
        "facebook/rag-token-nq", retriever=rag_retriever
    )

    return rag_tokenizer, rag_retriever, rag_model

rag_tokenizer, rag_retriever, rag_model = initialize_rag_model()


### Implementing the Search Functionality
Finally, we implement the search functionality. This enables users to input a query and get a list of similar movies, either by using FAISS or by refining the results with RAG.


In [None]:
def search_query(
    query,
    model,
    index,
    df,
    k=5,
    nprobe=10,
    use_rag=False,
    rag_model=None,
    rag_tokenizer=None,
):
    """Find the rows of ``df`` most similar to ``query``.

    The query is always embedded with ``model``, the same encoder the FAISS
    index was built from, so query and index vectors share one embedding
    space. When RAG is requested and available, a generated answer is returned
    alongside the search results.

    Args:
        query: Free-text search string.
        model: Encoder exposing ``encode(text)``, returning a 1-D vector.
        index: FAISS index exposing ``nprobe`` and ``search(vectors, k)``.
        df: DataFrame whose row positions match the vectors in ``index``.
        k: Number of neighbours to request.
        nprobe: Number of inverted lists the index probes per query.
        use_rag: Whether to also generate a RAG response.
        rag_model: RAG generator exposing ``generate(input_ids=...)``.
        rag_tokenizer: Tokenizer matching ``rag_model``.

    Returns:
        Tuple ``(results, rag_response)``. ``results`` is the slice of ``df``
        for the hits, nearest first. ``rag_response`` is the list of decoded
        generations, or ``None`` when RAG was not requested or no RAG model or
        tokenizer was supplied.
    """
    rag_response = None
    if use_rag and rag_model is not None and rag_tokenizer is not None:
        input_ids = rag_tokenizer(query, return_tensors="pt")["input_ids"]
        # Let the RAG model retrieve and generate; decode its token ids to text.
        generated_ids = rag_model.generate(input_ids=input_ids)
        rag_response = rag_tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True
        )

    # FAISS expects a 2-D float32 matrix with one row per query.
    query_embedding = np.asarray(model.encode(query), dtype=np.float32).reshape(1, -1)

    index.nprobe = nprobe
    distances, indices = index.search(query_embedding, k)

    # FAISS pads with -1 when it finds fewer than k neighbours; iloc would
    # treat -1 as "last row", so drop those placeholders.
    hit_positions = [int(position) for position in indices[0] if position >= 0]
    results = df.iloc[hit_positions]
    return results, rag_response


# Example search: run one query, with RAG enabled, and print the matches.
query = "A story about hope and friendship."
use_rag = True

results, rag_response = search_query(
    query,
    model,
    index,
    df_selected,
    use_rag=use_rag,
    rag_model=rag_model,
    rag_tokenizer=rag_tokenizer,
)

print("Top results:")
for title, overview in zip(results["Series_Title"], results["Overview"]):
    print(f"Title: {title}")
    print(f"Overview: {overview}")
    print("---")

# The RAG section is printed only when a non-empty response came back.
if rag_response:
    print("RAG response:")
    print(rag_response)


## Run the app with Streamlit (Optional)


In [None]:

import inspect

# app.py runs in its own process and cannot see the notebook's namespace, so it
# needs its own imports for everything the pipeline functions use.
_app_imports = """
import faiss
import numpy as np
import pandas as pd
import streamlit as st
from datasets import Dataset, load_from_disk
from sentence_transformers import SentenceTransformer
from transformers import RagRetriever, RagTokenForGeneration, RagTokenizer
"""

# Copy the source of the notebook's pipeline functions into the app so the
# calls below resolve; without this app.py fails with NameError.
_app_functions = "\n\n".join(
    inspect.getsource(func)
    for func in (
        load_data,
        preprocess_data,
        generate_embeddings,
        build_faiss_index,
        prepare_rag_data,
        initialize_rag_model,
        search_query,
    )
)

# The Streamlit page itself. st.cache_resource is used because the cached
# values include models and a FAISS index, which are shared resources rather
# than serializable data.
_app_body = """
data_path = "/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv"
dataset_path = "/kaggle/working/rag_dataset"
index_path = "/kaggle/working/rag_index/faiss_index"


@st.cache_resource
def load_and_prepare_data():
    df = load_data(data_path)
    df_selected = preprocess_data(df)
    df_selected, model = generate_embeddings(df_selected)
    index = build_faiss_index(df_selected["Overview_Embeddings"])

    prepare_rag_data(df_selected, dataset_path, index_path)
    rag_tokenizer, rag_retriever, rag_model = initialize_rag_model()

    return df_selected, model, index, rag_tokenizer, rag_model


df_selected, model, index, rag_tokenizer, rag_model = load_and_prepare_data()

st.title("Movie Search App with RAG")

st.write("Enter a movie description or plot to find similar movies:")

query = st.text_input("Search query", "")

use_rag = st.checkbox("Use RAG to refine search")

if query:
    st.write("Searching for movies similar to your query...")
    results, rag_response = search_query(
        query,
        model,
        index,
        df_selected,
        use_rag=use_rag,
        rag_model=rag_model,
        rag_tokenizer=rag_tokenizer,
    )

    if use_rag and rag_response:
        st.write("Refined response from RAG:")
        st.write(rag_response)

    st.write(f"Top {len(results)} results:")

    for idx, row in results.iterrows():
        st.subheader(row["Series_Title"])
        st.write(f"**Genre:** {row['Genre']}")
        st.write(f"**Director:** {row['Director']}")
        st.write(f"**Overview:** {row['Overview']}")
        st.write(
            f"**Stars:** {row['Star1']}, {row['Star2']}, {row['Star3']}, {row['Star4']}"
        )
        st.write("---")

if st.checkbox("Show dataset"):
    st.write(df_selected.head(10))
"""

app_code = "\n\n".join([_app_imports, _app_functions, _app_body])



In [None]:

# Write the generated Streamlit source to app.py so `streamlit run` can launch it.
with open('/kaggle/working/app.py', 'w') as f:
    f.write(app_code)

In [None]:
# Print this machine's public IPv4 address.
# NOTE(review): presumably needed as the password on the localtunnel landing page — confirm.
!curl ipv4.icanhazip.com

In [None]:

# Start the Streamlit app in the background with its output redirected to
# ./logs.txt, then expose port 8501 through localtunnel.
# NOTE(review): 8501 is presumably the port Streamlit listens on by default — confirm.
!streamlit run /kaggle/working/app.py &>./logs.txt & npx localtunnel --port 8501