<a href="https://colab.research.google.com/github/Avik-Das-567/chatbot-groq-api/blob/main/Chatbot_Groq_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install groq



In [6]:
!pip install faiss-cpu



In [7]:
from groq import Groq
import pandas as pd
import json
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [None]:
x = input("Enter your Groq API Key : ")
client = Groq(api_key = x)
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

In [10]:
def load_csv(filepath):
  df = pd.read_csv(filepath)
  return df

In [11]:
df = load_csv('/content/netflix_titles.csv')
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [12]:
def chunk_dataframe(df, chunk_size = 5):
  chunks = []
  for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i+chunk_size]
    chunks.append(chunk.to_json())
  return chunks

In [13]:
chunks = chunk_dataframe(df, chunk_size = 5)
chunks

['{"show_id":{"0":"s1","1":"s2","2":"s3","3":"s4","4":"s5"},"type":{"0":"Movie","1":"TV Show","2":"TV Show","3":"TV Show","4":"TV Show"},"title":{"0":"Dick Johnson Is Dead","1":"Blood & Water","2":"Ganglands","3":"Jailbirds New Orleans","4":"Kota Factory"},"director":{"0":"Kirsten Johnson","1":null,"2":"Julien Leclercq","3":null,"4":null},"cast":{"0":null,"1":"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng","2":"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera","3":null,"4":"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar"},"country":{"0":"United States","1":"South Africa","2":nul

In [14]:
# Embeddings and Saving it to Vector Database --- RAG

def create_faiss_index(chunks):
  embeddings = embed_model.encode(chunks)
  dim = embeddings.shape[1]
  index = faiss.IndexFlatL2(dim)
  index.add(np.array(embeddings))
  return index, chunks, embeddings

In [15]:
index, chunks, embeddings = create_faiss_index(chunks)

In [16]:
embeddings

array([[-0.00568417,  0.02853673, -0.0152785 , ..., -0.04240915,
        -0.01923778,  0.04605638],
       [-0.0162823 , -0.0319946 ,  0.00223612, ..., -0.02103231,
        -0.00018268,  0.06269485],
       [-0.00439301,  0.04934553, -0.02552585, ..., -0.03280326,
        -0.03300194,  0.05764064],
       ...,
       [ 0.00102975,  0.07110985, -0.00855133, ..., -0.03394876,
        -0.00698007,  0.05441854],
       [ 0.00978877,  0.08129706, -0.00240051, ..., -0.05025837,
        -0.02136837,  0.04398954],
       [-0.01578068,  0.05159203,  0.00084362, ..., -0.03287766,
        -0.04273923,  0.02654465]], dtype=float32)

In [17]:
embeddings[0]

array([-5.68417413e-03,  2.85367314e-02, -1.52784977e-02, -3.01207080e-02,
       -1.14546670e-03,  6.07029088e-02,  7.51250563e-03, -2.87274141e-02,
        3.34281512e-02, -1.80931259e-02, -9.25332028e-03, -5.29771708e-02,
        1.20181339e-02,  2.56350357e-02, -2.96869371e-02,  5.62441610e-02,
       -4.64965589e-02,  2.93799373e-03, -1.88623145e-02, -7.45973438e-02,
        7.97859486e-03,  2.08737329e-02,  4.53350917e-02, -8.97485912e-02,
        1.18505964e-02,  4.03699763e-02, -1.38558438e-02,  3.46412063e-02,
       -5.05700074e-02, -1.06930779e-02,  3.57948500e-03,  2.08124388e-02,
        2.82594152e-02,  5.12035452e-02, -1.08746556e-03,  9.31577384e-03,
       -4.71101627e-02,  5.45987021e-03,  2.50736009e-02,  7.92535692e-02,
        1.02717325e-01,  5.24142310e-02, -5.58680035e-02, -1.13494508e-01,
       -8.98002367e-03, -7.63553381e-02, -9.80008319e-02, -2.08931454e-02,
        4.42238562e-02,  5.10200560e-02, -8.50073770e-02, -2.57491432e-02,
       -2.67600548e-02,  

In [18]:
# Performing Retrieval

def retrieve_relevant_chunks(query, index, chunks, embeddings, k = 2):
  query_vec = embed_model.encode([query])
  distances, indices = index.search(query_vec, k)
  return [chunks[i] for i in indices[0]]

In [19]:
def generate_response_from_groq(context, query):
  types = df.dtypes.to_string()
  prompt = f"""You are a smart data assistant. Here's some context from a csv file.\n\n{context}\n\nQuestions :{query}\nAnswer :"""

  completion = client.chat.completions.create(
      model = 'llama3-8b-8192',
      messages = [
          {"role": "system", "content": "You are a Data Analyst Assistant"},
          {"role": "user", "content": prompt}
      ],
      temperature = 0.2
  )
  return completion.choices[0].message.content

In [20]:
def generate_insights(df):
  stats = df.describe(include = 'all').to_string()
  prompt = f"""Here is a dataset summary :\n{stats}\n\nGenerate 3 smart insights or trends that you observe."""

  completion = client.chat.completions.create(
      model = 'llama3-8b-8192',
      messages = [
          {"role": "user", "content": prompt}
      ],
  )
  return completion.choices[0].message.content

In [22]:
if __name__ == "__main__":
  filepath = '/content/netflix_titles.csv'
  df = load_csv(filepath)
  chunks = chunk_dataframe(df)
  index, chunk_data, embeddings = create_faiss_index(chunks)

  while True:
    query = input("Ask a question (or type 'insight' or 'exit) : ")
    if query.lower() == "exit":
      break
    else:
      context = "\n\n".join(retrieve_relevant_chunks(query, index, chunk_data, embeddings))
      answer = generate_response_from_groq(context, query)
      print("\nAnswer : ", answer, "\n")

Ask a question (or type 'insight' or 'exit) : insight

Answer :  Based on the provided CSV file, here are some insights and answers to your questions:

1. What is the distribution of show types (Movie/TV Show) in the dataset?

From the data, we can see that there are 5 movies and 1 TV show.

2. Which show has the longest duration?

The show with the longest duration is "The Mayo Clinic" with a duration of 116 minutes.

3. Which show has the highest rating?

The show with the highest rating is "The Blue Planet: A Natural History of the Oceans" with a rating of TV-G.

4. Which show was added to the dataset the earliest?

The show that was added to the dataset the earliest is "The Memphis Belle: A Story of a Flying Fortress" with a date added of March 31, 2017.

5. Which show has the most cast members?

The show with the most cast members is "The Matrix Reloaded" with 14 cast members.

6. Which country is represented the most in the dataset?

The country represented the most in the datase