In [38]:
import json
import faiss
import numpy as np
import google.generativeai as genai
import qdrant_client
from qdrant_client.models import VectorParams, Distance
import os
from sentence_transformers import SentenceTransformer

: 

In [42]:
# Set your Google Cloud project and credentials
from google.colab import userdata

os.environ["GEMINI_API_KEY"] = userdata.get('GEMINI_API_KEY')

# Initialize Google Cloud AI Platform
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Initialize Qdrant client
client = qdrant_client.QdrantClient("localhost", port=6333)  # Adjust for your Qdrant instance

In [19]:
# Load the JSON data (for testing, load only the first 10 courses)
with open('all_courses.json', 'r') as file:
    courses = json.load(file)

# Use only the first 10 courses by taking the first 10 keys
courses_keys = ["APL100", "MTL100", "MTL101", "CML101", "MTL106", "COL106", "MTL122", "COL759", "MTL145", "ELL101", "CVL759", "SBL100"]  # Getting the first 10 course codes

In [20]:
# Initialize model for embedding generation
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can change this to other models as well

# Extract course descriptions (assuming 'data' contains the course description)
course_descriptions = [courses[course_code]['data'] for course_code in courses_keys]

# Generate embeddings for the course descriptions
embeddings = model.encode(course_descriptions)

# Convert embeddings to numpy array (Faiss requires numpy arrays)
embeddings = np.array(embeddings).astype(np.float32)

In [21]:
# Initialize Faiss index
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance for similarity search

# Add embeddings to the index
index.add(embeddings)

In [22]:
# Function to query the index
def query_courses(query, top_k=5):
    # Generate the embedding for the query
    query_embedding = model.encode([query]).astype(np.float32)

    # Perform the search in the Faiss index
    distances, indices = index.search(query_embedding, k=top_k)

    # Output the search results
    print(f"Top {top_k} courses for query: '{query}':")
    for i in range(top_k):
        course_code = courses_keys[indices[0][i]]
        course_data = courses[course_code]
        course_name = course_code  # Use course code as name, or extract another field if needed
        distance = distances[0][i]
        print(f"Course Code: {course_code}, Course Name: {course_name}, Distance: {distance}")
        print(f"Course Description: {course_data['data']}\n")

In [26]:
query_courses("number theory", 6)

Top 6 courses for query: 'number theory':
Course Code: MTL145, Course Name: MTL145, Distance: 0.9712423086166382
Course Description: Divisibility: basic definition, properties, prime numbers, some
results on distribution of primes; Congruences: basic definitions
and properties, complete and reduced residue systems, theorems
of Fermat, Euler & Wilson, application to RSA cryptosystem,
linear congruences and Chinese Remainder theorem, quadratic
congruences, and Quadratic Reciprocity law; Arithmetical functions:
examples, with some properties and their rate of growth; Continued
fractions, and their connections with Diophantine approximatins,
applications tolinear and Pell’s equations; Binary quadratic forms;
Partition: basic properties and results; Diophatine equations: linear
and quadratic, some general equations.

Course Code: COL759, Course Name: COL759, Distance: 1.2121696472167969
Course Description: Part 1: Foundations: Perfect secrecy and its limitations, computational
security, pse

In [43]:
client

<qdrant_client.qdrant_client.QdrantClient at 0x79c127e31db0>

In [45]:
# Create the model with generation configuration
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

# Initialize the model
model = genai.GenerativeModel(
    model_name="gemini-2.0-flash-exp",  # Using Gemini 2.0 model
    generation_config=generation_config,
)

# Function to get embeddings from Google Generative AI (Vertex AI)
def get_google_embeddings(text):
    response = model.predict(instances=[{"content": text}])
    embeddings = response.predictions[0]["embedding"]
    return np.array(embeddings).astype(np.float32)

# Initialize Qdrant collection if it doesn't exist
collection_name = "courses"
if client.collection_exists(collection_name=collection_name):
    client.create_collection(
        collection_name=collection_name,
        vector_params=VectorParams(size=768, distance=Distance.COSINE)  # Assuming embeddings size = 768
    )

# Create and insert embeddings for courses into Qdrant
course_embeddings = [get_google_embeddings(description) for description in course_descriptions]
course_ids = list(range(len(course_embeddings)))

# Upsert embeddings into Qdrant
client.upsert(
    collection_name=collection_name,
    points=[(str(course_ids[i]), course_embeddings[i]) for i in range(len(course_embeddings))]
)

ResponseHandlingException: [Errno 99] Cannot assign requested address