In [21]:
import os
import google.generativeai as genai
import chromadb
import fitz
from dotenv import load_dotenv
import requests
import unicodedata
import pdfplumber
import pandas as pd
import re

In [3]:
load_dotenv("../.env")
GOOGLE_API_KEY = os.getenv("API_Key")

In [5]:
def check_api_key():
    try:
        API_URL = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-pro:generateContent"
        headers = {"Content-Type": "application/json"}
        payload = {"contents": [{"parts": [{"text": "Hello"}]}]}

        response = requests.post(f"{API_URL}?key={GOOGLE_API_KEY}", json=payload, headers=headers)
        response.raise_for_status()  

        print("API Key is valid!")

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {response.status_code} - {response.text}")
    except requests.exceptions.RequestException as e:
        print(f"Network error: {e}")

In [7]:
check_api_key()

API Key is valid!


In [9]:
genai.configure(api_key=GOOGLE_API_KEY)

In [11]:
chroma_client = chromadb.PersistentClient(path="../ChromaStorage/BoardGames")
collection = chroma_client.get_or_create_collection(name="BoardGames")

In [53]:
pdf_path = input("Enter the file path: ")

Enter the file path:  ../Files/Board_games.pdf


In [83]:
import pdfplumber

def is_within_bbox(word_bbox, table_bbox):
    x0, y0, x1, y1 = word_bbox
    tx0, ty0, tx1, ty1 = table_bbox
    return (x0 >= tx0 and x1 <= tx1 and y0 >= ty0 and y1 <= ty1)

def extract_text_excluding_tables(pdf_path):
    full_text = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Step 1: Get all table bounding boxes
            table_bboxes = [table.bbox for table in page.find_tables()]

            # Step 2: Get all words with positions
            page_words = page.extract_words()
            cleaned_words = []

            for word in page_words:
                word_bbox = (float(word['x0']), float(word['top']), float(word['x1']), float(word['bottom']))
                
                # Step 3: Only include words that are NOT inside any table bbox
                if not any(is_within_bbox(word_bbox, bbox) for bbox in table_bboxes):
                    cleaned_words.append(word['text'])

            full_text.append(" ".join(cleaned_words))  # Join words for each page

    return "\n\n".join(full_text)

In [87]:
pdf_path = "../Files/Board_games.pdf"
clean_text = extract_text_excluding_tables(pdf_path)
print(clean_text)

Classic Board Games: A Comprehensive Guide Introduction Classic board games have stood the test of time, entertaining generations of players with their simple yet engaging gameplay. This guide explores five iconic board games that have become staples in households around the world. Ludo Origin: Ancient Indian game Pachisi First Commercial Version: 1896 Players: 2-4 Playing Time: 30-60 minutes Overview Ludo is a race game where players move their four tokens from start to finish according to dice rolls. The game's simplicity makes it accessible to players of all ages, while still providing exciting moments as players capture opponents' pieces and race to the center.

Key Features ● Simple rules make it accessible to children as young as 4 ● Element of chance balanced with basic strategy ● Social game that brings families together ● Available in countless variations worldwide (Parcheesi, Sorry!, etc.) Game Mechanics ● Roll-and-move gameplay ● Player interaction through capturing pieces ●

In [89]:
def extract_tables_and_send_to_gemini(pdf_path):
    all_table_texts = []  # 

    model = genai.GenerativeModel("gemini-2.0-flash")

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                if table:
                  
                    df = pd.DataFrame(table[1:], columns=table[0])

                    
                    table_text = df.to_string(index=False)

                    
                    prompt = f"""
                    You are an expert in extracting meaningful, clean text from a table.

                    Below is the table data:

                    {table_text}

                    Convert the table into a clean, human-readable text summary. 
                    Preserve important relationships from the table but present it as plain text.
                    Be concise and clear. Don't use special signs and symbols.
                    """

                   
                    gemini_response = model.generate_content(prompt).parts[0].text

                    
                    all_table_texts.append(gemini_response)

    # After all tables processed, join them together
    concatenated_text = "\n\n".join(all_table_texts)

    return concatenated_text

In [93]:
table_text = extract_tables_and_send_to_gemini(pdf_path)

In [95]:
print(table_text)

Here is a summary of the game characteristics:

Ludo is a game for ages 4 and up, with low complexity, high luck factor, and low strategy, with an average playing time of 30-60 minutes.

Monopoly is for ages 8 and up, with medium complexity, medium luck factor, and medium strategy, and an average playing time of 60-180 minutes.

Chess is for ages 6 and up, with high complexity, no luck factor, and very high strategy, and an average playing time of 10-60+ minutes.

Checkers is for ages 6 and up, with low complexity, no luck factor, and medium strategy, and an average playing time of 15-30 minutes.

Clue is for ages 8 and up, with medium complexity, medium luck factor, and medium strategy, and an average playing time of 45-60 minutes.


Here's a summary of the game skills based on the table:

Ludo: Low in logical thinking and math skills, high in social skills, low in planning, and moderate in patience.

Monopoly: Moderate in logical thinking, high in math and social skills, moderate in 

In [106]:
def append_table_summaries(clean_text, table_summaries):
    final_text = f"{clean_text.strip()}\n\n\n\nTable Summaries:\n\n{table_summaries}"
    return final_text

In [108]:
final_doc_text = append_table_summaries(clean_text, table_text)
print(final_doc_text)

Classic Board Games: A Comprehensive Guide Introduction Classic board games have stood the test of time, entertaining generations of players with their simple yet engaging gameplay. This guide explores five iconic board games that have become staples in households around the world. Ludo Origin: Ancient Indian game Pachisi First Commercial Version: 1896 Players: 2-4 Playing Time: 30-60 minutes Overview Ludo is a race game where players move their four tokens from start to finish according to dice rolls. The game's simplicity makes it accessible to players of all ages, while still providing exciting moments as players capture opponents' pieces and race to the center.

Key Features ● Simple rules make it accessible to children as young as 4 ● Element of chance balanced with basic strategy ● Social game that brings families together ● Available in countless variations worldwide (Parcheesi, Sorry!, etc.) Game Mechanics ● Roll-and-move gameplay ● Player interaction through capturing pieces ●

In [110]:
def clean_text(text):
    return unicodedata.normalize("NFKC", text)

In [112]:
try:
    text = clean_text(final_doc_text)
    print(text)
except Exception as e:
    print(f"An error occurred: {e}")

Classic Board Games: A Comprehensive Guide Introduction Classic board games have stood the test of time, entertaining generations of players with their simple yet engaging gameplay. This guide explores five iconic board games that have become staples in households around the world. Ludo Origin: Ancient Indian game Pachisi First Commercial Version: 1896 Players: 2-4 Playing Time: 30-60 minutes Overview Ludo is a race game where players move their four tokens from start to finish according to dice rolls. The game's simplicity makes it accessible to players of all ages, while still providing exciting moments as players capture opponents' pieces and race to the center.

Key Features ● Simple rules make it accessible to children as young as 4 ● Element of chance balanced with basic strategy ● Social game that brings families together ● Available in countless variations worldwide (Parcheesi, Sorry!, etc.) Game Mechanics ● Roll-and-move gameplay ● Player interaction through capturing pieces ●

In [114]:
def chunk_text(text, chunk_size=100, overlap=50):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunk = ' '.join(words[start:end])
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks

In [128]:
try:
    chunks = chunk_text(text)
    print(chunks)
except Exception as e:
    print(f"An error occurred: {e}")

["Classic Board Games: A Comprehensive Guide Introduction Classic board games have stood the test of time, entertaining generations of players with their simple yet engaging gameplay. This guide explores five iconic board games that have become staples in households around the world. Ludo Origin: Ancient Indian game Pachisi First Commercial Version: 1896 Players: 2-4 Playing Time: 30-60 minutes Overview Ludo is a race game where players move their four tokens from start to finish according to dice rolls. The game's simplicity makes it accessible to players of all ages, while still providing exciting moments as players capture opponents' pieces and", "Version: 1896 Players: 2-4 Playing Time: 30-60 minutes Overview Ludo is a race game where players move their four tokens from start to finish according to dice rolls. The game's simplicity makes it accessible to players of all ages, while still providing exciting moments as players capture opponents' pieces and race to the center. Key Feat

In [118]:
def generate_embeddings(text_chunks):
    embeddings = []
    
    for chunk in text_chunks:
        response = genai.embed_content(
            model="models/text-embedding-004",
            content=chunk,
            task_type="RETRIEVAL_QUERY" 
        )
        embeddings.append(response['embedding'])
    return embeddings

In [120]:
try:
    embeddings = generate_embeddings(chunks)
except Exception as e:
    print(f"An error occurred: {e}")

In [122]:
def store_embeddings_in_chromadb(text_chunks, embeddings):
    for i, (chunk, embedding) in enumerate(zip(text_chunks, embeddings)):
        collection.add(
            ids=[str(i)],
            embeddings=[embedding],
            metadatas=[{"content": chunk}]
        )

In [126]:
try:
    store_embeddings_in_chromadb(chunks, embeddings)
except Exception as e:
    print(f"An error occurred: {e}")

Insert of existing embedding ID: 0
Add of existing embedding ID: 0
Insert of existing embedding ID: 1
Add of existing embedding ID: 1
Insert of existing embedding ID: 2
Add of existing embedding ID: 2
Insert of existing embedding ID: 3
Add of existing embedding ID: 3
Insert of existing embedding ID: 4
Add of existing embedding ID: 4
Insert of existing embedding ID: 5
Add of existing embedding ID: 5
Insert of existing embedding ID: 6
Add of existing embedding ID: 6
Insert of existing embedding ID: 7
Add of existing embedding ID: 7
Insert of existing embedding ID: 8
Add of existing embedding ID: 8
Insert of existing embedding ID: 9
Add of existing embedding ID: 9
Insert of existing embedding ID: 10
Add of existing embedding ID: 10
Insert of existing embedding ID: 11
Add of existing embedding ID: 11
Insert of existing embedding ID: 12
Add of existing embedding ID: 12
Insert of existing embedding ID: 13
Add of existing embedding ID: 13
Insert of existing embedding ID: 14
Add of existing em