In [7]:
import os
import pandas as pd
import sqlite3
import numpy as np
import time
from tqdm import tqdm
import pickle
from concurrent.futures import ThreadPoolExecutor, as_completed
from google import genai
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
# Read the Gemini API key from the process environment (load_dotenv() was
# called in the import cell) and build the shared client used for embedding.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
client = genai.Client(api_key=GOOGLE_API_KEY)

# Location of the labeled sentiment CSV, relative to the working directory.
data_path = "../../datasets/sentiment_data.csv"

In [8]:
# Load the header-less CSV, name its two columns, normalise the labels
# (lower-case, surrounding whitespace removed), and discard rows that are
# missing either the text or the label.
df = (
    pd.read_csv(data_path, encoding="ISO-8859-1", header=None)
    .set_axis(['label', 'text'], axis=1)
    .assign(label=lambda frame: frame['label'].str.lower().str.strip())
    .dropna(subset=["text", "label"])
    .reset_index(drop=True)
)

In [13]:
def get_gemini_embedding(text):
    """Embed one piece of text with the Gemini ``text-embedding-004`` model.

    Args:
        text: The content to embed (passed through as ``contents``).

    Returns:
        The ``values`` of the first embedding in the response, or ``None``
        when ``text`` is ``None`` / blank or when the request fails. Failures
        are printed rather than raised so a long batch run keeps going.
    """
    # Nothing to embed: skip the API round-trip (and the throttle delay).
    if text is None or (isinstance(text, str) and not text.strip()):
        print("Embedding skipped: empty text")
        return None

    try:
        response = client.models.embed_content(
            model="models/text-embedding-004",
            contents=text,
            config={"task_type": 'RETRIEVAL_DOCUMENT'}
        )
        return response.embeddings[0].values
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller
        # decide what to do with a missing embedding.
        print("Embedding failed:", e)
        return None
    finally:
        # Throttle after every request, successful or not. Previously the
        # pause only happened on success, so a failing call (for example a
        # rate-limit error) was followed immediately by the next request.
        time.sleep(0.7)

In [15]:
# Create SQLite DB
# Embeds every row of `df` and stores (text, label, embedding) in the `news`
# table. The cell is safe to re-run: texts that are already stored are
# skipped, so an interrupted run can be resumed and a completed run does not
# append a second copy of every row.
conn = sqlite3.connect("labeled_news.db")
try:
    c = conn.cursor()
    c.execute("""
CREATE TABLE IF NOT EXISTS news (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT,
    label TEXT,
    embedding BLOB
)
""")

    # Snapshot of what earlier runs already stored. It is taken once, before
    # the loop, so a fresh database behaves exactly as before (every row of
    # `df`, including repeated sentences, is embedded and inserted).
    c.execute("SELECT text FROM news")
    already_stored = {stored_text for (stored_text,) in c.fetchall()}

    inserted = 0
    skipped = 0
    failed = 0
    for i, row in tqdm(df.iterrows(), total=len(df)):
        if row['text'] in already_stored:
            skipped += 1
            continue

        emb = get_gemini_embedding(row['text'])
        if emb is None:
            # Keep going, but remember the miss so it is reported below.
            failed += 1
            continue

        # NOTE(review): embeddings are stored pickled; only unpickle blobs
        # from a database you produced yourself.
        emb_blob = pickle.dumps(np.array(emb, dtype=np.float32))
        c.execute(
            "INSERT INTO news (text, label, embedding) VALUES (?, ?, ?)",
            (row['text'], row['label'], emb_blob)
        )
        inserted += 1
        # Commit in batches so an interruption loses at most 99 new rows.
        if inserted % 100 == 0:
            conn.commit()

    conn.commit()
finally:
    # Always release the database handle, even if the loop raised.
    conn.close()

print(f"Inserted: {inserted}, already stored: {skipped}, failed to embed: {failed}")
print("Done! Embeddings and data saved in labeled_news.db")

100%|█████████████████████████████████████| 4846/4846 [1:14:23<00:00,  1.09it/s]

Done! Embeddings and data saved in labeled_news.db





## Explore Stored Index

In [16]:
# Sanity-check the stored index: report the row count and show a few rows.
conn = sqlite3.connect("labeled_news.db")
try:
    c = conn.cursor()

    # Check how many rows are in the news table
    c.execute("SELECT COUNT(*) FROM news")
    row_count = c.fetchone()[0]
    print(f"Number of rows in DB: {row_count}")

    # Fetch and display a few sample rows (e.g., first 3)
    c.execute("SELECT id, text, label, embedding FROM news LIMIT 3")
    samples = c.fetchall()

    for sample in samples:
        sample_id, text, label, emb_blob = sample
        print(f"\nID: {sample_id}")
        print(f"Label: {label}")
        print(f"Text: {text[:100]}{'...' if len(text) > 100 else ''}")  # Print first 100 chars for brevity
        # NOTE(review): pickle.loads can execute arbitrary code; only use it
        # on a database file that this notebook wrote.
        embedding = pickle.loads(emb_blob)
        print(f"Embedding shape: {embedding.shape}, First 5 values: {embedding[:5]}")
finally:
    # Close the connection even when a query or unpickling step fails.
    conn.close()

Number of rows in DB: 4846

ID: 1
Label: neutral
Text: According to Gran , the company has no plans to move all production to Russia , although that is whe...
Embedding shape: (768,), First 5 values: [-0.00908443 -0.01821281 -0.02909375 -0.01231977 -0.00335723]

ID: 2
Label: neutral
Text: Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to hos...
Embedding shape: (768,), First 5 values: [-0.02230478  0.00380668 -0.00939238 -0.01575931  0.01340601]

ID: 3
Label: negative
Text: The international electronic industry company Elcoteq has laid off tens of employees from its Tallin...
Embedding shape: (768,), First 5 values: [-0.00400596  0.00566283  0.01386932  0.02713201  0.04100661]
