# Employment-Focused City RAG Database Setup
This notebook loads employment and life-stage data into Pinecone for the Rate My City system.

In [None]:
from dotenv import load_dotenv
load_dotenv()
import os
import json
import google.generativeai as genai
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

In [None]:
# Read both service credentials from the environment
# (load_dotenv() is called in the import cell of this notebook).
gemini_api_key = os.getenv("GEMINIAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Register the key with the Gemini SDK
genai.configure(api_key=gemini_api_key)

# Build the Pinecone client used by the cells that follow
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# Make sure the Pinecone index used by this notebook exists; create it if missing
index_name = "employment-rag"

existing_index_names = pc.list_indexes().names()

if index_name in existing_index_names:
    print(f"Index {index_name} already exists")
else:
    # Serverless index hosted on AWS us-east-1
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=768,  # text-embedding-004 produces 768-dimensional vectors
        metric="cosine",
        spec=serverless_spec,
        deletion_protection="disabled",
    )
    print(f"Created index: {index_name}")

In [None]:
# Load employment reviews data
# The rest of this notebook reads data['reviews'] as a list of review records.
# The encoding is given explicitly: without it open() falls back to the
# platform locale encoding, which can mis-decode non-ASCII text in the JSON file.
with open("employment_reviews.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"Loaded {len(data['reviews'])} city employment reviews")
# Only preview a record when there is one, so an empty dataset does not
# raise IndexError here.
if data['reviews']:
    print("Sample review:")
    print(data['reviews'][0])

In [None]:
# Create embeddings for employment data
# Each review's text is embedded with Gemini and stored together with its
# flattened employment metrics in the record layout passed to index.upsert()
# later in this notebook ("id", "values", "metadata").
# A review that fails (API error, missing field, ...) is reported and skipped so
# that one bad record does not stop the whole run.
processed_data = []
model_name = "models/text-embedding-004"
total_reviews = len(data['reviews'])  # loop-invariant; used in the progress output

for i, review in enumerate(data['reviews']):
    try:
        # Text that gets embedded
        review_text = review['review']

        # Generate embedding
        response = genai.embed_content(
            model=model_name,
            content=review_text,
        )

        # Name the nested dicts once instead of re-indexing them per field
        metrics = review['employment_metrics']
        stage_scores = metrics['life_stage_scores']

        # Prepare data for Pinecone
        processed_data.append({
            # "<city>_<state>" with spaces replaced by underscores
            "id": f"{review['city']}_{review['state']}".replace(" ", "_"),
            "values": response['embedding'],
            "metadata": {
                "city": review['city'],
                "state": review['state'],
                "stars": review['stars'],
                "review": review_text,
                "unemployment_rate": metrics['unemployment_rate'],
                "average_salary": metrics['average_salary'],
                "job_growth_rate": metrics['job_growth_rate'],
                # Stored as one comma-separated string rather than a list
                "top_industries": ", ".join(metrics['top_industries']),
                "recent_graduate_score": stage_scores['recent_graduate'],
                "mid_career_score": stage_scores['mid_career'],
                "career_change_score": stage_scores['career_change'],
                "family_starting_score": stage_scores['family_starting'],
                "pre_retirement_score": stage_scores['pre_retirement'],
            },
        })

        print(f"Processed {i+1}/{total_reviews}: {review['city']}, {review['state']}")

    except Exception as e:
        # Deliberately broad: this is a best-effort batch job.
        # Build the label defensively -- if the failure was a missing 'city'
        # key (or a non-dict record), indexing review['city'] here would raise
        # again inside the handler and abort the entire loop.
        fallback_label = f"review #{i+1}"
        if isinstance(review, dict):
            label = review.get('city', fallback_label)
        else:
            label = fallback_label
        print(f"Error processing {label}: {e}")

print(f"\nSuccessfully processed {len(processed_data)} cities")

In [None]:
# Sanity-check the first processed record (skipped when nothing was processed)
if processed_data:
    print("Sample processed data:")
    sample = processed_data[0]
    sample_metadata = sample['metadata']
    vector_length = len(sample['values'])
    print(f"ID: {sample['id']}")
    print(f"Vector dimensions: {vector_length}")
    print(f"Metadata keys: {list(sample_metadata.keys())}")
    print(f"City: {sample_metadata['city']}")
    print(f"Average salary: ${sample_metadata['average_salary']:,}")

In [None]:
# Push the processed records to Pinecone
index = pc.Index(index_name)

# Records are sent in small slices to avoid rate limits
batch_size = 10
record_count = len(processed_data)
# Ceiling division written as in the progress message: (n - 1) // size + 1
total_batches = (record_count - 1) // batch_size + 1

for batch_number, start in enumerate(range(0, record_count, batch_size), start=1):
    batch = processed_data[start:start + batch_size]
    index.upsert(vectors=batch, namespace="employment-data")
    print(f"Uploaded batch {batch_number}/{total_batches}")

print("All data uploaded to Pinecone!")

In [None]:
# Read back index statistics to confirm the upload
stats = index.describe_index_stats()

print("Index Statistics:")
print(f"Total vectors: {stats.total_vector_count}")

namespace_stats = stats.namespaces
print(f"Namespaces: {list(namespace_stats.keys())}")

# Report the namespace written by the upload cell, if it is present
if 'employment-data' in namespace_stats:
    employment_stats = namespace_stats['employment-data']
    print(f"Employment data vectors: {employment_stats.vector_count}")

In [None]:
# Smoke-test retrieval: embed a sample question and fetch the closest cities
test_query = "I'm a recent graduate looking for tech jobs with good salary"
query_response = genai.embed_content(model=model_name, content=test_query)
test_embedding = query_response['embedding']

# Search the namespace that the upload cell wrote to
results = index.query(
    namespace="employment-data",
    vector=test_embedding,
    top_k=3,
    include_metadata=True,
)

print("\nTest Query Results:")
for rank, match in enumerate(results.matches, start=1):
    meta = match.metadata
    print(f"\n{rank}. {meta['city']}, {meta['state']} (Score: {match.score:.3f})")
    print(f"   Recent Graduate Score: {meta['recent_graduate_score']}/10")
    print(f"   Average Salary: ${meta['average_salary']:,}")
    print(f"   Top Industries: {meta['top_industries']}")