In [16]:
#!pip install google-genai
#!pip install pymongo

In [1]:
import pandas as pd
import pymysql
import sqlalchemy
from pymongo import MongoClient
import os
from google import genai
import json

## API Set Up: Google Gemini LLM API

In [2]:
# Set up API key for Google Gemini LLM API
# Read in the key from json file
def load_config():
    with open("config.json", "r") as file:
        return json.load(file)

In [3]:
# load the key
config = load_config()
API_KEY = config["API_KEY"]

In [4]:
# Set up the AI model we want to use
class Custom_GenAI:

    def __init__(self, API_KEY):
        
        self.client = genai.Client(api_key=API_KEY)

    def ask_ai(self, question):
        response = self.client.models.generate_content(
        model="gemini-2.0-flash",
        contents=question,
        )

        return response.text

In [5]:
ai = Custom_GenAI(API_KEY)

### Test the AI

In [6]:
# Test whether the AI works
ques = "How many states are in US, give me simple answer"
res = ai.ask_ai(ques)
print(res)

50



## Database Connection

### MySQL & MongoDB NLI

In [29]:
# MySQL Connection 
apt = sqlalchemy.create_engine("mysql+pymysql://root:Dsci-551@localhost/aptadditional")
# MongoDB Connection 
client = MongoClient("mongodb://localhost:27017/")

In [38]:
class Custom_GenAI:

    def __init__(self, API_KEY):
        self.client = genai.Client(api_key=API_KEY)

    def ask_ai(self, question):
        prompt = f"""
You are a natural language interface for both MySQL and MongoDB databases.

The user has access to the following databases and their respective tables/collections:

MySQL Database: aptadditional
- amenities (amenity_id, amenity_name)
- property_amenities (id, amenity_id)
- pricing (id, price, currency)
- price_details (id, price_display, price_type)
- pets (id, pets_allowed, fee)
    - `pets_allowed` includes only: 'Cats,Dogs', 'No pets allowed', 'Cats', or 'Dogs'
    - `fee` includes 'Yes' or 'No' for extra fees

MongoDB Database: rental
- general_info (id, title, body)
    - `title` is a short description of the apartment
    - `body` is the long description of the apartment
- location (id, cityname, state, latitude, longitude)
    - `state` uses U.S. state abbreviations like 'CA', 'NY', 'NC'
- property_details (id, square_feet, bedrooms, bathrooms)
- media (id, has_photo)
- sources (id, source, time)

Assume mySQL has initialized as:
apt = sqlalchemy.create_engine("mysql+pymysql://root:Dsci-551@localhost/aptadditional")

Assume MongoClient has been initialized as:
client = MongoClient("mongodb://localhost:27017/")

Your task:
- Determine whether the query should run on the MySQL or MongoDB database
- Return a valid JSON object with two keys:
  - "engine": either "mysql" or "mongodb"
  - "query": a single-line SQL or a **single-line valid PyMongo command**

Pymongo Supported commands:
- `.find(filter, projection)` for simple filtering
- `.aggregate([...])` for advanced operations using `$match`, `$group`, `$sort`, `$limit`, `$skip`, `$project`, `$lookup`
- `.insertOne(...)`, `.insertMany(...)`, `.updateOne(...)`, `.deleteOne(...)` for data modification
- Do **not** use unsupported methods like `.count_documents()`, `.find_one()`, etc.

Always:
- Use fully qualified MySQL table names (e.g., `additionalInfo.pricing`)
- Format the response as a single-line JSON 
- Use **PyMongo command only** for MongoDB: Use PyMongo syntax with client[...] for all MongoDB collections.
Example: client['rental'].aggregate([...])


**DO NOT** explain the query.  
**DO NOT** use natural language.  
**DO NOT** return markdown.

---

Question: {question}
"""
        try:
            response = self.client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt
            )
            
            raw_output = response.text.strip()
            
            # Handle common formatting issues
            if raw_output.startswith("```json"):
                raw_output = raw_output.replace("```json", "").replace("```", "").strip()
            
            parsed = json.loads(raw_output)
            return parsed  # { "engine": ..., "query": ... }

        except json.JSONDecodeError as e:
            print("Failed to parse LLM output as JSON:", response.text)
            raise e

In [40]:
ai = Custom_GenAI(API_KEY)

print("Welcome to the Unified Database Natural Language Interface!")
print("Type 'exit' to quit.\n")

while True:
    question = input("Enter your question: ")
    if question.lower() == "exit":
        print("Thank you and bye!")
        break

    try:
        response = ai.ask_ai(question)
        
        print("\nEngine Selected:", response["engine"])
        print("Generated Query:", response["query"])

        if response["engine"] == "mysql":
            query = response["query"].replace("%", "%%") 
            result_sql = pd.read_sql(query, apt)
            print("\nMySQL Query Result:")
            display(result_sql.head())

        elif response["engine"] == "mongodb":
            mongo_query = response["query"].replace("```python", "").replace("```", "").strip()
            try:
                result_mongo = list(eval(mongo_query))
                print("\nMongoDB Query Result:")
                if len(result_mongo) == 1 and isinstance(result_mongo[0], dict):
                    # print any key-value result
                    for k, v in result_mongo[0].items():
                        print(f"{k.replace('_', ' ').capitalize()}: {v}")
                elif result_mongo:
                    for doc in result_mongo:
                        print(doc)
                else:
                    print("No results found.")
            except Exception as e:
                print("MongoDB query execution error:", e)

        else:
            print("Unknown engine specified in response.")

    except json.JSONDecodeError:
        print("Error: LLM output could not be parsed as JSON.")
    except Exception as e:
        print("Execution error:", e)

Welcome to the Unified Database Natural Language Interface!
Type 'exit' to quit.



Enter your question:  What is the average price of apartments that allow dogs?



Engine Selected: mysql
Generated Query: SELECT AVG(p.price) FROM aptadditional.pricing p JOIN aptadditional.pets pet ON p.id = pet.id WHERE pet.pets_allowed LIKE '%Dogs%'

MySQL Query Result:


Unnamed: 0,AVG(p.price)
0,1509.9131


Enter your question:  exit


Thank you and bye!


### MySQL

In [21]:
# MySQL Connection 
apt = sqlalchemy.create_engine("mysql+pymysql://root:Dsci-551@localhost/aptadditional")

In [22]:
class Custom_GenAI:

    def __init__(self, API_KEY):
        self.client = genai.Client(api_key=API_KEY)

    def ask_ai(self, question):
        prompt = f"""
You are a MySQL database assistant.

The user has access to the following MySQL databases and tables:

Database 1:  `rental`
- `general_info(id, title, cityname, state, latitude, longitude)`
  - `title` is a short description of the apartment
  - `state` uses U.S. state abbreviations like 'CA', 'NY', 'NC'

- `property_details(id, square_feet, bedrooms, bathrooms, pets_allowed)`
  - `pets_allowed` includes only: 'Cats,Dogs', 'No pets allowed', 'Cats', or 'Dogs'

- `amenities(amenity_id, amenity_name)`
- `property_amenities(id, amenity_id)`

Database 2: `price`
- `pricing(id, price, currency)`
- `price_details(id, price_display, price_type)`

Database 3: `aptadditional`
- `media(id, has_photo)`
- `sources(id, source, time)`

Your job is to **only** return a valid **MySQL SQL query** using **fully qualified table names**, e.g. `rental.general_info`.

**DO NOT explain anything. DO NOT provide natural language responses. DO NOT include markdown.**
Only return the SQL query.

---

Question: {question}
"""
        response = self.client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt
        )

        # Remove markdown, comments, and extra text
        sql_raw = response.text
        sql_clean = sql_raw.strip().strip("```sql").strip("```").splitlines()
        
        sql_lines = [line for line in sql_clean if not line.strip().startswith("--") and not line.strip().lower().startswith("please") and line.strip()]
        return "\n".join(sql_lines)

In [23]:
ai = Custom_GenAI(API_KEY)

# NLI Section
print("Welcome to MySQL Natural Language to Database Interface!")
print("Type 'exit' to quit.\n")

while True:
    question = input("Enter your question: ")
    if question.lower() == "exit":
        print("Thank you and bye!")
        break
    
    sql_query = ai.ask_ai(question)
    print("\nGenerated SQL Query:", sql_query)

    try:
        result_sql = pd.read_sql(sql_query, apt)
        print("\nQuery Result:")
        display(result_sql.head())
    except Exception as e:
        print("Error executing SQL:", e)

Welcome to MySQL Natural Language to Database Interface!
Type 'exit' to quit.



Enter your question:  How many apartments have 3 bedrooms and allow cats in NC?



Generated SQL Query: ```sql
SELECT
  COUNT(DISTINCT rental.general_info.id)
FROM rental.general_info
JOIN rental.property_details
  ON rental.general_info.id = rental.property_details.id
WHERE
  rental.property_details.bedrooms = 3 AND rental.property_details.pets_allowed LIKE '%Cats%' AND rental.general_info.state = 'NC';
```
Error executing SQL: unsupported format character 'C' (0x43) at index 259


Enter your question:  exit


Thank you and bye!


### MongoDB

In [24]:
# === MongoDB Connection ===
client = MongoClient("mongodb://localhost:27017/")

In [30]:
# === Custom Google LLM Wrapper ===
class Custom_GenAI:
    def __init__(self, API_KEY):
        self.client = genai.Client(api_key=API_KEY)

    def ask_ai(self, question):
        prompt = f"""
You are an assistant that translates natural language questions into valid PyMongo commands. 
Use the following databases and collections:

Database 1:  `rental`
- `general_info(id, title, cityname, state, latitude, longitude)`
  - `title` is a short description of the apartment
  - `state` uses U.S. state abbreviations like 'CA', 'NY', 'NC'

- `property_details(id, square_feet, bedrooms, bathrooms, pets_allowed)`
  - `pets_allowed` includes only: 'Cats,Dogs', 'No pets allowed', 'Cats', or 'Dogs'

- `amenities(amenity_id, amenity_name)`
- `property_amenities(id, amenity_id)`

Database 2: Database `price`
- `pricing(id, price, currency)`
- `price_details(id, price_display, price_type)`

Database 3: Database `aptadditional`
- `media(id, has_photo)`
- `sources(id, source, time)`

Assume MongoClient has been initialized as:
client = MongoClient("mongodb://localhost:27017/")

Your task is to choose the correct command format based on query complexity:

Supported commands:
- `.find(filter, projection)` for simple filtering
- `.aggregate([...])` for advanced operations using `$match`, `$group`, `$sort`, `$limit`, `$skip`, `$project`, `$lookup`
- `.insertOne(...)`, `.insertMany(...)`, `.updateOne(...)`, `.deleteOne(...)` for data modification

Do **not** use unsupported methods like `.count_documents()`, `.find_one()`, etc.

Always use the correct database and collection, and return a **single-line valid PyMongo command**. 
Return ONLY the PyMongo command — no explanations, no markdown, no line breaks.

Question: {question}
"""
        response = self.client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt,
        )
        return response.text.strip()

In [31]:
ai = Custom_GenAI(API_KEY)

# NLI Section
print("Welcome to MongoDB Natural Language to Database Interface!")
print("Type 'exit' to quit.\n")

while True:
    question = input("Enter your question: ")
    if question.lower() == "exit":
        print("Thank you and bye!")
        break

    mongo_query = ai.ask_ai(question)
    mongo_query = mongo_query.replace("```python", "").replace("```", "").strip()

    print("\nGenerated Mongo Query:", mongo_query)
    try:
        result_mongo = list(eval(mongo_query))
        print("\nMongoDB Query Result:")
        if len(result_mongo) == 1 and isinstance(result_mongo[0], dict):
            # print any key-value result
            for k, v in result_mongo[0].items():
                print(f"{k.replace('_', ' ').capitalize()}: {v}")
        elif result_mongo:
            for doc in result_mongo:
                print(doc)
    except Exception as e:
        print("Error executing MongoDB query:", e)

Welcome to MongoDB Natural Language to Database Interface!
Type 'exit' to quit.



Enter your question:  Show 3 apartments in NC with title only



Generated Mongo Query: client["rental"]["general_info"].find({"state": "NC"}, {"title": 1, "_id": 0}).limit(3)

MongoDB Query Result:
{'title': 'Three BR 3101 Morningside Drive'}
{'title': 'Three BR 5206 Moonlight Drive Trail Sw'}
{'title': 'Studio apartment 4016 Twickenham Court'}


Enter your question:  How many listings are there in NC?



Generated Mongo Query: client['rental']['general_info'].aggregate([{"$match": {"state": "NC"}}, {"$count": "total"}])

MongoDB Query Result:
Total: 6292


Enter your question:  exit


Thank you and bye!
