In [16]:
#!pip install google-genai
#!pip install pymongo

In [1]:
import pandas as pd
import pymysql
import sqlalchemy
from pymongo import MongoClient
import os
from google import genai
import json

## API Set Up: Google Gemini LLM API

In [2]:
# Set up API key for Google Gemini LLM API
# Read in the key from json file
def load_config():
    with open("config.json", "r") as file:
        return json.load(file)

In [3]:
# load the key
config = load_config()
API_KEY = config["API_KEY"]

In [4]:
# Set up the AI model we want to use
class Custom_GenAI:

    def __init__(self, API_KEY):
        self.client = genai.Client(api_key=API_KEY)

    def ask_ai(self, question):
        response = self.client.models.generate_content(
        model="gemini-2.0-flash",
        contents=question,
        )

        return response.text

In [5]:
ai = Custom_GenAI(API_KEY)

### Test the AI

In [6]:
# Test whether the AI works
ques = "How many states are in US, give me simple answer"
res = ai.ask_ai(ques)
print(res)

There are 50 states in the US.



## Database Connection

### MySQL

In [21]:
# MySQL Connection 
apt = sqlalchemy.create_engine("mysql+pymysql://root:Dsci-551@localhost/rental")

In [22]:
class Custom_GenAI:

    def __init__(self, API_KEY):
        self.client = genai.Client(api_key=API_KEY)

    def ask_ai(self, question):
        prompt = f"""
You are a MySQL database assistant.

The user has access to the following MySQL databases and tables:

Database 1:  `rental`
- `general_info(id, title, cityname, state, latitude, longitude)`
  - `title` is a short description of the apartment
  - `state` uses U.S. state abbreviations like 'CA', 'NY', 'NC'

- `property_details(id, square_feet, bedrooms, bathrooms, pets_allowed)`
  - `pets_allowed` includes only: 'Cats,Dogs', 'No pets allowed', 'Cats', or 'Dogs'

- `amenities(amenity_id, amenity_name)`
- `property_amenities(id, amenity_id)`

Database 2: `price`
- `pricing(id, price, currency)`
- `price_details(id, price_display, price_type)`

Database 3: `aptadditional`
- `media(id, has_photo)`
- `sources(id, source, time)`

Your job is to **only** return a valid **MySQL SQL query** using **fully qualified table names**, e.g. `rental.general_info`.

**DO NOT explain anything. DO NOT provide natural language responses. DO NOT include markdown.**
Only return the SQL query.

---

Question: {question}
"""
        response = self.client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt
        )

        # Remove markdown, comments, and extra text
        sql_raw = response.text
        sql_clean = sql_raw.strip().strip("```sql").strip("```").splitlines()
        
        sql_lines = [line for line in sql_clean if not line.strip().startswith("--") and not line.strip().lower().startswith("please") and line.strip()]
        return "\n".join(sql_lines)

In [23]:
ai = Custom_GenAI(API_KEY)

# NLI Section
print("Welcome to MySQL Natural Language to Database Interface!")
print("Type 'exit' to quit.\n")

while True:
    question = input("Enter your question: ")
    if question.lower() == "exit":
        print("Thank you and bye!")
        break
    
    sql_query = ai.ask_ai(question)
    print("\nGenerated SQL Query:", sql_query)

    try:
        result_sql = pd.read_sql(sql_query, apt)
        print("\nQuery Result:")
        display(result_sql.head())
    except Exception as e:
        print("Error executing SQL:", e)

Welcome to MySQL Natural Language to Database Interface!
Type 'exit' to quit.



Enter your question:  How many apartments have 3 bedrooms and allow cats in NC?



Generated SQL Query: ```sql
SELECT
  COUNT(DISTINCT rental.general_info.id)
FROM rental.general_info
JOIN rental.property_details
  ON rental.general_info.id = rental.property_details.id
WHERE
  rental.property_details.bedrooms = 3 AND rental.property_details.pets_allowed LIKE '%Cats%' AND rental.general_info.state = 'NC';
```
Error executing SQL: unsupported format character 'C' (0x43) at index 259


Enter your question:  exit


Thank you and bye!


# Question lists:
1. What tables are in the rental database?
2. What columns are in the general_info table?
3. Show me 3 example rows from the property_details table.
4. What data is available about apartment prices?
5. Show me 5 entries from the media table in the aptadditional database.
6. What attributes does the pricing table have?
7. Show me 5 apartments in New York.
8. Find all apartments in California with more than 2 bedrooms.
9. Show 10 listings that allow pets.
10. List apartments in Texas with square footage greater than 1000.
11. Which apartments in Florida have 2 bathrooms and photos?
12. What’s the price of the cheapest apartment in Virginia?
13. Show me apartments with price between 1000 and 1500 dollars in NC.
14. What cities in the rental database have more than 100 listings? (GROUP BY)
15. List the average price of apartments grouped by state. (GROUP BY)
16. Which cities have average price over $2000? (HAVING)
17. Show me 5 most expensive listings in California. (ORDER BY)
18. List apartments with photos and pets allowed, ordered by price.
19. Which apartments have no amenities?
# Join 
20. Show titles and prices of all apartments in the general_info and pricing tables.
21. Find apartments with photos and show their price.
22. What amenities are available in apartments in Florida?
23. For each apartment, show the number of amenities it has.
24. List apartments with 3 bedrooms that have a gym.
25. Which apartments have prices listed in USD and have a photo?
# Data Modification
26. Add a new apartment listing in New York with 2 bedrooms and a price of 2000.
27. Update the price of apartment with id 5668639818 to 1800.
28. Delete the apartment in rental.general_info with id 5668639818.
29. Add a new media entry for apartment 5668639819 with has_photo as 'Thumbnail'.
30. Change the pets_allowed for apartment 5668639818 to 'None'.
31. Insert a new entry into pricing with id 6000000001, price 2200, and currency USD.

### MongoDB

In [24]:
# === MongoDB Connection ===
client = MongoClient("mongodb://localhost:27017/")

In [30]:
# === Custom Google LLM Wrapper ===
class Custom_GenAI:
    def __init__(self, API_KEY):
        self.client = genai.Client(api_key=API_KEY)

    def ask_ai(self, question):
        prompt = f"""
You are an assistant that translates natural language questions into valid PyMongo commands. 
Use the following databases and collections:

Database 1:  `rental`
- `general_info(id, title, cityname, state, latitude, longitude)`
  - `title` is a short description of the apartment
  - `state` uses U.S. state abbreviations like 'CA', 'NY', 'NC'

- `property_details(id, square_feet, bedrooms, bathrooms, pets_allowed)`
  - `pets_allowed` includes only: 'Cats,Dogs', 'No pets allowed', 'Cats', or 'Dogs'

- `amenities(amenity_id, amenity_name)`
- `property_amenities(id, amenity_id)`

Database 2: Database `price`
- `pricing(id, price, currency)`
- `price_details(id, price_display, price_type)`

Database 3: Database `aptadditional`
- `media(id, has_photo)`
- `sources(id, source, time)`

Assume MongoClient has been initialized as:
client = MongoClient("mongodb://localhost:27017/")

Your task is to choose the correct command format based on query complexity:

Supported commands:
- `.find(filter, projection)` for simple filtering
- `.aggregate([...])` for advanced operations using `$match`, `$group`, `$sort`, `$limit`, `$skip`, `$project`, `$lookup`
- `.insertOne(...)`, `.insertMany(...)`, `.updateOne(...)`, `.deleteOne(...)` for data modification

Do **not** use unsupported methods like `.count_documents()`, `.find_one()`, etc.

Always use the correct database and collection, and return a **single-line valid PyMongo command**. 
Return ONLY the PyMongo command — no explanations, no markdown, no line breaks.

Question: {question}
"""
        response = self.client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt,
        )
        return response.text.strip()

In [31]:
ai = Custom_GenAI(API_KEY)

# NLI Section
print("Welcome to MongoDB Natural Language to Database Interface!")
print("Type 'exit' to quit.\n")

while True:
    question = input("Enter your question: ")
    if question.lower() == "exit":
        print("Thank you and bye!")
        break

    mongo_query = ai.ask_ai(question)
    mongo_query = mongo_query.replace("```python", "").replace("```", "").strip()

    print("\nGenerated Mongo Query:", mongo_query)
    try:
        result_mongo = list(eval(mongo_query))
        print("\nMongoDB Query Result:")
        if len(result_mongo) == 1 and isinstance(result_mongo[0], dict):
            # print any key-value result
            for k, v in result_mongo[0].items():
                print(f"{k.replace('_', ' ').capitalize()}: {v}")
        elif result_mongo:
            for doc in result_mongo:
                print(doc)
    except Exception as e:
        print("Error executing MongoDB query:", e)

Welcome to MongoDB Natural Language to Database Interface!
Type 'exit' to quit.



Enter your question:  Show 3 apartments in NC with title only



Generated Mongo Query: client["rental"]["general_info"].find({"state": "NC"}, {"title": 1, "_id": 0}).limit(3)

MongoDB Query Result:
{'title': 'Three BR 3101 Morningside Drive'}
{'title': 'Three BR 5206 Moonlight Drive Trail Sw'}
{'title': 'Studio apartment 4016 Twickenham Court'}


Enter your question:  How many listings are there in NC?



Generated Mongo Query: client['rental']['general_info'].aggregate([{"$match": {"state": "NC"}}, {"$count": "total"}])

MongoDB Query Result:
Total: 6292


Enter your question:  exit


Thank you and bye!
