In [1]:
import os
from dotenv import load_dotenv
import google.generativeai as genai
import re
import pandas as pd
import json

## LLM Gemini model preparation

Please get your GEMINI_API_KEY from Google AI Studio first, then add it to the `.env` file in the root directory, following the format shown in the `.env_example` file.

In [2]:
# Read key/value pairs from the .env file into the process environment.
load_dotenv()

# Register the API key with the Gemini client library; the lookup raises
# KeyError when GEMINI_API_KEY is not set.
genai.configure(
    api_key=os.environ["GEMINI_API_KEY"],
)

Gemini 1.5 Flash

Rate Limits:

- 15 RPM (requests per minute)

- 1 million TPM (tokens per minute)

- 1,500 RPD (requests per day)

Price (input):

- Free of charge

In [3]:
# Model setup: generation settings handed to the Gemini model created below.
# API reference: https://ai.google.dev/api/python/google/generativeai/GenerativeModel
generation_config = dict(
    temperature=0,
    top_p=0.95,
    top_k=64,
    max_output_tokens=1024,
    response_mime_type="text/plain",
)

# System instruction for the model: it lists the boolean features to extract
# from a listing description (key -> question) and then shows the expected
# reply layout.
# The reply example deliberately uses Python-style True/False; the keys in the
# example must match the schema keys above it exactly, because a misspelled
# example key invites the model to echo the misspelling in its answer.
SYSTEM_PROMPT = """
You are an AI block of data enriching pipeline. I need you to answer if following description has specific details. 
Provide boolean responses in json format with the following keys:
{
    is_bathroom_shared: "is shared bathroom mentioned",
    is_kitchen_shared: "is shared kitchen mentioned",
    is_host_on_prem: "is host living on premises",
    shared_utilities: "are shared utilities or facilities like laundry rooms mentioned",
    is_entrance_separate: "is separate entrance mentioned",
    other_guests_on_prem: "are other guests going to live on the premises",
    host_profile_mentions: "does host's profile mention they are a Superhost or similar detail",
    property_separate_suite: "is there mention of private or separate guest suites/apartments",
    host_interaction: "does the listing mention limited or no interaction with the host",
    house_rules_host_indication: "are there specific rules indicating host presence (e.g., noise levels, curfews)",
    listing_personal_touches: "does the listing language suggest personal touches",
    entire_property: "is the property type listed as Entire home/apt or Private room",
    host_additional_services: "does the host offer services like breakfast indicating presence"
}

Response with only Json! No additional text or markdown! only True or False
Expecting output like that:
{
  is_bathroom_shared: True,
  is_kitchen_shared: True,
  is_host_on_prem: False,
  other_guests_on_prem: False, 
  ...
}
"""

# Instantiate the Gemini Flash model with the system prompt and the generation
# settings defined earlier in this cell.
model = genai.GenerativeModel(
    system_instruction=SYSTEM_PROMPT,
    generation_config=generation_config,
    model_name="gemini-1.5-flash-latest",
)

## Data preparation

In [4]:
# Location of the JSON Lines file holding the listing descriptions.
file_path = "../../data/description_parsing/descriptions_output.jsonl"

# lines=True tells pandas to parse one JSON record per line of the file.
df = pd.read_json(path_or_buf=file_path, lines=True)

In [5]:
# Report (rows, columns), then preview the first five records.
print(df.shape)
df.head(n=5)

(2184, 7)


Unnamed: 0,listing_id,place_description,the_space_description,guest_access_description,other_things_to_note_description,registration_number_description,during_your_stay_description
0,36322781,Private basement suite in a quiet neighbourhoo...,Electronic Door Lock<br />Heated floors<br />C...,The Suite and backyard if requested.,"My wife, myself, and 3 young children live ups...",24-157781,
1,1152216438211720832,Enjoy a stylish experience at this centrally l...,,,,24-159950,
2,21356520,The room is bright with big window . Very quit...,Very clean and bright.,Guest can access my kitchen if no one using th...,The room is spacious with closet and study tab...,24-157581,I like socializing with the guest. I’m availa...
3,1064160331460880512,Centrally located suite with self check-in and...,"Main bedroom, with a queen-sized bed and a com...",Your suite is the ground floor of our home. Yo...,"We have a 10-year-old Shih-tzu dog named Byte,...",24-172645,
4,1034049419047464832,License # 23 031530<br /><br />Have a long lay...,Comfortable bedroom with a bay window over loo...,"Shared washroom, living room, beautiful garden...","We have dogs, cat and hedgehogs, if you are al...",23-031530,we live on the ground floor so always here to ...


In [6]:
# Function to join columns with their headers
def join_columns_with_headers(row):
    """Join a row's description fields into one text block with readable headers.

    Every field is rendered as its header, a colon, a line break and the field
    value. The header is the column name with underscores replaced by spaces
    and its first letter capitalised. Fields are joined with a
    space-newline-space separator, in the row's own column order.

    The columns are taken from the row itself rather than from the global
    DataFrame. The function therefore keeps working when the cell is re-run
    after new columns (such as the combined text) have been added to the
    frame, and it can be applied to any other frame or column subset.

    Args:
        row: pandas Series (one DataFrame row) keyed by column name.

    Returns:
        str: the combined text. The identifier columns ``listing_id`` and
        ``registration_number_description`` are skipped when present. Values
        are formatted as-is, so missing values appear as their string form.
    """
    # Identifier columns carry no descriptive text for the model.
    excluded_columns = ('listing_id', 'registration_number_description')
    return " \n ".join(
        f"{col.replace('_', ' ').capitalize()}:\n{value}"
        for col, value in row.items()
        if col not in excluded_columns
    )

# Combine only the free-text description columns into one text per listing.
df['combined_description'] = df.loc[
    :,
    [
        'place_description',
        'the_space_description',
        'guest_access_description',
        'other_things_to_note_description',
        'during_your_stay_description',
    ],
].apply(join_columns_with_headers, axis=1)

# Show each listing id next to its combined text.
print(df.loc[:, ['listing_id', 'combined_description']])


               listing_id                               combined_description
0                36322781  Place description:\nPrivate basement suite in ...
1     1152216438211720832  Place description:\nEnjoy a stylish experience...
2                21356520  Place description:\nThe room is bright with bi...
3     1064160331460880512  Place description:\nCentrally located suite wi...
4     1034049419047464832  Place description:\nLicense # 23 031530<br /><...
...                   ...                                                ...
2179             19744355  Place description:\nNote: check-in is at 6pm a...
2180   873041376969359872  Place description:\nYou'll have a great time a...
2181   914167814583406464  Place description:\nEnjoy a 1 bedroom 1 bathro...
2182   950905428130826624  Place description:\nPrivate room with queen be...
2183   941828367326510592  Place description:\nMake yourself at home to o...

[2184 rows x 2 columns]


In [7]:
# Use the first listing's combined text as the sample prompt and print it in full.
sample_combined_description = df['combined_description'][0]
print(sample_combined_description)

Place description:
Private basement suite in a quiet neighbourhood close to Nanaimo Skytrain station.<br />-10min to Downtown Vancouver<br />-10min to GM Place/ BC Place (Concerts, Events, Sporting events) <br />-10min to Metrotown Shopping Center<br />-15min to BCIT via bus<br /><br />-10min walk to Troute lake <br />-3min walk to Small Grocery store<br /><br />Flexible Check-In times<br />Plenty of street parking 
 The space description:
Electronic Door Lock<br />Heated floors<br />Cozy and Comfortable basement suite<br />Not the largest but great for travelers who are exploring the city.  Perfect for short stays and adventurous guests.<br /><br />***Guests staying indoors the majority of their stay may find the suite small** <br />-Sofa is too small to sleep on. <br />-Not ideal if you have a lot of luggage** 
 Guest access description:
The Suite and backyard if requested. 
 Other things to note description:
My wife, myself, and 3 young children live upstairs. Tenants next door. Occ

## Run Gemini to generate response

In [8]:
# Send the sample description to the model and print the raw reply text.
response = model.generate_content(contents=sample_combined_description)
print(response.text)

{
  "is_bathroom_shared": False,
  "is_kitchen_shared": False,
  "is_host_on_prem": True,
  "shared_utilities": True,
  "is_entrance_separate": True,
  "other_guests_on_prem": True,
  "host_profile_mentions": False,
  "property_separate_suite": True,
  "host_interaction": False,
  "house_rules_host_indication": True,
  "listing_personal_touches": False,
  "entire_property": False,
  "host_additional_services": False
}


In [9]:
def clean_features(features):
    """Lower-case a model reply and keep only its outermost brace-delimited span.

    Lower-casing turns Python-style ``True``/``False`` into ``true``/``false``.
    The span kept runs from the first ``{`` to the last ``}``. When the text
    has no such span, the lower-cased text is returned unchanged.
    """
    lowered = features.lower()
    # The pattern is anchored at the start and, with DOTALL, consumes the whole
    # text, so group 1 is exactly what remains after trimming both sides.
    outer_braces = re.match(r'^.*?({.*}).*$', lowered, flags=re.DOTALL)
    if outer_braces is None:
        return lowered
    return outer_braces.group(1)

In [10]:
# Normalise the raw reply text, then parse it; the parsed dict is the cell's output.
cleaned_features = clean_features(features=response.text)

json.loads(cleaned_features)

{'is_bathroom_shared': False,
 'is_kitchen_shared': False,
 'is_host_on_prem': True,
 'shared_utilities': True,
 'is_entrance_separate': True,
 'other_guests_on_prem': True,
 'host_profile_mentions': False,
 'property_separate_suite': True,
 'host_interaction': False,
 'house_rules_host_indication': True,
 'listing_personal_touches': False,
 'entire_property': False,
 'host_additional_services': False}