In [106]:
import os
from dotenv import load_dotenv
import google.generativeai as genai
import re
import pandas as pd
import json

## LLM Gemini model preparation

Please get your GEMINI_API_KEY in Google AI Studio first, then add it to `.env` file in the root directory in the same way shown in `.env_example` file.

In [98]:
# Load environment variables from .env file
load_dotenv()

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

Gemini 1.5 Flash

Rate Limits:

- 15 RPM (requests per minute)

- 1 million TPM (tokens per minute)

- 1,500 RPD (requests per day)

Price (input)

- Free of charge

In [261]:
# Create the model
# See https://ai.google.dev/api/python/google/generativeai/GenerativeModel
generation_config = {
    "temperature": 0,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 1024,
    "response_mime_type": "text/plain",
}

SYSTEM_PROMPT = """
You are a helpful assistant. You will analyze the given text and provide a JSON response indicating whether the text mentions or suggests certain things. Please follow these guidelines:
1. "is_bathroom_shared": Check if there is any mention of a shared bathroom. Ignore mentions of other shared amenities like "shared kitchen" or "shared laundry". Look for keywords like "shared bathroom", "bathroom shared", "bathroom is shared", etc.
2. "is_kitchen_shared": Check if there is any mention of a shared kitchen. Ignore mentions of other shared amenities like "shared bathroom" or "shared laundry". Look for keywords like "shared kitchen", "kitchen shared", "kitchen is shared", etc.
3. "is_host_in_unit": Check if there is any mention of the host living in the same unit, apartment or house as the guest. Look for keywords like "live in the unit", "live next door", etc.
4. "is_host_on_prem": Check if there is any mention of the host living on premises. Look for keywords like "live on premises", "live in the same building", "live on-site", etc.
5. "shared_utilities": Check if there is any mention of shared utilities or facilities. Ignore mentions of other shared amenities like "shared kitchen" or "shared bathroom". Look for keywords like "shared laundry", "shared laundry room", "laundry shared", "laundry is shared", "shared electricity", "shared internet", "shared cable TV", etc.
6. "is_entrance_separate": Check if there is any mention of separate entrance. Look for keywords like "private entrance", "separate entrance", etc.
7. "other_guests_on_prem": Check if there is any mention of other guests living on the premises/house/apt/apartment or anything indicating that there will be other guests on the premises.
8. "host_profile_mentioned": Check if there is any mention of the host's profile being Superhost or similar detail.
9. "property_separate_suite": Check if there is any mention of private or separate suites/apartments. Look for keywords like "private suite", "separate suite", "separate apartments", etc.
10. "host_interaction": Check if there is any mention of limited or no interaction with the host.
11. "house_rules_host_indication": Check if there is any mention of any specific rule indicating the host's presence. Look for keywords like "noise", "noise levels", "no noise", "curfew", "limited time" "from 10am to 10pm", "until 10pm", etc.
12. "listing_personal_touches": Check if there is any mention of language in the text suggesting personal touches.
13. "entire_property": Check if there is any mention of the property type being listed. Ignore mentions of other property types like "entire room", "private room" or "shared room". Look for keywords like "entire home", "entire house", "entire studio", "entire apt", "entire apartment", "entire condo", "entire penthouse", "whole house", "whole apartment", "whole studio", etc.
14. "host_additional_services": Check if there is any mention of host offering services, therefore, indicating their presence. Look for keywords like "breakfast", "breakfast provided", "breakfast included", "meals provided", "meal provided", "meal included", "available upon request", etc.
15. "has_self_check_in": Check if there is any menition of guest self check-in. Look for keywords like "self check-in", "self check in", "self checkin", etc.

Respond with a JSON formatted output. No additional text, markdown, details or annotations required:
{
  "is_bathroom_shared": <true/false>,
  "is_kitched_shared": <true/false>,
  "is_host_on_prem": <true/false>,
  "other_guests_on_prem": <true/false>, 
  ...
}
"""

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-latest",
    generation_config=generation_config,
    system_instruction=SYSTEM_PROMPT
)

## Data preparation

In [262]:
# Specify the path to your .jsonl file
file_path = '../../data/description_parsing/descriptions_output_burnaby.jsonl'

# Read the .jsonl file into a DataFrame
df = pd.read_json(file_path, lines=True)

In [263]:
print(df.shape)
df.head()

(297, 7)


Unnamed: 0,listing_id,place_description,the_space_description,guest_access_description,registration_number_description,other_things_to_note_description,during_your_stay_description
0,36007359,* New renovated studio<br />* Separate entranc...,"Studio enough for 2 peoples with appliances, s...",Self check in.<br />Access with smart lock code.,00180401,,
1,16396887,* New renovated 1 bedroom ground level suite.<...,It's an amazing private space that is good for...,"High speed Wi-Fi, Paid Cable TV, Netflix movie...",,* ENJOY FREE gifts inside basket on the counte...,
2,944162829781112704,Welcome to our brand new and beautiful apartme...,You will enjoy the quiet and private room with...,This is a private big bedroom with a private b...,,1) The private bathroom is not inside the room...,
3,1045877421386474496,"Enjoy a luxury, comfortable stay in this brigh...",,,00180521,,
4,13131513,Enjoy living in Vancouver’s premier neighbourh...,"Freshly painted walls, designer sheets, heated...",During your trip you will have unfettered acce...,24-160094,.,Our host guest relationship is of the utmost i...


In [264]:
# Function to join columns with their headers
def join_columns_with_headers(row):
    return " \n ".join([f"{col.replace('_', ' ').capitalize()}:\n{row[col]}" 
                      for col in df.columns 
                      if col not in ['listing_id', 'registration_number_description']
                      ])

# Applying the function to each row
df['combined_description'] = df[['place_description', 
                                 'the_space_description', 
                                 'guest_access_description', 
                                 'other_things_to_note_description',
                                 'during_your_stay_description']
                                 ].apply(join_columns_with_headers, axis=1)

# Displaying the result
print(df[['listing_id', 'combined_description']])


              listing_id                               combined_description
0               36007359  Place description:\n* New renovated studio<br ...
1               16396887  Place description:\n* New renovated 1 bedroom ...
2     944162829781112704  Place description:\nWelcome to our brand new a...
3    1045877421386474496  Place description:\nEnjoy a luxury, comfortabl...
4               13131513  Place description:\nEnjoy living in Vancouver’...
..                   ...                                                ...
292  1092272050579725440  Place description:\nOur cute studio suite is w...
293  1129809878062596864  Place description:\nForget your worries in thi...
294  1084101859863696256  Place description:\nComing to a grand home, co...
295             50795954  Place description:\nWelcome at this peaceful a...
296  1134596079356454016  Place description:\nLocated in Desirable Locat...

[297 rows x 2 columns]


In [265]:
sample_combined_description = df.combined_description[3]

print(sample_combined_description)

Place description:
Enjoy a luxury, comfortable stay in this bright, family friendly, safe and central neighbourhood. <br /><br />Bed Size: Full Double<br /><br />Walking distance to transit, trails, parks, grocery stores, Kensington Plaza + much more! A 20 minute drive to downtown and just a 5 minute drive to The Amazing Brentwood Mall.<br /><br />Walking distance (across the street) to Bus Routes to SFU + BCIT: Bus #144 + R5<br /><br />SFU : 6 minute drive<br />BCIT: 12 minute drive.<br /><br />Plenty of street parking available. EV charging available upon request. 
 The space description:
nan 
 Guest access description:
nan 
 Other things to note description:
nan 
 During your stay description:
nan


## Run Gemini to generate response

In [266]:
response = model.generate_content(sample_combined_description)

print(response.text)

{
  "is_bathroom_shared": false,
  "is_kitchen_shared": false,
  "is_host_in_unit": false,
  "is_host_on_prem": false,
  "shared_utilities": false,
  "is_entrance_separate": false,
  "other_guests_on_prem": false,
  "host_profile_mentioned": false,
  "property_separate_suite": false,
  "host_interaction": false,
  "house_rules_host_indication": false,
  "listing_personal_touches": false,
  "entire_property": false,
  "host_additional_services": true,
  "has_self_check_in": false
}


In [191]:
def clean_features(features):
    # Use regex to remove everything before the first { and after the last }
    cleaned_features = re.sub(r'^.*?({.*}).*$', r'\1', features.lower(), flags=re.DOTALL)
    return cleaned_features

In [192]:
cleaned_features = clean_features(response.text)

json.loads(cleaned_features)

{'is_bathroom_shared': False,
 'is_kitchen_shared': False,
 'is_host_on_prem': False,
 'shared_utilities': True,
 'is_entrance_separate': True,
 'other_guests_on_prem': False,
 'host_profile_mentioned': False,
 'property_separate_suite': True,
 'host_interaction': True,
 'house_rules_host_indication': True,
 'listing_personal_touches': True,
 'entire_property': True,
 'host_additional_services': False,
 'has_self_check_in': True}