In [1]:
import os
from dotenv import load_dotenv
import google.generativeai as genai
import re
import pandas as pd
import json
import time

  from .autonotebook import tqdm as notebook_tqdm


## LLM Gemini model preparation

Please get your GEMINI_API_KEY in Google AI Studio first, then add it to `.env` file in the root directory in the same way shown in `.env_example` file.

In [2]:
# Load environment variables from .env file
load_dotenv()

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

Gemini 1.5 Flash

Rate Limits:

- 15 RPM (requests per minute)

- 1 million TPM (tokens per minute)

- 1,500 RPD (requests per day)

Price (input)

- Free of charge

In [3]:
# Create the model
# See https://ai.google.dev/api/python/google/generativeai/GenerativeModel
generation_config = {
    "temperature": 0,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 1024,
    "response_mime_type": "text/plain",
}

SYSTEM_PROMPT = """
You are a helpful assistant. You will analyze the given text and provide a JSON response indicating whether the text mentions or suggests certain things. Please follow these guidelines:
1. "listing_id": This is an exeption to the rule. Value of "listing_id" should be of type string, replaced and just copied from "listing_id" attribute.
2. "is_bathroom_shared": Check if there is any mention of a shared bathroom. Ignore mentions of other shared amenities like "shared kitchen" or "shared laundry". Look for keywords like "shared bathroom", "bathroom shared", "bathroom is shared", etc.
3. "is_kitchen_shared": Check if there is any mention of a shared kitchen. Ignore mentions of other shared amenities like "shared bathroom" or "shared laundry". Look for keywords like "shared kitchen", "kitchen shared", "kitchen is shared", etc.
4. "is_host_in_unit": Check if there is any mention of the host living in the same unit as the guest. Look for keywords like "we live in the unit", "we live in the other room", etc. If the text explicitly indicates that the host lives upstairs, downstairs, or in a separate building or unit, set this to false.
5. "is_host_on_prem": Check if there is any mention of the host living on premises. One clue would be an indication that the host owns a pet such as a dog. Look for keywords like "live on premises", "live in the same building", "live on-site", etc.
6. "shared_utilities": Check if there is any mention of shared utilities or facilities. Ignore mentions of other shared amenities like "shared kitchen" or "shared bathroom". Look for keywords like "shared laundry", "shared laundry room", "laundry shared", "laundry is shared", "shared electricity", "shared internet", "shared cable TV", etc.
7. "is_entrance_separate": Check if there is any mention of separate entrance. Look for keywords like "private entrance", "separate entrance", etc.
8. "other_guests_on_prem": Check if there is any mention of other guests living on the premises/house/apt/apartment or anything indicating that there will be other guests on the premises.
9. "host_profile_mentioned": Check if there is any mention of the host's profile being Superhost or similar detail.
10. "is_separate_suite": Check if there is any mention of private or separate suites/apartments. Look for keywords like "private suite", "separate suite", "separate apartments", etc.
11. "host_interaction": Check if there is any mention of limited or no interaction with the host.
12. "has_house_rules": Check if there is any mention of any specific rule indicating the host's presence. Ignore standard house rules such as check-in/checkout times and self check-in information like "check-in after 3pm", "checkout before 11am", "self check-in with lockbox", etc. Look for keywords like "noise", "noise levels", "no noise", "no parties", "quiet hours", "no pets", "no smoking", "limited time", "maximum", "minimum", "from 10am to 10pm", "until 10pm", "not allowed", etc.
13. "has_personal_touches": Check if there is any mention of language in the text suggesting personal touches.
14. "entire_property": Check if there is any mention of the property type being listed. Ignore mentions of other property types like "entire room", "private room" or "shared room". Look for keywords like "entire home", "entire house", "entire studio", "entire apt", "entire apartment", "entire condo", "entire penthouse", "whole house", "whole apartment", "whole studio", etc.
15. "includes_additional_service": Check if there is any mention of host offering services, therefore, indicating their presence. Look for keywords like "breakfast", "breakfast provided", "breakfast included", "meals provided", "meal provided", "meal included", "available upon request", "luggage drop-off", "cleaning available", etc.
16. "has_self_check_in": Check if there is any menition of guest self check-in. Look for keywords like "self check-in", "self check in", "self checkin", etc.

Respond with a JSON formatted output. The response should be a JSON object with its key being the listing ID, which can be found in the input text. No additional text, markdown, details or annotations required. Here's an example:
{
  "listing_id": "123456",
  "is_bathroom_shared": <true/false>,
  "is_kitched_shared": <true/false>,
  "is_host_in_unit": <true/false>,
  "is_host_on_prem": <true/false>,
  "shared_utilities": <true/false>, 
  ...
}
"""

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-latest",
    generation_config=generation_config,
    system_instruction=SYSTEM_PROMPT
)

## Data preparation

In [28]:
# Specify the path to your .jsonl file
file_path = '../../data/description_parsing/output.jsonl'

dtype_dict = {
    "listing_id": str
}

# Read the .jsonl file into a DataFrame
df = pd.read_json(file_path, lines=True, dtype=dtype_dict)

In [29]:
print(df.shape)
df.head()

(1331, 26)


Unnamed: 0,listing_id,checking_in_and_out_house_rule,during_your_stay_house_rule,bathroom_amenities,bedroom_and_laundry_amenities,entertainment_amenities,heating_and_cooling_amenities,home_safety_amenities,internet_and_office_amenities,kitchen_and_dining_amenities,...,the_space_description,guest_access_description,other_things_to_note_description,accommodation_type,before_you_leave_house_rule,scenic_views_amenities,family_amenities,registration_number_description,privacy_and_safety_amenities,during_your_stay_description
0,37294314,"[Check-in after 3:00 p.m., Checkout before 11:...","[2 guests maximum, No pets, No parties or even...","[Hair dryer, Shampoo, Hot water]","[Essentials, Hangers, Bed linens]",[TV with standard cable],[Heating],"[Smoke alarm, Carbon monoxide alarm, Fire exti...",[Wifi],"[Refrigerator, Microwave, Dishes and silverwar...",...,The newly constructed suite features a double/...,Have access to private suite and small area in...,I have a small dog (chihuahua yorkie).,Entire guest suite,,,,,,
1,699440821139041828,"[Check-in after 4:00 p.m., Checkout before 10:...","[4 guests maximum, Pets allowed]",,[Washer],[TV],[Indoor fireplace],"[Exterior security cameras on property, Smoke ...",[Wifi],[Kitchen],...,,,,Entire home,,,,,,
2,23793078,"[Check-in: 4:00 p.m.–12:00 a.m., Checkout befo...","[2 guests maximum, No pets, No parties or even...","[Hair dryer, Cleaning products, Shampoo, Condi...","[Washer, Dryer, Essentials, Hangers, Bed linen...","[TV with standard cable, Books and reading mat...","[Central air conditioning, Central heating, Po...","[Smoke alarm, Carbon monoxide alarm, Fire exti...","[Wifi, Dedicated workspace]","[Kitchen, Refrigerator, Microwave, Cooking bas...",...,This is a cozy one bedroom self-contained suit...,We will provide a pass code upon arrival. It's...,We are sharing this part of our home on a part...,Entire villa,,,,,,
3,909615095483961012,"[Check-in after 1:00 p.m., Checkout before 11:...","[5 guests maximum, No pets, Quiet hours: 11:00...","[Hair dryer, Cleaning products, Shampoo, Condi...","[Free washer – In unit, Free dryer – In unit, ...","[55-inch HDTV, Pool table, Books and reading m...","[Portable fans, Heating]","[Smoke alarm, Carbon monoxide alarm, Fire exti...","[Fast wifi – 284 Mbps, Dedicated workspace]","[Kitchen, Refrigerator, Microwave, Cooking bas...",...,Large suite full kitchen. Fridge stove dishwas...,The entire suite. Also advanced booking is for...,Private entrance. Number lock entry,Entire guest suite,"[Throw trash away, Turn things off, Additional...","[Garden view, Ocean view]",[Outlet covers],,,
4,21683753,"[Check-in after 6:00 p.m., Checkout before 1:0...","[2 guests maximum, No pets, Quiet hours: 11:00...","[Cleaning products, Dove shampoo, Dove condit...","[Essentials, Hangers, Bed linens, Extra pillow...","[32-inch HDTV with Amazon Prime Video, Netflix...","[Portable fans, Heating]","[Exterior security cameras on property, Smoke ...","[Wifi, Dedicated workspace]","[Refrigerator, Cooking basics, Dishes and silv...",...,"Private entrance, Office desk and chair to wor...","Private entrance. Good street parking, enter t...",The cooking space is set up with a rotisserie ...,Entire home,"[Throw trash away, Lock up]",,[Pack ’n play/Travel crib – available upon req...,,,


In [30]:
# Function to join columns with their headers
def join_columns_with_headers(row):
    return " \n ".join([f"{col.replace('_', ' ').capitalize()}:\n{row[col]}" 
                      for col in df.columns 
                      if col not in ['registration_number_description',
                                     'during_your_stay_house_rule',
                                     'before_you_leave_house_rule', 
                                     'bathroom_amenities', 
                                     'bedroom_and_laundry_amenities', 
                                     'entertainment_amenities', 
                                     'heating_and_cooling_amenities', 
                                     'home_safety_amenities', 
                                     'internet_and_office_amenities',
                                     'kitchen_and_dining_amenities', 
                                     'parking_and_facilities_amenities', 
                                     'not_included_amenities', 
                                     'scenic_views_amenities', 
                                     'outdoor_amenities',
                                     'family_amenities', 
                                     'privacy_and_safety_amenities']
                      ])

# Applying the function to each row
df['combined_description'] = df[['listing_id',
                                 'place_description', 
                                 'the_space_description', 
                                 'guest_access_description', 
                                 'other_things_to_note_description',
                                 'during_your_stay_description',
                                 'checking_in_and_out_house_rule',
                                 'accommodation_type', 
                                 'location_features_amenities',
                                 'services_amenities']
                                 ].apply(join_columns_with_headers, axis=1)

# Displaying the result
print(df[['listing_id', 'combined_description']])


               listing_id                               combined_description
0                37294314  Listing id:\n37294314 \n Checking in and out h...
1      699440821139041828  Listing id:\n699440821139041828 \n Checking in...
2                23793078  Listing id:\n23793078 \n Checking in and out h...
3      909615095483961012  Listing id:\n909615095483961012 \n Checking in...
4                21683753  Listing id:\n21683753 \n Checking in and out h...
...                   ...                                                ...
1326  1117045875626497107  Listing id:\n1117045875626497107 \n Checking i...
1327   744961251977035854  Listing id:\n744961251977035854 \n Checking in...
1328  1154699404234096988  Listing id:\n1154699404234096988 \n Checking i...
1329  1145329834522305779  Listing id:\n1145329834522305779 \n Checking i...
1330   583634744259641738  Listing id:\n583634744259641738 \n Checking in...

[1331 rows x 2 columns]


In [88]:
sample_combined_description = df.combined_description[0]

print(sample_combined_description)

Listing id:
37294314 
 Checking in and out house rule:
['Check-in after 3:00 p.m.', 'Checkout before 11:00 a.m.', 'Self check-in with lockbox'] 
 Location features amenities:
['Private entrance'] 
 Services amenities:
['Self check-in', 'Lockbox'] 
 Place description:
Newly created studio suite with private entrance and off street parking.  Large flat garden yard, with bbq and outdoor space for relaxing or eating. Close to downtown Vancouver and Stanley Park. Just off the highway that connects you to Whistler and the rest of the North Shore. 
 The space description:
The newly constructed suite features a double/full wall bed (pulls down sideways), a kitchenette (no stove). The fridge has drinks and has plenty of room for food. The shower is 36” x 36”. Television has access to cable and Netflix. 
 Guest access description:
Have access to private suite and small area in backyard. 
 Other things to note description:
I have a small dog (chihuahua yorkie). 
 Accommodation type:
Entire guest 

## Run Gemini to generate response

In [322]:
response = model.generate_content(sample_combined_description)

print(response.text)

{
  "listing_id": "37294314",
  "is_bathroom_shared": false,
  "is_kitchen_shared": false,
  "is_host_in_unit": false,
  "is_host_on_prem": true,
  "shared_utilities": false,
  "is_entrance_separate": true,
  "other_guests_on_prem": false,
  "host_profile_mentioned": false,
  "is_separate_suite": true,
  "host_interaction": false,
  "has_house_rules": true,
  "has_personal_touches": false,
  "entire_property": true,
  "includes_additional_service": false,
  "has_self_check_in": true
}



In [32]:
def clean_features(features):
    # Use regex to remove everything before the first { and after the last }
    cleaned_features = re.sub(r'^.*?({.*}).*$', r'\1', features.lower(), flags=re.DOTALL)
    return cleaned_features

In [48]:
cleaned_features = clean_features(response.text)

json.loads(cleaned_features)

NameError: name 'response' is not defined

In [96]:
print(type(df.combined_description))
for idx, row in df.combined_description.items():
    print(row)
    break

<class 'pandas.core.series.Series'>
Listing id:
37294314 
 Checking in and out house rule:
['Check-in after 3:00 p.m.', 'Checkout before 11:00 a.m.', 'Self check-in with lockbox'] 
 Location features amenities:
['Private entrance'] 
 Services amenities:
['Self check-in', 'Lockbox'] 
 Place description:
Newly created studio suite with private entrance and off street parking.  Large flat garden yard, with bbq and outdoor space for relaxing or eating. Close to downtown Vancouver and Stanley Park. Just off the highway that connects you to Whistler and the rest of the North Shore. 
 The space description:
The newly constructed suite features a double/full wall bed (pulls down sideways), a kitchenette (no stove). The fridge has drinks and has plenty of room for food. The shower is 36” x 36”. Television has access to cable and Netflix. 
 Guest access description:
Have access to private suite and small area in backyard. 
 Other things to note description:
I have a small dog (chihuahua yorkie).

In [104]:
llm_features = []
def extract_features(description):
    """Extracting features from first 100 description items using LLM"""
    
    limit = 0
    # iterating through descriptions and extracting features using LLM
    for idx, desc in description.items():
        # print(f'DESC: {desc}, TYPE: {type(desc)}')
        if limit >= 100:
            break
        try:
            response = model.generate_content(desc)
            # print(f'processing {desc}, response: {response.text}')
            cleaned_response = clean_features(response.text)
            llm_features.append(cleaned_response)
        except (AttributeError, ValueError) as e:
            print(f'Error processing item {description}')
            print(f'Exception thrown: {e}')
        
        except Exception as e:
            print(f'An expected error occurred with item {description}')
            print(f'Exception thrown: {e}')
        
        limit += 1
        time.sleep(10)
        
extract_features(df.combined_description)

## Save Gemini response into jsonl

In [101]:
def write_output_jsonl(file_path, data):
    """Writes a list of dictinoaries to JSONL file."""
    with open(file_path, 'a') as file: # open file and append
        for listing in data: # iterate through every listing
            json_item = json.loads(listing)
            json.dump(json_item, file)
            file.write('\n')

In [102]:
path = '../../data/description_parsing/feature_extraction.jsonl'
write_output_jsonl(path, llm_features)

## Adding accommodation type to complete output

In [86]:
def write_jsonl(file_path, data):
    """Writes a list of dictinoaries to JSONL file."""
    with open(file_path, 'a') as file: # open file and append
        for record in data: # iterate through every listing
            json.dump(record, file)
            file.write('\n')

In [18]:
path = '../../data/description_parsing/description_amenities_house_rules.jsonl'

filtered = []

raw_df = pd.read_csv('../../data/raw_data/raw_data.csv')
print(raw_df[raw_df['id'] == 36322781]['type'].values[0])
# print(raw_df.shape)
# raw_df.head()
with open(path, 'r') as file:
    for line in file:
        item = json.loads(line)
        result = raw_df[raw_df['id'] == int(item['listing_id'])]
        if len(result) > 0:
            # print(f"LISTING ID: {item['listing_id']}")
            # print(result['type'])
            item['accommodation_type'] = result['type'].values[0]
            # print(item)
            filtered.append(item)
            # write_jsonl('../../data/description_parsing/output.jsonl')
        else:
            print('empty')

print(filtered[0])

Entire guest suite
{'listing_id': '37294314', 'checking_in_and_out_house_rule': ['Check-in after 3:00 p.m.', 'Checkout before 11:00 a.m.', 'Self check-in with lockbox'], 'during_your_stay_house_rule': ['2 guests maximum', 'No pets', 'No parties or events', 'No smoking'], 'bathroom_amenities': ['Hair dryer', 'Shampoo', 'Hot water'], 'bedroom_and_laundry_amenities': ['Essentials', 'Hangers', 'Bed linens'], 'entertainment_amenities': ['TV with standard cable'], 'heating_and_cooling_amenities': ['Heating'], 'home_safety_amenities': ['Smoke alarm', 'Carbon monoxide alarm', 'Fire extinguisher', 'First aid kit'], 'internet_and_office_amenities': ['Wifi'], 'kitchen_and_dining_amenities': ['Refrigerator', 'Microwave', 'Dishes and silverware', 'Dishwasher', 'Coffee maker'], 'location_features_amenities': ['Private entrance'], 'outdoor_amenities': ['Backyard'], 'parking_and_facilities_amenities': ['Free parking on premises', 'Free street parking'], 'services_amenities': ['Self check-in', 'Lockbox

In [26]:
save_path = '../../data/description_parsing/output.jsonl'
write_jsonl(save_path, filtered)