In [1]:
from IPython.display import display, Markdown
import pandas as pd
import requests
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the clustered Airbnb listings from a CSV path relative to the working directory.
# Later cells read these columns from the frame: City, cluster, realSum, bedrooms,
# dist, metro_dist, guest_satisfaction_overall and room_type.
df = pd.read_csv("../ML-Exam/data/clustered_airbnb.csv")

In [3]:
# Define the LLaMA 3 client
class Llama3Client:
    """Small HTTP client that sends a single-turn chat prompt to ``{host}/api/chat``.

    The default host/port and payload shape look like an Ollama server
    (NOTE(review): confirm against the deployment actually used).

    Parameters
    ----------
    model : str
        Model name placed in the request payload.
    host : str
        Base URL of the server. A trailing slash is tolerated.
    timeout : float or None
        Seconds to wait for the server before giving up. ``None`` waits
        indefinitely, which was the behaviour before this parameter existed.
    """

    def __init__(self, model="llama3", host="http://localhost:11434", timeout=300):
        self.model = model
        self.timeout = timeout
        # Strip trailing slashes so "http://host/" does not become "http://host//api/chat".
        self.api_url = f"{host.rstrip('/')}/api/chat"

    def ask(self, prompt: str) -> str:
        """Send ``prompt`` as one user message and return the model's reply text.

        Never raises for transport or response-shape problems: any failure is
        reported as a string starting with ``"Error: "`` so notebook cells that
        render the result keep running.
        """
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False
        }
        try:
            # Without a timeout a stalled server would block the kernel forever.
            response = requests.post(self.api_url, json=payload, timeout=self.timeout)
            response.raise_for_status()
            return response.json()['message']['content'].strip()
        except requests.RequestException as e:
            return f"Error: {e}"
        except (KeyError, TypeError, ValueError, AttributeError) as e:
            # The server answered 2xx but the body was not the expected
            # {"message": {"content": "..."}} structure.
            return f"Error: unexpected response format ({e!r})"

# Format listings for display in the prompt
def format_listings(listings_df, nights):
    """Render every row of ``listings_df`` as one descriptive line of text.

    Each line lists the city, cluster, nightly price, bedrooms, distances to
    the centre and to the metro, the guest satisfaction score and the total
    price for ``nights`` nights (two decimals). Lines are joined with
    newlines; an empty frame produces an empty string.
    """
    def describe(record):
        # Cost of the whole stay at this listing's nightly rate.
        stay_cost = record['realSum'] * nights
        fragments = [
            f"{record['City']} (Cluster {record['cluster']}) - €{record['realSum']}/night",
            f"{record['bedrooms']} bedrooms",
            f"{record['dist']}km to center",
            f"{record['metro_dist']}km to metro",
            f"{record['guest_satisfaction_overall']} guest satisfaction",
            f"Total: €{stay_cost:.2f}",
        ]
        return ", ".join(fragments)

    return "\n".join(describe(record) for _, record in listings_df.iterrows())

In [4]:
# 1. Function for best value (all cities) using format_listings
def recommend_best_value(df, budget, nights=5, max_results=5):
    """Ask the LLM to pick the best listing across all cities within a budget.

    Listings whose price for ``nights`` nights is at most ``budget`` are
    ranked by guest satisfaction (highest first); the top ``max_results`` are
    formatted with ``format_listings`` and embedded in the prompt.

    ``df`` is left untouched: the affordability check is computed on the fly
    instead of being written into the caller's frame as a new column.

    Returns the text produced by ``Llama3Client.ask`` (the model's answer or
    the client's error string), or a fixed message when nothing is affordable.
    """
    within_budget = df[df['realSum'] * nights <= budget]
    filtered = within_budget.sort_values(by='guest_satisfaction_overall', ascending=False).head(max_results)

    if filtered.empty:
        return "No listings found within your budget."

    context = format_listings(filtered, nights)
    prompt = f"""
You are a travel assistant helping users choose the best Airbnb option from clustered data.

The user has a budget of €{budget} for {nights} nights. Below are some available listings:

{context}

For each listing, consider:
- Total price
- Distance to city center and metro
- Number of bedrooms
- Guest satisfaction
- Cluster number

Which listing would you recommend and why? Include all these aspects in your answer.
"""

    client = Llama3Client()
    return client.ask(prompt)

# Example query: a €500 budget for a five-night stay, searching every city
nights, budget = 5, 500

response_1 = recommend_best_value(df, budget=budget, nights=nights)

In [None]:
# Render the heading followed by the model's answer, both as Markdown
display(
    Markdown("### Recommendation (Best Value Overall):"),
    Markdown(response_1),
)

In [None]:
# 2. Function for a specific city using format_listings
def recommend_in_city(df, city, budget, nights=5, max_results=5):
    """Ask the LLM to pick the best listing in one city within a budget.

    The city match is case-insensitive. Listings whose price for ``nights``
    nights is at most ``budget`` are ranked by guest satisfaction (highest
    first); the top ``max_results`` are formatted with ``format_listings``
    and embedded in the prompt.

    ``df`` is left untouched: the affordability check is computed on the fly
    instead of being written into the caller's frame as a new column.

    Returns the text produced by ``Llama3Client.ask`` (the model's answer or
    the client's error string), or a fixed message when no listing matches.
    """
    in_city = df['City'].str.lower() == city.lower()
    affordable = df['realSum'] * nights <= budget
    filtered = df[in_city & affordable]
    filtered = filtered.sort_values(by='guest_satisfaction_overall', ascending=False).head(max_results)

    if filtered.empty:
        return f"No listings found in {city.title()} within your budget."

    context = format_listings(filtered, nights)
    prompt = f"""
You are a travel assistant helping users choose the best Airbnb option in {city.title()} from clustered data.

The user has a budget of €{budget} for {nights} nights. Below are some available listings in {city.title()}:

{context}

For each listing, consider:
- Total price
- Distance to city center and metro
- Number of bedrooms
- Guest satisfaction
- Cluster number

Based on value, distance, and cluster, which listing would you recommend and why?
"""
    client = Llama3Client()
    return client.ask(prompt)

# Example usage (you can replace these with input() calls or widgets in a notebook)
nights, budget, city = 5, 500, "Berlin"

response_2 = recommend_in_city(df, city, budget, nights=nights)

# Heading first, then the model's answer, both rendered as Markdown
display(
    Markdown(f"### Recommendation (City: {city.title()}):"),
    Markdown(response_2),
)


In [7]:
# 1. Function for best value (all cities) using .to_string()
def recommend_best_value(df, budget, nights=5, max_results=5):
    """Ask the LLM to pick the best listing across all cities, passing a raw table.

    Listings whose price for ``nights`` nights is at most ``budget`` are
    ranked by guest satisfaction (highest first); the top ``max_results`` rows
    are rendered with ``DataFrame.to_string`` (every column, plus a
    ``total_price`` column) and embedded in the prompt.

    ``df`` is left untouched: ``total_price`` is added to a derived frame via
    ``assign`` rather than written into the caller's frame.

    Returns the text produced by ``Llama3Client.ask`` (the model's answer or
    the client's error string), or a fixed message when nothing is affordable.
    """
    priced = df.assign(total_price=df['realSum'] * nights)
    filtered = priced[priced['total_price'] <= budget].sort_values(by='guest_satisfaction_overall', ascending=False).head(max_results)

    if filtered.empty:
        return "No listings found within your budget."

    context = filtered.to_string(index=False)

    prompt = f"""
You are a travel assistant helping users choose the best Airbnb option from clustered data.

The user has a budget of €{budget} for {nights} nights. Below are some available listings as a table:

{context}

Please evaluate each listing and recommend the best one. Be sure to consider:
- Total price
- Distance to city center and metro (dist & metro_dist)
- Number of bedrooms (bedrooms)
- Guest Satisfaction (guest_satisfaction_overall)
- Cluster

Explain your reasoning clearly using all available details and remember to mention which city the airbnb is located in.
"""
    client = Llama3Client()
    return client.ask(prompt)

In [None]:
# Re-run the cross-city query. When cells are executed top to bottom, this call
# resolves to the most recent recommend_best_value definition (the .to_string() variant).
response_1 = recommend_best_value(df, budget=500, nights=5)
# Render the returned text as Markdown
display(Markdown(response_1))


In [9]:
# 2. Function for a specific city using .to_string()
def recommend_in_city(df, city, budget, nights=5, max_results=5):
    """Ask the LLM to pick the best listing in one city, passing a raw table.

    The city match is case-insensitive. Listings whose price for ``nights``
    nights is at most ``budget`` are ranked by guest satisfaction (highest
    first); the top ``max_results`` rows are rendered with
    ``DataFrame.to_string`` (every column, plus a ``total_price`` column) and
    embedded in the prompt.

    ``df`` is left untouched: ``total_price`` is added to a derived frame via
    ``assign`` rather than written into the caller's frame.

    Returns the text produced by ``Llama3Client.ask`` (the model's answer or
    the client's error string), or a fixed message when no listing matches.
    """
    priced = df.assign(total_price=df['realSum'] * nights)
    filtered = priced[
        (priced['City'].str.lower() == city.lower()) &
        (priced['total_price'] <= budget)
    ].sort_values(by='guest_satisfaction_overall', ascending=False).head(max_results)

    if filtered.empty:
        return f"No listings found in {city.title()} within your budget."

    context = filtered.to_string(index=False)

    prompt = f"""
You are a travel assistant helping users choose the best Airbnb option in {city.title()}.

The user has a budget of €{budget} for {nights} nights. Below are the available listings in table format:

{context}

Please evaluate the listings and recommend the best one. Include reasoning based on:
- Total cost
- Distance to attractions
- Bedrooms
- Guest satisfaction
- Cluster number
"""
    client = Llama3Client()
    return client.ask(prompt)

In [10]:
# RAG-based cluster analysis functions
def create_cluster_documents(df):
    """Build one plain-text profile per cluster.

    For every distinct value in ``df['cluster']`` (ascending order) the
    profile lists the mean nightly price, guest satisfaction, distance to the
    centre, distance to the metro and bedroom count, followed by the
    percentage breakdown of room types and of cities within that cluster.

    Returns ``(docs, ids)``: two parallel lists holding the profile text and
    the cluster id it describes.
    """
    def share_lines(column):
        # One "- label: xx.x%" line per distinct value, in value_counts order.
        shares = column.value_counts(normalize=True).to_dict()
        return [f"- {label}: {fraction*100:.1f}%\n" for label, fraction in shares.items()]

    docs, ids = [], []

    for cluster in sorted(df['cluster'].unique()):
        members = df[df['cluster'] == cluster]

        pieces = [
            f"Cluster {cluster} characteristics:\n",
            f"Average price: €{members['realSum'].mean():.2f} per night\n",
            f"Average guest satisfaction: {members['guest_satisfaction_overall'].mean():.2f}/100\n",
            f"Average distance to city center: {members['dist'].mean():.2f}km\n",
            f"Average distance to metro: {members['metro_dist'].mean():.2f}km\n",
            f"Average number of bedrooms: {members['bedrooms'].mean():.2f}\n",
            "Room type distribution:\n",
            *share_lines(members['room_type']),
            "City distribution:\n",
            *share_lines(members['City']),
        ]

        docs.append("".join(pieces))
        ids.append(cluster)

    return docs, ids

In [11]:
# Build one text profile per cluster together with the parallel list of cluster ids
cluster_docs, cluster_ids = create_cluster_documents(df)

In [12]:
# Fit a TF-IDF vectorizer (English stop words removed) on the cluster documents.
# `document_vectors` has one row per entry in `cluster_docs`; both module-level
# names are read by the retrieval helper defined below.
vectorizer = TfidfVectorizer(stop_words='english')
document_vectors = vectorizer.fit_transform(cluster_docs)

def retrieve_relevant_docs(query, top_k=3):
    """Retrieve the cluster documents most similar to ``query``.

    The query is vectorised with the module-level ``vectorizer`` and compared
    to ``document_vectors`` by cosine similarity.

    Parameters
    ----------
    query : str
        Free-text query.
    top_k : int
        Maximum number of documents to return. Values of zero or below return
        empty results; values above the number of documents return them all.

    Returns
    -------
    tuple
        ``(docs, ids, scores)``: the matching entries of ``cluster_docs``,
        their ``cluster_ids`` and their similarity scores, best match first.
    """
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, document_vectors).flatten()
    # Reverse first, then slice from the front. Slicing `argsort()[-top_k:]`
    # returned *every* document for top_k == 0 because `[-0:]` is the full slice.
    top_indices = similarities.argsort()[::-1][:max(top_k, 0)]

    # Return both the documents and their cluster IDs
    return [cluster_docs[i] for i in top_indices], [cluster_ids[i] for i in top_indices], similarities[top_indices]

In [None]:
# Function to name clusters based on key traits
def name_clusters(df):
    """Generate and display a descriptive LLM-written name for each cluster.

    For every cluster, each feature group's mean z-score (cluster mean versus
    the whole dataset) is computed; the three groups with the largest absolute
    score are passed to the model as the cluster's distinctive traits,
    together with the cluster's text profile from ``create_cluster_documents``.
    One request is sent per cluster and each answer is rendered as Markdown.

    Features missing from ``df`` are ignored. Features whose dataset-wide
    standard deviation is zero or undefined are ignored too, because their
    z-score is not defined and would otherwise inject NaN into the ranking.

    Returns None; output is produced via ``display``.
    """
    # Get cluster documents
    cluster_docs, cluster_ids = create_cluster_documents(df)
    # Define feature groups for cluster analysis
    feature_groups = {
        'Distance': ['dist', 'metro_dist'],
        'Accommodation': ['bedrooms', 'person_capacity'], 
        'Quality': ['cleanliness_rating', 'guest_satisfaction_overall'],
        'Price': ['realSum'],
        'Superhost': ['host_is_superhost_bool']
    }

    # Dataset-wide mean/std do not depend on the cluster, so compute them once.
    global_stats = {}
    for features in feature_groups.values():
        for feature in features:
            if feature not in df.columns:
                continue
            std = df[feature].std()
            if np.isfinite(std) and std != 0:
                global_stats[feature] = (df[feature].mean(), std)

    # Create descriptive names for each cluster
    client = Llama3Client()

    # cluster_docs and cluster_ids are parallel lists, so iterate them together.
    for cluster_id, doc in zip(cluster_ids, cluster_docs):
        cluster_data = df[df['cluster'] == cluster_id]

        # Calculate feature group averages
        group_stats = {}
        for group_name, features in feature_groups.items():
            z_scores = []
            for feature in features:
                if feature not in global_stats:
                    continue
                overall_mean, overall_std = global_stats[feature]
                z = (cluster_data[feature].mean() - overall_mean) / overall_std
                # A cluster with no valid values for this feature yields NaN; skip it.
                if np.isfinite(z):
                    z_scores.append(z)
            if z_scores:
                group_stats[group_name] = np.mean(z_scores)

        # Sort groups by absolute z-score to find most distinctive traits
        distinctive_groups = sorted(group_stats.items(), key=lambda x: abs(x[1]), reverse=True)
        top_groups = [g[0] for g in distinctive_groups[:3]]

        prompt = f"""[Task]
        Create a concise, memorable name for an Airbnb property cluster that captures its key characteristics.
        
        [Target]
        - Primary: Potential Airbnb guests looking for specific property types
        - Secondary: Property managers seeking to position their listings
        
        [Tone]
        Professional yet approachable, using clear and appealing language suitable for property listings
        
        [Technical Details]
        1. Output Format: Return ONLY a cluster name following this exact pattern:
           "Descriptive Name ({', '.join(top_groups)})"
           Example: "Urban Luxury Oasis (Distance: 2.1 km to center, Quality: 4.8 rating, Price: $250 per night)"
        
        2. Key Data Points:
           - Distinctive feature groups: {', '.join(top_groups)}
           - Cluster characteristics:
             {doc}
        
        3. Requirements:
           - Name must be immediately understandable
           - Include specific metrics for each feature group
           - Accurately reflect the data-driven grouping
        """

        cluster_name = client.ask(prompt)
        display(Markdown(f"**Cluster {cluster_id}:** {cluster_name}"))

# Generate and display an LLM-written name for every cluster (one model request per cluster)
name_clusters(df)