# L4: Boosting


<p style="background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px"> ⏳ <b>Note <code>(Kernel Starting)</code>:</b> This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.</p>


In [None]:
# Warning control
import warnings

# Suppress all warning messages so they do not clutter the notebook output
warnings.filterwarnings('ignore')

In [None]:
import custom_utils  # import the project's custom helper library, custom_utils

<p style="background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px"> 💻 &nbsp; <b>Access <code>requirements.txt</code> and <code>utils</code> files:</b> To access <code>requirements.txt</code> for this notebook, 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Open"</em>. For more help, please see the <em>"Appendix - Tips and Help"</em> Lesson.</p>

## Data Loading

In [None]:
from datasets import load_dataset
import pandas as pd

# Stream the "train" split and keep only the first 100 records.
dataset = load_dataset(
    "MongoDB/airbnb_embeddings",
    streaming=True,
    split="train",
).take(100)

# Materialise the streamed records as a pandas DataFrame.
dataset_df = pd.DataFrame(dataset)

# Preview the first five rows.
dataset_df.head(5)

In [None]:
# Print the DataFrame's column names
print("Columns:", dataset_df.columns)

## Document Modelling

In [None]:
# Process the records with the custom_utils helper and keep the result in `listings`
listings = custom_utils.process_records(dataset_df)

## Database Creation and Connection

In [None]:
# Connect to the database through custom_utils and unpack the database and collection objects
db, collection = custom_utils.connect_to_database()

In [None]:
# Delete every existing record in the collection (the empty filter matches all documents)
collection.delete_many({})

## Data Ingestion

In [None]:
# Insert the processed records into the collection
collection.insert_many(listings)
print("Data ingestion into MongoDB completed")

## Vector Search Index Definition

In [None]:
# Create the vector search index with filter support on the collection
custom_utils.setup_vector_search_index_with_filter(collection=collection)

<p style="background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px"> ⏳ <b>Note:</b> If the output of the previous cell is <code>Error creating vector search index: Duplicate Index</code> you may proceed to the next cell if you intend to still use a previously created index.</p>

## Handling User Query

In [None]:
from pydantic import BaseModel
from typing import Optional
import custom_utils

class SearchResultItem(BaseModel):
    """Schema that each vector search result is validated against before display."""
    name: str  # listing name
    accommodates: Optional[int] = None  # number of guests the listing accommodates (optional)
    address: custom_utils.Address  # address information
    averageReviewScore: Optional[float] = None  # average review score (optional)
    number_of_reviews: Optional[float] = None  # number of reviews (optional)
    combinedScore: Optional[float] = None  # combined score (optional)

In [None]:
from IPython.display import display, HTML

def handle_user_query(query, db, collection, stages=None, vector_index="vector_index_text"):
    """
    Answer a user query from vector search results and show the supporting rows.

    Runs ``custom_utils.vector_search_with_filter``, validates every hit against
    ``SearchResultItem``, asks the chat model for a recommendation based on the
    resulting table, prints the question and the answer, and renders the table
    as HTML in the notebook.

    Args:
        query (str): The user's query string.
        db: Database object, passed through to the vector search helper.
        collection: MongoDB collection to search.
        stages (list, optional): Extra aggregation stages to include in the
            pipeline. Defaults to no extra stages.
        vector_index (str): Name of the vector index. Defaults to
            "vector_index_text".

    Returns:
        str: The system response, or "No results found." when the search
        returns nothing.
    """
    # Build a fresh list per call; a literal `[]` default would be shared
    # between every invocation of this function.
    if stages is None:
        stages = []

    # Run the vector search
    get_knowledge = custom_utils.vector_search_with_filter(query, db, collection, stages, vector_index)

    # Bail out early when nothing matched; always return a plain string so the
    # return type is the same on every path.
    if not get_knowledge:
        return "No results found."

    print("List of all fields of the first document, before model conformance")
    print(get_knowledge[0].keys())

    # Validate each raw result against the SearchResultItem model
    search_results_models = [
        SearchResultItem(**result)
        for result in get_knowledge
    ]

    # Convert to a DataFrame for nicer rendering in Jupyter
    search_results_df = pd.DataFrame([item.dict() for item in search_results_models])

    # Generate the system response with an OpenAI chat completion
    completion = custom_utils.openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are an Airbnb listing recommendation system."
            },
            {
                "role": "user",
                "content": f"Answer this user query: {query} with the following context:\n{search_results_df}"
            }
        ]
    )
    system_response = completion.choices[0].message['content']

    # Print the user question and the system response
    print(f"- User Question:\n{query}\n")
    print(f"- System Response:\n{system_response}\n")

    # Show the source rows as an HTML table
    display(HTML(search_results_df.to_html()))

    # Return the system response as a string
    return system_response

## Boosting Search Results After Vector Search

In [None]:
# Aggregation stage that adds the average of the per-category review scores
# and exposes the review count as a boost factor.
_review_score_fields = [
    "$review_scores.review_scores_accuracy",
    "$review_scores.review_scores_cleanliness",
    "$review_scores.review_scores_checkin",
    "$review_scores.review_scores_communication",
    "$review_scores.review_scores_location",
    "$review_scores.review_scores_value",
]

review_average_stage = {
    "$addFields": {
        # Sum the category scores, then divide by how many categories there are.
        "averageReviewScore": {
            "$divide": [{"$add": _review_score_fields}, len(_review_score_fields)]
        },
        # Boost factor taken directly from the number of reviews.
        "reviewCountBoost": "$number_of_reviews",
    }
}

In [None]:
# Example formula: combinedScore = 0.9 * averageReviewScore + 0.1 * reviewCountBoost.
_weighted_review_score = {"$multiply": ["$averageReviewScore", 0.9]}  # weighted average review score
_weighted_review_count = {"$multiply": ["$reviewCountBoost", 0.1]}  # weighted review count boost

weighting_stage = {
    "$addFields": {
        "combinedScore": {"$add": [_weighted_review_score, _weighted_review_count]},
    }
}

In [None]:
# Aggregation stage that orders documents by combinedScore, highest first (-1 = descending).
sorting_stage_sort = {"$sort": {"combinedScore": -1}}

In [None]:
# Extra aggregation stages, in order: compute the average review score,
# derive the weighted combined score, then sort on it.
additional_stages = [
    review_average_stage,
    weighting_stage,
    sorting_stage_sort,
]

## Results

In [None]:
query = """
I want to stay in a place that's warm and friendly, 
and not too far from restaurants, can you recommend a place? 
Include a reason as to why you've chosen your selection.
"""

# Handle the user query with the extra boosting stages and capture the response
response = handle_user_query(
    query, 
    db, 
    collection, 
    additional_stages, 
    vector_index="vector_index_with_filter"
)
# Bare last expression so the notebook displays the response
response
