In [1]:
# from langchain.globals import set_verbose

# set_verbose(True)

In [None]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chains import LLMChain

# 示例对：可继续扩展
examples = [
    {
        "activity": "I was reading about climate change and polar bears",
        "queries": '["effects of climate change", "polar bear habitat loss", "how melting ice affects arctic animals"]'
    },
    {
        "activity": "I watched videos about different types of cats",
        "queries": '["funny cat videos", "maine coon vs ragdoll", "top 10 cat breeds", "cat behavior explained"]'
    },
    {
        "activity": "I was looking at cooking tutorials",
        "queries": '["easy pasta recipes", "how to make fried rice", "best kitchen tools 2024"]'
    }
]


In [3]:
# 单个示例的格式
example_prompt = PromptTemplate(
    input_variables=["activity", "queries"],
    template="User activity: {activity}\nOutput: {queries}\n"
)

# Few-shot Prompt：拼接 prefix + 多个示例 + suffix（当前问题）
search_query_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=(
        "You are an assistant that generates realistic and helpful human search queries "
        "based on what a user says they were doing.\n"
        "Do NOT just repeat the activity.\n"
        "Generate natural, creative, and diverse search engine queries.\n"
        "Only return a Python list of strings. No extra explanation.\n\n"
    ),
    suffix="User activity: {activity}\nOutput:",
    input_variables=["activity"]
)


In [4]:
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
import os

# 初始化 LLM（你本地已设置 .env）
load_dotenv()
llm = ChatOpenAI(model_name="gpt-4o", openai_api_key=os.getenv("OPENAI_API_KEY"))

# 构建链
search_query_chain = LLMChain(llm=llm, prompt=search_query_prompt)


  llm = ChatOpenAI(model_name="gpt-4o", openai_api_key=os.getenv("OPENAI_API_KEY"))
  search_query_chain = LLMChain(llm=llm, prompt=search_query_prompt)


In [5]:
activity = "I want to recreate my activity where I read cat-related articles and watched funny cat videos at April 12, 2024"

response = search_query_chain.run(activity=activity)
print(response)


  response = search_query_chain.run(activity=activity)


["latest cat memes April 2024", "cat behavior articles April 2024", "funny cat video compilations", "trending cat videos April 2024"]


## Download Test

In [6]:
import sys
import os

# Step 1: Add project root to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# Step 2: Import from insertion package
from insertion.downloadfiles import download_file

# (Optional) Example usage
# downloadfiles.some_function()


In [7]:
# change to list
import ast

keywords_list = ast.literal_eval(response)
print(keywords_list)

['latest cat memes April 2024', 'cat behavior articles April 2024', 'funny cat video compilations', 'trending cat videos April 2024']


In [8]:
# download_records = []
# for keyword in keywords_list:
#     download_records.append(download_file(keyword))

## Insert file to DD

In [None]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chains import LLMChain

examples = [
    {
        "activity": "I was researching climate change and polar bears",
        "files": [
            "/tmp/climate_effects_report.pdf",
            "/tmp/random_dog_picture.jpg",
            "/tmp/polar_bear_melting_ice.png"
        ],
        "output": [
            {
                "local_path": "/tmp/climate_effects_report.pdf",
                "target_path": "/home/user/Documents/climate_effects_report.pdf",
                "access_time": "2025-04-15 10:15:23",
                "modified_time": "2025-04-15 10:20:31"
            },
            {
                "local_path": "/tmp/polar_bear_melting_ice.png",
                "target_path": "/home/user/Pictures/polar_bear_melting_ice.png",
                "access_time": "2025-04-15 10:25:10",
                "modified_time": "2025-04-15 10:27:05"
            }
        ]
    },
    {
        "activity": "I was watching funny cat videos",
        "files": [
            "/tmp/cat_funny_1.mp4",
            "/tmp/cat_breed_info.txt",
            "/tmp/unrelated_politics_article.pdf"
        ],
        "output": [
            {
                "local_path": "/tmp/cat_funny_1.mp4",
                "target_path": "/home/user/Videos/cat_funny_1.mp4",
                "access_time": "2025-04-12 14:05:10",
                "modified_time": "2025-04-12 14:07:50"
            }
        ]
    }
]

example_prompt = PromptTemplate(
    input_variables=["activity", "files", "output"],
    template=(
        "User activity: {activity}\n"
        "Downloaded files:\n{files}\n"
        "Selected file operations:\n{output}\n"
    )
)


prefix = (
    "You are an intelligent assistant helping to reconstruct user behavior inside a Linux system.\n"
    "We have a clean disk image (a standard Linux filesystem) mounted, and we want to create traces\n"
    "of a user's past activities based on the files they might have interacted with.\n\n"
    "You are given two pieces of information:\n"
    "1. A description of what the user claims they were doing (activity).\n"
    "2. A list of downloaded files that might be related to that activity.\n\n"
    "Your job is to select a few relevant files (not all) that realistically fit the described activity.\n"
    " Important:\n"
    "- If a file does not seem related, you must ignore it. Do not force it in.\n"
    "- For each selected file:\n"
    "  - Keep its local path unchanged.\n"
    "  - Generate a realistic target path under '/home/user/' according to typical Linux usage conventions:\n"
    "    * Documents → /home/user/Documents/\n"
    "    * Images → /home/user/Pictures/\n"
    "    * Videos → /home/user/Videos/\n"
    "    * Important files (e.g., presentations) → /home/user/Desktop/\n"
    "- You must also generate a reasonable 'access_time' and 'modified_time' for each file:\n"
    "  - If the user activity mentions a specific date or time, base the timestamps around that.\n"
    "  - Otherwise, use the current date and time that will be provided to you.\n"
    "  - 'access_time' should be slightly before or very close to 'modified_time', like normal file usage.\n"
    "- Assume the only Linux user is called 'user'.\n\n"
    "Output format:\n"
    "Return a Python list of dictionaries. Each dictionary must contain:\n"
    "- local_path\n"
    "- target_path\n"
    "- access_time\n"
    "- modified_time\n\n"
    "Be realistic. You are helping recreate a real user's footprint. Do it naturally and thoughtfully.\n"
)


# Few-shot prompt 
select_and_plan_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=(
        "User activity: {activity}\n"
        "Downloaded files:\n{files}\n"
        "Current system time: {current_time}\n"
        "Selected file operations:\n"
    ),
    input_variables=["activity", "files", "current_time"]
)


In [13]:
select_and_plan_chain = LLMChain(
    llm=llm,  
    prompt=select_and_plan_prompt,
    verbose=True,  # Optional: Set to True for debugging
)


In [14]:
from datetime import datetime

# 动态获取当前时间（比如 ISO 格式）
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

download_record = [
    "./to_upload/aspcapro_cat_behavior_guide.pdf",
    "./to_upload/cat_care_basics.docx",
    "./to_upload/evolution_of_domestic_cats.pdf",
    "./to_upload/funny_cat_memes.jpg",
    "./to_upload/history_of_cat_breeds.txt",
    "./to_upload/feral_cat_population_study.pdf"
]


files_formatted = "\n".join(download_record)

# files_formatted 是下载下来的文件格式化好的一堆字符串
response = select_and_plan_chain.run(
    activity=activity,
    files=files_formatted,
    current_time=current_time
)




[1m> Entering new LLMChain chain...[0m


KeyError: "'local_path'"