In [43]:
import os
import datetime

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain_google_genai import GoogleGenerativeAI
from langchain_core.messages import SystemMessage, AIMessage, HumanMessage

import json

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Only fall back to the placeholder when no key is exported in the environment,
# so a real key set before starting the kernel is never clobbered.
# Replace the placeholder locally (never commit a real key to the notebook).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = "YOUR_GOOGLE_GENAI_API_KEY"

In [15]:
# Plain-text source file that initialize_vectorstore reads through TextLoader when it builds the index.
data_path = "../Data/iot_readable_last_en.txt"
# Model name passed to HuggingFaceEmbeddings.
encoder_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Selects the embedding backend branch in initialize_vectorstore; only "transformer" builds a model there.
encoder_model_lib = "transformer"
# NOTE(review): presumably the model name to hand to initialize_llm — no visible cell reads it; confirm.
decoder_name = "gemini-2.5-flash"
# Folder whose existence initialize_vectorstore checks and which it passes to FAISS.load_local.
vectorstore_path = "../Data/iot_data_faiss_en_new"

In [16]:
def _build_embeddings():
    """Return the CPU HuggingFace embedding model named by ``encoder_name``."""
    return HuggingFaceEmbeddings(
        model_name=encoder_name,
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False}
    )


def initialize_vectorstore():
    """Build the FAISS index from ``data_path`` or load it from ``vectorstore_path``.

    When ``vectorstore_path`` does not exist, the text file is read, one
    metadata dict (``Date``, ``Hour``, ``Room``, ``Sensor``) is parsed per line,
    the text is split into chunks, the chunks are embedded and the resulting
    index is saved to ``vectorstore_path`` so the next call can load it.
    When the folder already exists, the saved index is loaded instead.

    Returns:
        The FAISS vector store.

    Raises:
        NotImplementedError: if ``encoder_model_lib`` names a backend that has
            no embedding implementation (anything other than ``"transformer"``).
    """
    if os.path.exists(vectorstore_path):
        print(f"Metadatalı Vektör Veritabanı {vectorstore_path} Klasöründen Yükleniyor...")
        embeddings = _build_embeddings()

        # SECURITY: load_local deserializes pickled data, which can execute
        # arbitrary code. Only load index folders that this notebook created.
        vectorstore = FAISS.load_local(folder_path=vectorstore_path, embeddings=embeddings, allow_dangerous_deserialization=True)
        print("Metadatalı Vektör Veritabanı Başarıyla Yüklendi!")
        return vectorstore

    # LOAD -> SPLIT -> EMBED -> STORE -> RETRIEVE
    loader = TextLoader(data_path, encoding="utf-8")
    text_file = loader.load()
    raw_text = text_file[0].page_content

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n"],
        chunk_size = 1,
        chunk_overlap = 0
    )

    # Metadata extraction
    print("Metadata Çıkarılıyor...")
    metadata = []
    for line in raw_text.split("\n"):
        parts = [p.strip() for p in line.split(",")]

        # Stop at the first line that has no comma-separated fields.
        if len(parts) <= 1:
            break

        date = parts[0].split(":")[1].strip()
        # maxsplit=1 keeps the "HH:MM" value intact after the field label.
        hour = parts[1].split(":", 1)[1].strip()
        room = parts[2].split(":")[1].strip()
        sensor = parts[3].split(":")[1].strip()

        dtime = datetime.datetime.strptime(f"{date} {hour}", "%Y-%m-%d %H:%M")
        timestamp = dtime.timestamp()

        metadata.append({"Date": timestamp, "Hour": hour, "Room": room, "Sensor": sensor})
    print("Metadata Hazır!\n")

    print("Metinler Ayrıştırılıyor...")
    splitted_texts = text_splitter.split_text(raw_text)
    print(f"Metinden şu kadar satır çıkarıldı: {len(splitted_texts)}")
    # The last chunk is dropped before pairing chunks with metadata.
    splitted_texts = splitted_texts[:-1]
    print(f"Veri olarak seçilen satır sayısı: {len(splitted_texts)}")
    print(f"Metadata listesi uzunluğu: {len(metadata)}\n")

    # zip() pairs chunks and metadata positionally and stops at the shorter list.
    docs = [Document(page_content=text, metadata=meta) for text, meta in zip(splitted_texts, metadata)]

    print(f"Embedding Modeli Yükleniyor... Model Kütüphanesi: {encoder_model_lib}")
    if encoder_model_lib == "transformer":
        embeddings = _build_embeddings()
    else:
        # Previously these backends fell through and failed later with a NameError.
        raise NotImplementedError(
            f"Embedding backend {encoder_model_lib!r} is not implemented; use 'transformer'."
        )
    print("Embedding Modeli Hazır!\n")

    print("Metadatalı Vektör Veritabanı Başlatılıyor...")
    vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)
    # Persist the index so the load branch above is used on the next call.
    vectorstore.save_local(vectorstore_path)
    print("Metadatalı Vektör Veritabanı Hazır!")

    return vectorstore

In [52]:
def fetch_documents(structured_query : str, vectorstore, metadatas):
    """Retrieve documents for ``structured_query`` using MMR search filtered by hour.

    Args:
        structured_query: The rewritten query text to search with.
        vectorstore: Object exposing ``as_retriever(**kwargs)``.
        metadatas: Mapping that must contain an ``"Hour"`` key; its value is
            used as the metadata filter.

    Returns:
        Whatever the retriever's ``invoke`` returns for the query.

    Raises:
        KeyError: if ``metadatas`` has no ``"Hour"`` entry.
    """
    search_kwargs = {
        "k": 10,
        "fetch_k": 300,
        "filter": {"Hour": metadatas["Hour"]},
    }
    # The keyword must be spelled `search_type`; the former `search_tpye` typo
    # meant MMR was never actually requested.
    retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=search_kwargs)
    return retriever.invoke(structured_query)

In [18]:
def initialize_llm(llm_name : str, llm_lib : str):
    """Create the LLM client for the given backend library.

    Args:
        llm_name: Model name passed to the backend.
        llm_lib: Backend identifier, compared case-insensitively. Only
            ``"google-genai"`` is supported.

    Returns:
        A ``GoogleGenerativeAI`` instance created with ``temperature=0``.

    Raises:
        ValueError: if ``llm_lib`` is not a supported backend. Previously this
            case printed a message and then failed on an unbound ``llm``.
    """
    if llm_lib.lower() == "google-genai":
        return GoogleGenerativeAI(model=llm_name, temperature=0)

    raise ValueError(
        f"LLM Lib Bulunamadı! Unsupported llm_lib: {llm_lib!r} (supported: 'google-genai')"
    )

In [19]:
def create_query_prompt(user_query : str):
    current_year = datetime.datetime.now().year

    messages = [
        SystemMessage(content=f"""
    Your task is to extract specific fields from the given input sentence and return the result strictly in JSON format.
    
    Rules:
    - Always include these 4 fields in the JSON: "date", "time", "room", "query".
    - If the sentence has a specific calendar date (e.g., "16 August 2025"), extract it in ISO format YYYY-MM-DD if possible.
      If the year is not given, return only the given part (e.g., "16 August").
    - If no explicit date is mentioned, return "not mentioned".
    - If year is not mentioned use current year, Current Year: {current_year}
    - For the "time" field:
        * If the query has explicit time (e.g., "9 A.M."), convert it to 24-hour format (e.g., "09:00").
        * If the query uses relative expressions like "yesterday", "today", "current", "last time", keep them as they are.
        * If no time expression exists, return "not mentioned".
    - "room" should always be extracted from the input (e.g., "living room", "bedroom").
    - "query" should capture what is being asked (e.g., "temperature", "humidity").")
    """),

    AIMessage(content="Understood. I will extract the fields from the input sentence and provide the output in JSON format. Please provide the first input."),
    HumanMessage(content="Input: What is the current temperature in the living room?"),
    AIMessage(content='{"date": "current", "time": "not mentioned", "room": "living room", "query": "temperature"}'),
    HumanMessage(content="Input: What was the temperature in the bedroom yesterday?"),
    AIMessage(content='{"date": "yesterday", "time": "not mentioned", "room": "bedroom", "query": "temperature"}'),
    HumanMessage(content="Input: What was the temperature in the bedroom last time?"),
    AIMessage(content='{"date": "last time", "time": "not mentioned", "room": "bedroom", "query": "temperature"}'),
    HumanMessage(content="Input: What is the current humidity level in the sitting room?"),
    AIMessage(content='{"date": "current", "time": "not mentioned", "room": "sitting room", "query": "humidity"}'),
    HumanMessage(content="Input: What was the humidity level in living room yesterday at 5 P.M"),
    AIMessage(content='{"date": "yesterday", "time": "17:00", "room": "living room", "query": "humidity"}'),
    HumanMessage(content="Input: What was the temperature in bathroom yesterday"),
    AIMessage(content='{"date": "yesterday", "time": "not mentioned", "room": "bathroom", "query": "temperature"}'),
    HumanMessage(content="Input: What was the temperature in the living room at the 16 August 2025 9 A.M?"),
    AIMessage(content='{"date": "2025-08-16", "time": "09:00", "room": "living room", "query": "temperature"}'),
    HumanMessage(content="Input: What was the humidity level in bedroom at 20 August"),
    AIMessage(content='{"date": "2025-08-16, "time": "not mentioned", "room": "bedroom", "query": "humidity"}'),
    ]

    messages.append(HumanMessage(content=f"Input: {user_query}"))

    return messages

In [29]:
# Tokens that mean "no specific value given / use the present moment".
_NOW_TOKENS = frozenset({"not mentioned", "current", "now", "today", "last time"})


def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) from ``text``."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    return cleaned.strip()


def rewrite_query_and_extract_metadata(structured_model_output : str):
    """Turn the LLM's JSON extraction into a search query and a metadata dict.

    The incoming string must be a JSON object with 4 fields:

    - date:  'current', 'yesterday', 'now', 'last time', 'not mentioned'
             or an explicit date such as '2025-08-16'
    - time:  'not mentioned' or a 24-hour time such as '09:00'
    - room:  e.g. 'kitchen', 'bedroom', 'living room', 'bathroom'
    - query: e.g. 'temperature', 'humidity'

    Relative dates are resolved against the current local date; a missing time
    becomes the current local time rounded down to a 5-minute step.

    Args:
        structured_model_output: JSON text produced by the extraction prompt.
            A surrounding Markdown code fence is tolerated.

    Returns:
        ``(content, metadata)`` where ``content`` is the rewritten question and
        ``metadata`` has the keys ``Date``, ``Hour``, ``Room`` and ``Sensor``.

    Raises:
        ValueError: if the text is not valid JSON or a required field is
            missing. Previously these cases were printed and then crashed on
            unbound local names.
    """
    try:
        fields = json.loads(_strip_code_fence(structured_model_output))
    except (AttributeError, TypeError, ValueError) as exc:
        raise ValueError(
            f"Şu girdiyi JSON objesine çevirmeye çalışırken hata oluştu: {structured_model_output}"
        ) from exc

    try:
        date, time_of_day, room, query = (
            str(fields[key]).strip() for key in ("date", "time", "room", "query")
        )
    except (KeyError, TypeError, IndexError) as exc:
        raise ValueError(
            f"JSON objesinden valueleri almaya çalışırken hata meydana geldi {fields}"
        ) from exc

    now = datetime.datetime.now()
    current_date = now.strftime("%Y-%m-%d")
    yesterday_date = (now - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    # Round the current time down to a 5-minute step.
    rounded_hour = now.replace(minute=(now.minute // 5) * 5).strftime("%H:%M")

    date_key = date.lower()
    is_yesterday = date_key == "yesterday"
    if date_key in _NOW_TOKENS:
        new_date = current_date
    elif is_yesterday:
        new_date = yesterday_date
    else:
        new_date = date

    # A relative word in the time slot carries no clock time, so fall back to
    # the rounded current time exactly like "not mentioned".
    time_key = time_of_day.lower()
    if time_key in _NOW_TOKENS or time_key == "yesterday":
        new_time = rounded_hour
    else:
        new_time = time_of_day

    # Tense follows the *date* field, and the query carries the resolved date
    # rather than the raw token (e.g. "2025-08-15" instead of "yesterday").
    verb = "was" if is_yesterday else "is"
    content = f"What {verb} the {query} in {room} on {new_date} at {new_time}"

    metadata = {"Date": new_date, "Hour": new_time, "Room": room.capitalize(), "Sensor":  query.capitalize()+" Sensor" }

    return content, metadata

In [30]:
def create_answering_prompt(user_query: str, docs_from_faiss, metadata_info):
    """Build the single-message prompt that asks the LLM to answer from context.

    Args:
        user_query: The user's original question.
        docs_from_faiss: Iterable of retrieved documents exposing
            ``page_content``; when empty or ``None`` a fallback sentence is used.
        metadata_info: Metadata for the query. A dict is rendered as JSON so it
            matches the format shown in the prompt's examples; a falsy value is
            replaced by a fallback sentence; anything else is inserted as-is.

    Returns:
        A list containing one SystemMessage with the fully formatted prompt.
    """

    # Literal braces in the examples are doubled because the template goes
    # through str.format below.
    answering_prompt_template = """
        You are a helpful and concise smart home assistant. Your primary goal is to answer the user's question based strictly on the context provided under "[Provided Information]" and "[Metadata]". Follow all rules precisely.

        Here are the rules and examples for how you must respond.

        ### Rules
        1.  Your main goal is to answer the user's question using only the information within the '[Provided Information]' and '[Metadata]' blocks.
        2.  If the information is sufficient to answer the question, synthesize it into a complete, natural, and helpful English sentence. Do not just state the number.
        3.  If the '[Provided Information]' block is empty, does not contain relevant details, or is insufficient to answer the question, you must respond with the exact phrase: "I don't have enough information on this topic, please try again later."
        4.  After providing the direct answer based on the information, you are allowed to add one short, relevant, and interesting fact in English if you are highly confident about it. This extra fact must start with the prefix "Info: ". Do not add this if you have no information.

        ### Examples

        ## Example 1: (Sufficient Information)
        [User Question]
        What was the temperature in the sitting room yesterday?

        [Provided Information]
        Date: 2025-08-15, Time: 9:00, Room: Sitting Room, Sensor: Temperature Sensor, Sensor Value: 23.4, Sensor Unit: °C, Status: Low Temperature

        [Metadata]
        {{"Date": "2025-08-15", "Hour": "09:00", "Room": "Sitting room", "Sensor": "Temperature Sensor"}}

        [Your Answer]
        The temperature recorded in the sitting room yesterday was 23.4°C, which was considered low.

        ## Example 2: (Insufficient Information)
        [User Question]
        What is the orbit of Mars?

        [Provided Information]
        No document information found for this topic.

        [Metadata]
        No metadata information found for this topic.

        [Your Answer]
        I don't have enough information on this topic, please try again later.

        ## Example 3: (Sufficient Information + Extra Fact)
        [User Question]
        What is the current temperature in the living room?

        [Provided Information]
        Date: 2025-08-16, Time: 15:00, Room: Living Room, Sensor: Temperature Sensor, Sensor Value: 25.4, Sensor Unit: °C, Status: Normal

        [Metadata]
        {{"Date": "2025-08-16", "Hour": "15:00", "Room": "Living room", "Sensor": "Temperature Sensor"}}

        [Your Answer]
        The current temperature in the living room is 25.4°C, which is a normal value. Info: The ideal room temperature for humans is generally considered to be between 20-22°C.


        ## Now, answer the user's actual question based on the provided information and metadata information:

        [User Question]
        {user_question_placeholder}

        [Provided Information]
        {context_from_faiss_placeholder}

        [Metadata]
        {metadata_info_placeholder}

        [Your Answer]
    """

    if docs_from_faiss:
        context = "\n".join(doc.page_content for doc in docs_from_faiss)
    else:
        # Same wording as Example 2 so the model recognises the empty case.
        context = "No document information found for this topic."

    if not metadata_info:
        metadata_text = "No metadata information found for this topic."
    elif isinstance(metadata_info, dict):
        # JSON keeps the double-quoted form used in the examples; default=str
        # covers values json cannot serialize natively.
        metadata_text = json.dumps(metadata_info, ensure_ascii=False, default=str)
    else:
        metadata_text = metadata_info

    answering_prompt_final = answering_prompt_template.format(
        user_question_placeholder = user_query,
        context_from_faiss_placeholder = context,
        metadata_info_placeholder = metadata_text
    )

    return [SystemMessage(content=answering_prompt_final)]

In [63]:
# Sample question used to exercise the pipeline end to end.
user_query = "What was the temperature in kitchen on 15 august 9 a.m"

# Few-shot extraction messages for the question above.
query_prompt_template = create_query_prompt(user_query)

In [64]:
# No visible cell creates `llm`, so build it here to keep the notebook runnable
# from a fresh kernel (Restart & Run All).
llm = initialize_llm(llm_name=decoder_name, llm_lib="google-genai")

# Ask the LLM to extract date / time / room / query from the question.
model_output = llm.invoke(query_prompt_template)

In [70]:
# Display the raw string returned by llm.invoke for the extraction prompt.
model_output

'{"date": "2025-08-15", "time": "09:00", "room": "kitchen", "query": "temperature"}'

In [65]:
# Convert the extraction output into a search query plus metadata.
structured_query, metadatas = rewrite_query_and_extract_metadata(
    model_output
)

In [66]:
# No visible cell creates `vectorstore`, so build or load it here to keep the
# notebook runnable from a fresh kernel (Restart & Run All).
vectorstore = initialize_vectorstore()

# Retrieve the documents matching the rewritten query and its metadata.
docs_from_faiss = fetch_documents(structured_query=structured_query, vectorstore=vectorstore, metadatas=metadatas)

In [67]:
# Assemble the final answering prompt from the question, the retrieved
# documents and the extracted metadata.
answering_prompt_template = create_answering_prompt(
    user_query,
    docs_from_faiss,
    metadatas,
)

In [73]:
# No visible cell assigns `model_response`; generate the final answer from the
# answering prompt before displaying it.
model_response = llm.invoke(answering_prompt_template)
model_response

'On August 15th at 9 a.m., the temperature in the kitchen was 21.35°C, which was considered a high temperature. Info: The ideal kitchen temperature for food safety and comfort is generally between 18-21°C.'