#  RAG-based Solar Weather Impact Search app

# STEP 1: Load CSVs into DataFrames

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd

# Load CSVs (already uploaded to Google Drive or local path)
gen1 = pd.read_csv('/content/drive/MyDrive/RAG_EPGP_MLAI/RAG_Assignment/Plant_1_Generation_Data.csv')
gen2 = pd.read_csv('/content/drive/MyDrive/RAG_EPGP_MLAI/RAG_Assignment/Plant_2_Generation_Data.csv')
weather1 = pd.read_csv('/content/drive/MyDrive/RAG_EPGP_MLAI/RAG_Assignment/Plant_1_Weather_Sensor_Data.csv')
weather2 = pd.read_csv('/content/drive/MyDrive/RAG_EPGP_MLAI/RAG_Assignment/Plant_2_Weather_Sensor_Data.csv')


# STEP 2: Standardize Column Names (optional)

In [3]:
# Standardize the column names
for df in [gen1, gen2, weather1, weather2]:
    df.columns = df.columns.str.strip().str.upper()


# STEP 3: Summarize Each Row Using NLP-Friendly Format


In [4]:
def make_gen_summary(row):
    return (
        f"[Generation] At {row['DATE_TIME']}, Plant {row['PLANT_ID']} generated "
        f"{row['AC_POWER']}W AC and {row['DC_POWER']}W DC. "
        f"Daily Yield: {row['DAILY_YIELD']} kWh. Total Yield: {row['TOTAL_YIELD']} kWh."
    )

def make_weather_summary(row):
    return (
        f"[Weather] At {row['DATE_TIME']}, Plant {row['PLANT_ID']} recorded "
        f"Irradiation: {row['IRRADIATION']} W/m², Module Temp: {row['MODULE_TEMPERATURE']}°C, "
        f"Ambient Temp: {row['AMBIENT_TEMPERATURE']}°C."
    )

gen_chunks = gen1.apply(make_gen_summary, axis=1).tolist() + gen2.apply(make_gen_summary, axis=1).tolist()
weather_chunks = weather1.apply(make_weather_summary, axis=1).tolist() + weather2.apply(make_weather_summary, axis=1).tolist()

# STEP 4: Install & Load Embedding Model

In [5]:
!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle
from tqdm import tqdm
import os

# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')  # lightweight & accurate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# STEP 5: Generate & Cache Embeddings

In [6]:
all_chunks = gen_chunks + weather_chunks  # combine both sets of summaries
cache_path = "embeddings_cache.pkl"

# Load cache if available
if os.path.exists(cache_path):
    with open(cache_path, "rb") as f:
        embedding_cache = pickle.load(f)
else:
    embedding_cache = {}

embeddings = []
for chunk in tqdm(all_chunks, desc="Embedding Chunks"):
    if chunk in embedding_cache:
        emb = embedding_cache[chunk]
    else:
        emb = model.encode(chunk)
        embedding_cache[chunk] = emb
    embeddings.append(emb)

# Save cache
with open(cache_path, "wb") as f:
    pickle.dump(embedding_cache, f)

Embedding Chunks: 100%|██████████| 142917/142917 [16:18<00:00, 146.02it/s]


# STEP 6: Index Embeddings with FAISS

In [7]:
!pip install faiss-cpu

import faiss

embedding_matrix = np.array(embeddings).astype("float32")
embedding_dim = embedding_matrix.shape[1]

faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(embedding_matrix)

print("✅ FAISS index built. Total vectors:", faiss_index.ntotal)


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
✅ FAISS index built. Total vectors: 142917


# STEP 7: Save FAISS Index and Chunks


In [8]:
faiss.write_index(faiss_index, "faiss_index.index")

with open("chunk_texts.pkl", "wb") as f:
    pickle.dump(all_chunks, f)


#STEP 8: Define Search Function

In [9]:
def search_faiss(query, model, index, chunks, k=5):
    query_vector = model.encode(query).astype("float32")
    D, I = index.search(np.array([query_vector]), k)
    return [chunks[i] for i in I[0]]

# STEP 9: Setup OpenAI GPT API for Answer Generation

In [10]:
from openai import OpenAI

# Load API Key from file
with open("/content/drive/MyDrive/RAG_EPGP_MLAI/ChatGPT_Key.txt", "r") as f:
    api_key = f.read().strip()

client = OpenAI(api_key=api_key)

def generate_answer(context_chunks, query):
    context = "\n\n".join(context_chunks)
    prompt = f"""
You are a solar plant expert.
Use the below context to answer the question precisely.

Context:
{context}

Question: {query}

Answer:"""

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content


# STEP 10: Query the System

In [11]:
query = "How does temperature impact energy generation?"
top_chunks = search_faiss(query, model, faiss_index, all_chunks, k=5)
answer = generate_answer(top_chunks, query)

print("🔍 Query:", query)
# print("\n📚 Retrieved Context:\n", "\n\n".join(top_chunks))
print("\n💬 Answer:\n", answer)


🔍 Query: How does temperature impact energy generation?

💬 Answer:
 Temperature can have a significant impact on solar energy generation. In general, as the temperature increases, the efficiency of solar panels tends to decrease. This is because higher temperatures can cause solar panels to operate less efficiently, leading to a decrease in energy generation. On the other hand, colder temperatures can sometimes increase energy generation due to the improved efficiency of the panels. It is important for solar plant operators to monitor and manage the temperature of their panels to optimize energy generation.


In [12]:
query = "Does higher irradiation always result in more AC power?"
top_chunks = search_faiss(query, model, faiss_index, all_chunks, k=5)
answer = generate_answer(top_chunks, query)

print("🔍 Query:", query)
# print("\n📚 Retrieved Context:\n", "\n\n".join(top_chunks))
print("\n💬 Answer:\n", answer)

🔍 Query: Does higher irradiation always result in more AC power?

💬 Answer:
 No, higher irradiation does not always result in more AC power. While increased irradiation generally leads to higher power generation, other factors such as system efficiency, maintenance, shading, and equipment degradation can also impact the amount of AC power generated. Thus, it is important to consider all of these factors when analyzing power generation data from a solar plant.


In [13]:
query = "Compare the performance of both plants during peak sunlight hours."
top_chunks = search_faiss(query, model, faiss_index, all_chunks, k=5)
answer = generate_answer(top_chunks, query)

print("🔍 Query:", query)
# print("\n📚 Retrieved Context:\n", "\n\n".join(top_chunks))
print("\n💬 Answer:\n", answer)

🔍 Query: Compare the performance of both plants during peak sunlight hours.

💬 Answer:
 Based on the provided data, the performance of Plant 4136001 during peak sunlight hours seems to be better compared to Plant 4135001. 

During peak sunlight hours on 2020-06-08 at 13:15:00, Plant 4136001 recorded an irradiation of 0.6180603946666666 W/m², Module Temp of 50.7018586°C, and Ambient Temp of 33.37835496666667°C. 

On the other hand, during peak sunlight hours on 2020-06-08 at 15:30:00, Plant 4135001 recorded a lower irradiation of 0.4696780846666667 W/m², Module Temp of 45.644804066666666°C, and Ambient Temp of 30.64887906666667°C. 

Therefore, Plant 4136001 exhibited better performance with higher irradiation levels and module temperatures during peak sunlight hours compared to Plant 4135001.


In [14]:
query = "When do the highest yields occur and under what conditions?"
top_chunks = search_faiss(query, model, faiss_index, all_chunks, k=5)
answer = generate_answer(top_chunks, query)

print("🔍 Query:", query)
# print("\n📚 Retrieved Context:\n", "\n\n".join(top_chunks))
print("\n💬 Answer:\n", answer)

🔍 Query: When do the highest yields occur and under what conditions?

💬 Answer:
 The highest yield occurred on June 14th at 15:15:00, with a daily yield of 6439.2666666666655 kWh. This high yield was likely the result of optimal sunlight conditions, possibly due to clear skies and maximum sunlight exposure during that time of the day. Additionally, the higher than usual AC and DC power generation values at that time also contributed to the overall high yield.


In [15]:
query = "Which plant is more efficient in converting DC to AC??"
top_chunks = search_faiss(query, model, faiss_index, all_chunks, k=5)
answer = generate_answer(top_chunks, query)

print("🔍 Query:", query)
# print("\n📚 Retrieved Context:\n", "\n\n".join(top_chunks))
print("\n💬 Answer:\n", answer)

🔍 Query: Which plant is more efficient in converting DC to AC??

💬 Answer:
 Plant 4135001 was most efficient at converting DC to AC on 16-06-2020 at 17:00, as it generated 189.375W AC from 1932.0W DC, resulting in a higher efficiency compared to the other recorded dates.


In [16]:
query = "What anomalies or inconsistencies are observed in energy generation?"
top_chunks = search_faiss(query, model, faiss_index, all_chunks, k=5)
answer = generate_answer(top_chunks, query)

print("🔍 Query:", query)
# print("\n📚 Retrieved Context:\n", "\n\n".join(top_chunks))
print("\n💬 Answer:\n", answer)

🔍 Query: What anomalies or inconsistencies are observed in energy generation?

💬 Answer:
 1. The DC power generation for Plant 4135001 at 27-05-2020 13:30 seems unusually high compared to the other readings, indicating a possible anomaly in the data.
2. The Total Yield for Plant 4136001 on 2020-05-31 and 2020-06-03 shows a significant increase from the previous days, suggesting a potential inconsistency in the reporting or data collection process.
3. The Daily Yield for Plant 4136001 on 2020-06-10 is significantly lower compared to the other days, which could indicate a potential issue with the plant's operation or data recording.
