In [1]:
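# Uncomment to install the notebook's dependencies into the kernel's environment
# (PyMuPDF provides the `fitz` module imported below).
# %pip install google-generativeai python-dotenv pydantic pymupdf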

In [2]:
import sys

# Extend the module search path with the parent directory
# (presumably where the `services` package imported later lives — confirm).
sys.path += ["../"]

In [15]:
import json
import os
import traceback
from pathlib import Path

import fitz  # PyMuPDF
from pydantic import BaseModel, Field, ValidationError

from services.gemini_wrapper import generate_insights
from services.prompts import SUMMARIZER_PROMPT

In [16]:
class SummaryStructure(BaseModel):
    # Sections expected in the summary; every field is a required string.
    # These notes are deliberately `#` comments rather than a class docstring:
    # pydantic copies a model's docstring into the generated schema description,
    # which would change the output of `SummaryStructure.schema()`.
    Philosophy_of_Policy_Makers: str = Field(
        ...,
        description="Core philosophy and strategic intent of policymakers.",
    )
    Previous_Performance_Summary: str = Field(
        ...,
        description="Key achievements and challenges in the previous period.",
    )
    Future_Predictions: str = Field(
        ...,
        description="Economic forecasts and expected trends.",
    )
    Focus_Areas_and_Sectors: str = Field(
        ...,
        description="Sectors of interest and companies to watch.",
    )
    Key_Policy_Changes_and_Reforms: str = Field(
        ...,
        description="Significant policy updates and regulatory changes.",
    )
    Growth_Drivers_and_Challenges: str = Field(
        ...,
        description="Factors driving growth and potential challenges.",
    )
    Global_Impact_and_Comparisons: str = Field(
        ...,
        description="Comparison of India's economy with global trends.",
    )
    Opportunities_for_Businesses_and_Investors: str = Field(
        ...,
        description="High-growth areas and investment opportunities.",
    )
    Technology_and_Innovation_Impact: str = Field(
        ...,
        description="Role of AI, digital transformation, and emerging technologies.",
    )
    Social_Implications: str = Field(
        ...,
        description="Effects on employment, education, and social welfare.",
    )
    Conclusion_and_Actionable_Insights: str = Field(
        ...,
        description="Key takeaways and recommendations.",
    )

class SummarySchema(BaseModel):
    # One complete record per processed document: its name, the extracted text,
    # the text's length, and the structured summary. All fields are required.
    # (`#` comments instead of a docstring so the generated schema is unchanged.)
    file_name: str = Field(
        ...,
        description="Name of the file being summarized.",
    )
    text: str = Field(
        ...,
        description="Extracted text from the document.",
    )
    text_length: int = Field(
        ...,
        description="Length of the extracted text.",
    )
    summary: SummaryStructure = Field(
        ...,
        description="Structured summary output.",
    )


In [None]:
# Display the JSON schema that pydantic generates for the summary model.
SummaryStructure.schema()

In [None]:
# Candidate input folders: the immediate subdirectories of the working directory.
# Hidden directories (names starting with ".", e.g. Jupyter's `.ipynb_checkpoints`)
# are skipped because they do not hold source documents, and the result is sorted
# because os.scandir() yields entries in arbitrary order.
with os.scandir(".") as entries:
    subfolders = sorted(
        entry.path
        for entry in entries
        if entry.is_dir() and not entry.name.startswith(".")
    )
subfolders

In [None]:
# Build one validated record per subfolder: extract the text of the first file
# found in the folder, request a JSON summary from the model, and validate both
# the summary and the assembled record with pydantic. The loop stops (break) at
# the first failure; records collected before that point remain in `data`.
data = []

for folder in subfolders:
    obj = {}
    folder_path = Path(folder)
    # Reset on every iteration: the outer error handler reports this value, and
    # without the reset it would be undefined (first iteration) or left over
    # from the previous folder when the failure happens before the file lookup.
    file_inside = None

    try:
        if not folder_path.exists() or not folder_path.is_dir():
            print(f"Skipping invalid folder: {folder}")
            continue

        # Only the first regular file reported by iterdir() is processed.
        file_inside = next((f for f in folder_path.iterdir() if f.is_file()), None)
        if not file_inside:
            print(f"No valid file found in {folder}")
            continue

        # The folder's own name doubles as the record name and the prompt title.
        # Path.name also drops a Windows-style ".\" prefix, which
        # str.replace("./", "") would leave in place.
        obj["file_name"] = folder_path.name
        print(f"Processing file: {folder}")

        with fitz.open(file_inside) as doc:
            doc_text = "\n\n".join(page.get_text() for page in doc)

        obj["text"] = doc_text
        obj["text_length"] = len(doc_text)

        # Generate structured summary using Generative AI
        summary_response = generate_insights(SUMMARIZER_PROMPT.format(title=obj["file_name"], content=obj["text"]))

        # Drop Markdown code-fence markers so the remainder can be parsed as JSON.
        summary_response = summary_response.replace("```json", "").replace("```", "")
        print("AI Summary Response:", summary_response)

        try:
            summary_dict = json.loads(summary_response)  # Convert AI-generated JSON string to dict
            obj["summary"] = SummaryStructure(**summary_dict)  # Validate JSON structure using Pydantic
        except (json.JSONDecodeError, ValidationError) as e:
            print(f"Error processing summary for file {file_inside.name}: {e}")
            traceback.print_exc()
            break

        # Validate the entire object
        validated_obj = SummarySchema(**obj)
        data.append(validated_obj.dict())  # Store as a dictionary

    except Exception as e:
        # Fall back to the folder path when no file had been selected yet.
        failed_target = file_inside.name if file_inside is not None else folder
        print(f"Unexpected error processing file {failed_target}: {e}")
        traceback.print_exc()
        break

In [None]:
# Display the records built by the processing loop (each includes the full extracted "text").
data

In [24]:
# Persist every record in `data` as JSON indented by four spaces.
with open("./economic_survey_summaries.json", mode="w") as summaries_file:
    json.dump(data, summaries_file, indent=4)

In [25]:
# Copies of the records without the extracted text, to reduce file size.
# New dicts are built instead of deleting the key in place, so `data` keeps its
# text and this cell can be re-run without raising KeyError.
updated_data = [
    {key: value for key, value in record.items() if key != "text"}
    for record in data
]

In [26]:
# Overwrite the JSON file with the records from `updated_data`, indented by four spaces.
with open("./economic_survey_summaries.json", mode="w") as summaries_file:
    json.dump(updated_data, summaries_file, indent=4)