# Tutorial: Storing the Text Summarization Dataset in MongoDB and Loading It Back

In [1]:
import os
os.chdir("../")
%pwd

'd:\\AI\\NLP\\HandsOn\\Text Summarization'

In [2]:
from TextSummarizer.constants import *
from TextSummarizer.utils.file_utils import *
from TextSummarizer.utils.config_utils import *
from TextSummarizer.utils.data_utils import *
from TextSummarizer.config.configuration import ConfigurationManager
from TextSummarizer.logging import logger

  from .autonotebook import tqdm as notebook_tqdm


[2024-12-26 09:23:11,910: INFO: config: PyTorch version 2.5.1+cu121 available.]


In [20]:
import os
from collections import defaultdict
from datetime import datetime, timezone
from typing import Dict, List, Optional

import pandas as pd
from bson import ObjectId
from datasets import load_from_disk, Dataset, DatasetDict
from pymongo.errors import PyMongoError
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

In [37]:
class MongoDBHandler:
    """Move a dataset between disk and MongoDB and store/retrieve summaries.

    The handler opens one ``MongoClient`` in ``__init__`` and keeps it until
    ``close()`` is called. It can be used as a context manager, in which case
    the connection is closed when the ``with`` block exits.
    """

    def __init__(self):
        """Read configuration, connect to MongoDB and bind the collections.

        Raises:
            Exception: anything raised while reading the configuration or
                connecting is logged and re-raised. A client that was already
                opened is closed first so a failed construction does not leak
                a connection.
        """
        try:
            logger.debug("Initializing MongoDBHandler.")
            self.data_transformation_config = ConfigurationManager()
            self.root_dir = self.data_transformation_config.get_data_transformation_config().root_dir
            self.data_path = os.path.join(self.root_dir, "sampled_dataset")
            self.config = get_settings()
            self.uri = self.config.MONGODB_CONNECTION_STRING
            self.db_name = self.config.MONGODB_NAME
            self.client = self.connect_to_mongodb()
            self.db = self.client[self.db_name]
            # NOTE(review): self.db above was opened with the configured
            # MONGODB_NAME, but db_name is replaced by a hard-coded value here.
            # As a result 'texts'/'summaries' live in the configured database
            # while the splits collection (resolved through
            # self.client[self.db_name] in the methods below) lives in
            # 'text_summarization_database'. Behaviour is kept as-is; confirm
            # whether a single database was intended.
            self.db_name = "text_summarization_database"
            self.db_collection_name = "database_splits"
            self.texts = self.db['texts']
            self.summaries = self.db['summaries']
            logger.info("MongoDBHandler initialized successfully.")
        except Exception as e:
            logger.error(f"Error during initialization: {e}")
            # Do not leave a half-initialised handler holding an open client.
            client = getattr(self, "client", None)
            if client is not None:
                client.close()
            raise

    def connect_to_mongodb(self):
        """Create a ``MongoClient`` for ``self.uri`` and verify it with a ping.

        Returns:
            The connected client.

        Raises:
            PyMongoError: if the client cannot be created or the ping fails.
                The partially created client is closed before re-raising.
        """
        client = None
        try:
            client = MongoClient(self.uri, server_api=ServerApi('1'))
            client.admin.command('ping')
            print("Pinged your deployment. Successfully connected to MongoDB!")
            return client
        except PyMongoError as e:
            print(f"Failed to connect to MongoDB: {e}")
            # The constructor may have succeeded even though the ping failed;
            # release the client instead of leaking it.
            if client is not None:
                client.close()
            raise

    def load_data_from_disk(self) -> "Dataset":
        """Load the dataset stored at ``self.data_path`` with ``load_from_disk``.

        Returns:
            Whatever ``load_from_disk`` returns for that path.

        Raises:
            Exception: any loading error is logged and re-raised.
        """
        try:
            logger.debug(f"Loading dataset from disk at path: {self.data_path}")
            dataset = load_from_disk(self.data_path)
            logger.info("Dataset loaded successfully.")
            return dataset
        except Exception as e:
            logger.error(f"Error loading dataset from disk: {e}")
            raise

    def _prepare_dataset_for_mongo(self, dataset):
        """Flatten a mapping of splits into a list of row documents.

        Args:
            dataset: mapping ``split name -> split``; each split must provide
                ``to_dict()`` returning ``{column name: list of values}``.

        Returns:
            One dict per row, containing a ``'split'`` key with the split name
            plus one key per column. A split without columns or rows
            contributes no documents.
        """
        mongo_data = []

        for split_name, split_data in dataset.items():
            # Column-oriented view of the split: {column: [v0, v1, ...]}.
            split_dict = split_data.to_dict()
            columns = list(split_dict)

            # MongoDB stores row-based documents, so transpose columns to rows.
            # zip() over zero columns yields nothing, which makes an empty
            # split a no-op instead of raising StopIteration.
            for row in zip(*split_dict.values()):
                document = {'split': split_name}
                document.update(zip(columns, row))
                mongo_data.append(document)

        return mongo_data

    def test_and_create_collection(self, data):
        """Replace the splits collection with the given documents.

        The collection named ``self.db_collection_name`` in database
        ``self.db_name`` is dropped if it exists, then ``data`` is inserted
        when it is non-empty.

        Args:
            data: list of documents to insert.

        Raises:
            PyMongoError: reported on stdout and re-raised.
        """
        try:
            db = self.client[self.db_name]
            collection = db[self.db_collection_name]

            # Drop the collection if it exists
            if self.db_collection_name in db.list_collection_names():
                collection.drop()
                print(f"Dropped existing collection: {self.db_collection_name}")

            # Insert data into the collection
            if data:
                result = collection.insert_many(data)
                print(f"Inserted {len(result.inserted_ids)} documents into '{self.db_collection_name}' collection.")

        except PyMongoError as e:
            print(f"Error working with the collection: {e}")
            raise

    def upload_dataset_to_mongo(self, dataset):
        """Convert ``dataset`` to row documents and store them in MongoDB.

        Args:
            dataset: mapping of splits accepted by
                ``_prepare_dataset_for_mongo``.

        Raises:
            Exception: any failure is reported on stdout and re-raised.
        """
        try:
            # Prepare the dataset
            mongo_data = self._prepare_dataset_for_mongo(dataset)

            # Recreate the collection and populate it with the documents
            self.test_and_create_collection(mongo_data)

            print("Successfully uploaded dataset to MongoDB Atlas")

        except Exception as e:
            print(f"Error uploading dataset to MongoDB: {e}")
            raise

    def mongodb_to_datasetdict(self) -> "DatasetDict":
        """Read every document of the splits collection into a ``DatasetDict``.

        Documents are grouped by their ``'split'`` field (``'train'`` when the
        field is missing) and each group is converted back to column format.

        Returns:
            A ``DatasetDict`` with one ``Dataset`` per split found.

        Raises:
            ValueError: if the collection contains no documents.
            PyMongoError: on database errors (logged and re-raised).
            Exception: any other conversion error (logged and re-raised).
        """
        logger.info("Starting to retrieve and convert MongoDB data to DatasetDict")
        try:
            # Get database and collection
            db = self.client[self.db_name]
            collection = db[self.db_collection_name]
            logger.debug(f"Connected to database '{self.db_name}' and collection '{self.db_collection_name}'")

            # Retrieve all documents (excluding MongoDB _id field)
            all_documents = list(collection.find({}, {'_id': 0}))
            logger.info(f"Retrieved {len(all_documents)} documents from MongoDB")

            if not all_documents:
                logger.error("No documents found in the collection")
                raise ValueError("No documents found in the collection")

            # Group documents by split
            split_data = defaultdict(list)
            for doc in all_documents:
                split_name = doc.pop('split', 'train')
                split_data[split_name].append(doc)
            logger.debug(f"Grouped documents into {len(split_data)} splits: {list(split_data.keys())}")

            # Convert to DatasetDict format
            dataset_dict = {}
            for split_name, documents in split_data.items():
                # Convert list of documents to column format
                features = defaultdict(list)
                for doc in documents:
                    for key, value in doc.items():
                        features[key].append(value)

                # Create Dataset object for each split
                dataset_dict[split_name] = Dataset.from_dict(features)
                logger.debug(f"Created Dataset for split '{split_name}' with {len(documents)} examples")

            final_dataset = DatasetDict(dataset_dict)
            logger.info("Successfully converted MongoDB data to DatasetDict")
            return final_dataset

        except PyMongoError as e:
            logger.error(f"MongoDB error: {str(e)}", exc_info=True)
            raise
        except Exception as e:
            logger.error(f"Error converting data to DatasetDict: {str(e)}", exc_info=True)
            raise

    def save_summary(self, text: str, summary: str, model_name: str) -> str:
        """Insert a summary document into the ``summaries`` collection.

        Args:
            text: the source text.
            summary: the generated summary.
            model_name: name of the model that produced the summary.

        Returns:
            The inserted document's id as a string; pass it to
            ``get_summary`` to read the document back.

        Raises:
            Exception: any failure is logged and re-raised.
        """
        try:
            logger.debug(f"Saving summary for text with model: {model_name}")
            doc = {
                "text": text,
                "summary": summary,
                "model": model_name,
                # MongoDB stores datetimes as UTC and PyMongo treats naive
                # values as UTC, so record an explicit UTC timestamp rather
                # than naive local time.
                "created_at": datetime.now(timezone.utc)
            }
            result = self.summaries.insert_one(doc)
            logger.info(f"Summary saved with ID: {result.inserted_id}")
            return str(result.inserted_id)
        except Exception as e:
            logger.error(f"Error saving summary: {e}")
            raise

    def get_summary(self, text_id: str) -> Optional[Dict]:
        """Fetch one summary document by id.

        Args:
            text_id: the id returned by ``save_summary`` (the string form of
                the document's ``_id``).

        Returns:
            The matching document, or ``None`` when nothing matches.

        Raises:
            Exception: any failure is logged and re-raised.
        """
        try:
            logger.debug(f"Fetching summary with ID: {text_id}")
            # save_summary hands out str(ObjectId); querying with that plain
            # string never matches the ObjectId stored in _id, so convert it
            # back first.
            lookup_id = ObjectId(text_id) if ObjectId.is_valid(text_id) else text_id
            summary = self.summaries.find_one({"_id": lookup_id})
            if summary is None and lookup_id != text_id:
                # Still honour documents whose _id really is the raw value.
                summary = self.summaries.find_one({"_id": text_id})
            logger.info(f"Summary fetched: {summary}")
            return summary
        except Exception as e:
            logger.error(f"Error fetching summary: {e}")
            raise

    def get_summaries_by_model(self, model_name: str) -> List[Dict]:
        """Return every summary document whose ``model`` equals ``model_name``.

        Raises:
            Exception: any failure is logged and re-raised.
        """
        try:
            logger.debug(f"Fetching summaries for model: {model_name}")
            summaries = list(self.summaries.find({"model": model_name}))
            logger.info(f"Fetched {len(summaries)} summaries for model: {model_name}")
            return summaries
        except Exception as e:
            logger.error(f"Error fetching summaries by model: {e}")
            raise

    def close(self):
        """Close the MongoDB client; errors are logged and re-raised."""
        try:
            logger.debug("Closing MongoDB connection.")
            self.client.close()
            logger.info("MongoDB connection closed.")
        except Exception as e:
            logger.error(f"Error closing MongoDB connection: {e}")
            raise

    def __enter__(self):
        """Return the handler itself for use in a ``with`` statement."""
        logger.debug("Entering MongoDBHandler context.")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the connection on leaving the ``with`` block.

        Returns ``None``, so an exception raised inside the block propagates.
        """
        logger.debug("Exiting MongoDBHandler context.")
        self.close()


In [38]:
# Round-trip the sampled dataset: disk -> MongoDB -> DatasetDict.
mongodbhandler = MongoDBHandler()

# Keep the on-disk copy under its own name so it is not silently replaced by
# the copy read back from MongoDB.
dataset_from_disk = mongodbhandler.load_data_from_disk()
mongodbhandler.upload_dataset_to_mongo(dataset_from_disk)

# `dataset` is the version rebuilt from the MongoDB documents.
dataset = mongodbhandler.mongodb_to_datasetdict()

# NOTE(review): the connection is left open in case later cells reuse
# `mongodbhandler`; call mongodbhandler.close() once it is no longer needed.

[2024-12-26 09:59:42,179: INFO: config_utils: yaml file: config\config.yaml loaded successfully]
[2024-12-26 09:59:42,182: INFO: config_utils: yaml file: params.yaml loaded successfully]
[2024-12-26 09:59:42,184: INFO: file_utils: created directory at: artifacts]
[2024-12-26 09:59:42,185: INFO: file_utils: created directory at: artifacts/data_transformation]
Pinged your deployment. Successfully connected to MongoDB!
[2024-12-26 09:59:43,748: INFO: 1865962573: MongoDBHandler initialized successfully.]
[2024-12-26 09:59:43,764: INFO: 1865962573: Dataset loaded successfully.]
Dropped existing collection: database_splits
Inserted 60 documents into 'database_splits' collection.
Successfully uploaded dataset to MongoDB Atlas
[2024-12-26 09:59:47,963: INFO: 1865962573: Starting to retrieve and convert MongoDB data to DatasetDict]
[2024-12-26 09:59:49,336: INFO: 1865962573: Retrieved 60 documents from MongoDB]
[2024-12-26 09:59:49,373: INFO: 1865962573: Successfully converted MongoDB data to D