In [6]:
!pip install pandas
!pip install numpy
!pip install sentence-transformers
!pip install transformers
!pip install scikit-learn
!pip install python-docx


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




## Using the GPT-2 model

In [9]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document
import re

class CarbonCreditDocGenerator:
    """Generate a carbon-credit project document (.docx) from a numbered submission form.

    Pipeline: the form text is parsed into ``{section title: {field: value}}``;
    for each output section the most similar knowledge-base sentences are
    retrieved with SBERT, GPT-2 continues a prompt built from the section data
    and that knowledge, and a corrective pass appends any input value or
    knowledge sentence the generated text failed to mention.
    """

    def __init__(self):
        """Load the SBERT encoder, the GPT-2 generation pipeline and the knowledge base."""
        self.sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.nlg_pipeline = pipeline("text-generation", model="gpt2", max_length=1000)

        # Load your knowledge base here
        self.knowledge_base = self.load_knowledge_base()

    def load_knowledge_base(self):
        """Return the built-in list of carbon-credit domain sentences used for retrieval."""
        return [
            "Carbon credits represent the reduction of one metric ton of carbon dioxide emissions.",
            "Afforestation projects involve planting trees in areas where there were none before.",
            "The Verified Carbon Standard (VCS) is a widely recognized certification for carbon credits.",
            "Carbon credit projects must demonstrate additionality, meaning the reductions wouldn't have occurred without the project.",
            "Monitoring, reporting, and verification (MRV) are crucial components of carbon credit projects.",
            "Project developers must provide detailed information about project location, type, and expected carbon sequestration.",
            "Carbon credit pricing can vary based on project type, location, and additional benefits.",
            "Environmental Impact Assessments (EIA) are often required for carbon credit projects.",
            "Community engagement and social benefits are important aspects of many carbon credit projects.",
            "Risk assessment and mitigation strategies are crucial for project success and credibility."
        ]

    def _knowledge_embeddings(self):
        """Return embeddings of the knowledge base, encoding it only when it has changed."""
        snapshot = tuple(self.knowledge_base)
        cached = getattr(self, "_kb_cache", None)
        if cached is None or cached[0] != snapshot:
            cached = (snapshot, self.sbert_model.encode(self.knowledge_base))
            self._kb_cache = cached
        return cached[1]

    def process_input_data(self, input_text):
        """Parse a numbered form into ``{section title: {field: value}}``.

        A section starts at a line beginning with ``<number>. `` and its title
        is the rest of that line. Inside a section, ``key: value`` lines become
        fields; lines without a colon are appended to the most recently parsed
        field of that section (or dropped when the section has no field yet).
        Text before the first numbered heading is ignored.

        Args:
            input_text: Raw form text.

        Returns:
            dict mapping each section title to a dict of its fields.
        """
        # Anchor headings to the start of a line: an unanchored pattern also
        # matches sentence endings such as "... by Q2 of 2025.\n" and would cut
        # the section (and the field value) in two.
        sections = re.split(r'^[ \t]*\d+\.[ \t]+', input_text, flags=re.MULTILINE)[1:]
        data = {}
        for section in sections:
            lines = section.strip().split('\n')
            section_title = lines[0].strip()
            fields = {}
            data[section_title] = fields
            last_key = None  # field that continuation lines belong to
            for line in lines[1:]:
                if ':' in line:
                    key, value = line.split(':', 1)
                    last_key = key.strip()
                    fields[last_key] = value.strip()
                    continue
                text = line.strip()
                if text and last_key is not None:
                    # strip() avoids a leading space when the field started empty
                    fields[last_key] = f"{fields[last_key]} {text}".strip()
        return data

    def retrieve_relevant_knowledge(self, query, top_k=3):
        """Return up to ``top_k`` knowledge-base sentences most similar to ``query``.

        Similarity is the cosine similarity between SBERT embeddings; results
        are ordered from most to least similar. Returns an empty list when
        ``top_k`` is not positive or the knowledge base is empty.
        """
        # Guard explicitly: a slice of [-0:] would return the whole knowledge base.
        if top_k <= 0 or not self.knowledge_base:
            return []

        query_embedding = self.sbert_model.encode([query])[0]
        knowledge_embeddings = self._knowledge_embeddings()

        similarities = cosine_similarity([query_embedding], knowledge_embeddings)[0]
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        return [self.knowledge_base[i] for i in top_indices]

    def generate_section_content(self, section_title, input_data, max_length=1000):
        """Generate the text of one document section.

        Args:
            section_title: Heading of the section to write.
            input_data: Parsed form as returned by ``process_input_data``.
            max_length: ``max_length`` forwarded to the generation pipeline.

        Returns:
            The generated continuation after the corrective pass, stripped of
            surrounding whitespace.
        """
        query = f"Generate content for the '{section_title}' section of a carbon credit document."
        relevant_knowledge = self.retrieve_relevant_knowledge(query)

        # Use the specific section data if available, otherwise use all input data
        section_data = input_data.get(section_title, input_data)
        context = f"Input data: {section_data}\n\nRelevant knowledge: {' '.join(relevant_knowledge)}"
        prompt = f"{context}\n\nTask: {query}\n\nContent:"

        # return_full_text=False keeps only the continuation. With the default,
        # the prompt itself ends up in the document and the corrective pass
        # never fires because every input value already appears in the prompt.
        generated_text = self.nlg_pipeline(
            prompt,
            max_length=max_length,
            num_return_sequences=1,
            return_full_text=False,
        )[0]['generated_text'].strip()

        # Apply corrective RAG
        corrected_text = self.apply_corrective_rag(generated_text, section_data, relevant_knowledge)

        return corrected_text.strip()

    def apply_corrective_rag(self, generated_text, input_data, relevant_knowledge):
        """Append input values and knowledge sentences missing from ``generated_text``.

        Presence is checked with a case-insensitive substring test against the
        text accumulated so far. ``input_data`` may be a flat ``{field: value}``
        dict or one level of nesting (``{section: {field: value}}``).

        Returns:
            ``generated_text`` followed by `` key: value.`` for each missing
            value and by each missing knowledge sentence.
        """
        corrected_text = generated_text

        # Ensure all input data is represented
        for key, value in input_data.items():
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    if sub_value.lower() not in corrected_text.lower():
                        corrected_text += f" {sub_key}: {sub_value}."
            elif value.lower() not in corrected_text.lower():
                corrected_text += f" {key}: {value}."

        # Ensure relevant knowledge is incorporated
        for knowledge in relevant_knowledge:
            if knowledge.lower() not in corrected_text.lower():
                corrected_text += f" {knowledge}"

        return corrected_text

    def create_document(self, input_text):
        """Build and return the python-docx ``Document`` for ``input_text``.

        One level-1 heading plus one generated paragraph is added for each of
        the eight fixed sections listed below.
        """
        doc = Document()
        doc.add_heading('Carbon Credit Project Document', 0)

        input_data = self.process_input_data(input_text)

        # Titles match the numbered headings of the submission form so that each
        # section is generated from its own fields.
        sections = [
            "Project Overview",
            "Seller/Proponent Information",
            "Carbon Credit Specifications",
            "Financial & Pricing Information",
            "Project Impact and Sustainability",
            "Risks & Mitigation Strategies",
            "Supporting Documentation",
            "Declarations and Acknowledgements"
        ]

        for section in sections:
            doc.add_heading(section, level=1)
            content = self.generate_section_content(section, input_data)
            doc.add_paragraph(content)

        return doc

    def generate_document(self, input_text, output_path):
        """Create the document for ``input_text``, save it to ``output_path`` and report it."""
        doc = self.create_document(input_text)
        doc.save(output_path)
        print(f"Carbon credit document generated and saved to {output_path}")

# Usage: build the GPT-2 based generator and run it on a sample submission form.
# Constructing the generator loads the SBERT encoder and the GPT-2 pipeline.
generator = CarbonCreditDocGenerator()
# Sample form: numbered section headings followed by "key: value" lines;
# lines without a colon continue the preceding field.
input_text = """
Carbon Credit Project Submission Form
1. Project Overview
Project Title: Green Future Reforestation Initiative
Executive Summary:
The Green Future Reforestation Initiative aims to restore 500 hectares of degraded land through afforestation and sustainable land management practices. The project seeks to sequester carbon dioxide, enhance biodiversity, and improve local air quality while providing economic opportunities for surrounding communities.
Location:
Country: Brazil
Region: Amazon Rainforest
Coordinates: Latitude -3.4653, Longitude -62.2159
Project Category: Afforestation
Project Start Date: 01/15/2024
Expected Completion Date: 12/31/2028
2. Seller/Proponent Information
Organization Name: EcoSustainability Inc.
Contact Person: Maria Silva
Job Title: Project Manager
Email Address: maria.silva@ecosustainability.org
Phone Number: +55 11 98765-4321
Mailing Address:
EcoSustainability Inc.
Avenida Paulista, 1000
São Paulo, SP, Brazil
Zip Code: 01310-100
3. Carbon Credit Specifications
Expected Carbon Credits: 25,000 tonnes of CO₂ equivalent annually
Carbon Credit Standard: Verified Carbon Standard (VCS)
Carbon Credit Methodology: VCS Methodology VM0017 (Afforestation and Reforestation)
Verification Status: Not yet verified; expected verification by Q2 of 2025.
4. Financial & Pricing Information
Total Project Budget: $2,500,000
Sources of Funding:
Internal Funding: $1,000,000
External Investors: $1,200,000
Grant Support: $300,000 (from the Green Climate Fund)
Carbon Credit Pricing: $15 per tonne of CO₂ equivalent
5. Project Impact and Sustainability
Environmental Impact:
The project is expected to sequester approximately 100,000 tonnes of CO₂ over its lifetime while restoring native habitats and increasing biodiversity by planting over 200,000 indigenous trees.
Community and Social Benefits:
The initiative will create around 150 jobs in tree planting and maintenance, provide training in sustainable agriculture to local farmers, and enhance community infrastructure through improved access roads.
Sustainability and Long-term Goals:
The project will establish a community stewardship program ensuring ongoing maintenance and monitoring of the forest. Plans for future expansion include additional reforestation sites and eco-tourism initiatives.
6. Risks & Mitigation Strategies
Risk Assessment:
Key risks include deforestation due to illegal logging, climate change impacts on growth rates, and financial instability.
Mitigation Strategies:
Implementing strict monitoring protocols, engaging local communities in protection efforts, securing insurance against natural disasters, and diversifying funding sources.
7. Supporting Documentation
Project Plan: Attached (PDF)
Environmental Impact Assessment (EIA): Attached (PDF)
Verification Reports: Not applicable at this stage.
Financial Projections: Attached (PDF)
Additional Certifications or Approvals: Attached (PDF)
8. Declarations and Acknowledgements
By signing below, I certify that the information provided is accurate and complete to the best of my knowledge. I acknowledge that any false or misleading information may result in the disqualification of this submission.
Name of Authorized Signatory: Carlos Mendes
Signature: [Digital Signature]
Date: 10/06/2024
Submission Guidelines:
Review and Complete: Ensure all sections are fully completed and accurate.
Attach Documents: Attach all required supporting documentation in the appropriate format (PDF preferred).
Submit: Submit the completed form and attachments via email to submissions@ecosustainability.org or through the Online Submission Portal.
"""
# Parse the form, generate every section and save the .docx to the given path.
generator.generate_document(input_text, '/kaggle/working/carbon_credit_document2.docx')



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Carbon credit document generated and saved to /kaggle/working/carbon_credit_document2.docx


## Using the SBERT model

In [8]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document

class SBERTCarbonCreditDocGenerator:
    """Generate a carbon-credit document (.docx) by SBERT retrieval only (no text generation).

    The submission form is parsed into a flat ``{field: value}`` dict. For each
    section of the fixed document structure, the form fields whose names are
    semantically close to the section title and the most similar knowledge-base
    sentences are listed as bullet points.
    """

    def __init__(self):
        """Load the SBERT encoder, the knowledge base and the document structure."""
        self.sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.knowledge_base = self.load_knowledge_base()
        self.document_structure = self.load_document_structure()

    def load_knowledge_base(self):
        """Return the built-in list of carbon-credit domain sentences used for retrieval."""
        # This should load your carbon credit domain knowledge
        return [
            "Carbon credits represent the reduction of one metric ton of carbon dioxide emissions.",
            "Afforestation projects involve planting trees in areas where there were none before.",
            "The Verified Carbon Standard (VCS) is a widely recognized certification for carbon credits.",
            "Carbon credit projects must demonstrate additionality, meaning the reductions wouldn't have occurred without the project.",
            "Monitoring, reporting, and verification (MRV) are crucial components of carbon credit projects.",
            "Project developers must provide detailed information about the project location, type, and expected carbon sequestration.",
            "The price of carbon credits can vary based on project type, location, and additional benefits like biodiversity conservation.",
            "Carbon credit projects often have social and environmental co-benefits beyond carbon sequestration.",
            "The crediting period for forestry projects typically ranges from 20 to 100 years.",
            "Leakage, or the displacement of emissions to areas outside the project boundary, must be accounted for in carbon credit calculations.",
        ]

    def load_document_structure(self):
        """Return the ordered list of section titles of the output document."""
        return [
            "Executive Summary",
            "Certificate Identification",
            "Emission Reduction Details",
            "Project Information",
            "Verification and Certification",
            "Issuance and Expiration Dates",
            "Market Type",
            "Transferability Information",
            "Legal Framework",
            "Accountability Measures",
            "Contact Information"
        ]

    @staticmethod
    def _is_section_heading(line):
        """Return True for numbered headings such as ``"3. Carbon Credit Specifications"``."""
        number, dot, rest = line.partition('.')
        return bool(dot) and number.isdigit() and rest[:1].isspace()

    def _knowledge_embeddings(self):
        """Return embeddings of the knowledge base, encoding it only when it has changed."""
        snapshot = tuple(self.knowledge_base)
        cached = getattr(self, "_kb_cache", None)
        if cached is None or cached[0] != snapshot:
            cached = (snapshot, self.sbert_model.encode(self.knowledge_base))
            self._kb_cache = cached
        return cached[1]

    def process_input_data(self, input_text):
        """Parse the form into a flat ``{field: value}`` dict.

        Lines containing a colon define a field (split at the first colon).
        Lines without a colon continue the value of the preceding field, so
        multi-line entries such as "Executive Summary:" keep their text instead
        of ending up empty. A numbered section heading ends the current field;
        colon-less lines that follow a heading, or precede the first field, are
        ignored.

        Args:
            input_text: Raw form text.

        Returns:
            dict mapping field names to their (possibly multi-line, space-joined) values.
        """
        data = {}
        last_key = None  # field that continuation lines belong to
        for line in input_text.split('\n'):
            if ':' in line:
                key, value = line.split(':', 1)
                last_key = key.strip()
                data[last_key] = value.strip()
                continue
            text = line.strip()
            if not text:
                continue
            if self._is_section_heading(text):
                # A heading must not be glued onto the previous field's value.
                last_key = None
            elif last_key is not None:
                data[last_key] = f"{data[last_key]} {text}".strip()
        return data

    def retrieve_relevant_knowledge(self, query, top_k=3):
        """Return up to ``top_k`` knowledge-base sentences most similar to ``query``.

        Similarity is the cosine similarity between SBERT embeddings; results
        are ordered from most to least similar. Returns an empty list when
        ``top_k`` is not positive or the knowledge base is empty.
        """
        # Guard explicitly: a slice of [-0:] would return the whole knowledge base.
        if top_k <= 0 or not self.knowledge_base:
            return []

        query_embedding = self.sbert_model.encode([query])[0]
        knowledge_embeddings = self._knowledge_embeddings()

        similarities = cosine_similarity([query_embedding], knowledge_embeddings)[0]
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        return [self.knowledge_base[i] for i in top_indices]

    def generate_section_content(self, section_title, input_data):
        """Return the bullet-list text of one section.

        The text lists the form fields relevant to ``section_title`` followed by
        the retrieved knowledge-base sentences.
        """
        relevant_input = self.get_relevant_input(section_title, input_data)
        relevant_knowledge = self.retrieve_relevant_knowledge(section_title)

        content = f"{section_title}:\n\n"
        content += "Project-specific information:\n"
        for key, value in relevant_input.items():
            content += f"- {key}: {value}\n"

        content += "\nRelevant domain knowledge:\n"
        for knowledge in relevant_knowledge:
            content += f"- {knowledge}\n"

        return content

    def get_relevant_input(self, section_title, input_data, threshold=0.3):
        """Select the form fields whose names are semantically close to the section title.

        Args:
            section_title: Title of the section being written.
            input_data: Flat ``{field: value}`` dict.
            threshold: Minimum cosine similarity (exclusive) between the section
                title and a field name for the field to be kept.

        Returns:
            dict with the selected fields, in their original order; empty when
            ``input_data`` is empty.
        """
        if not input_data:
            # Nothing to encode or compare.
            return {}

        keys = list(input_data.keys())
        section_embedding = self.sbert_model.encode([section_title])[0]
        input_embeddings = self.sbert_model.encode(keys)

        similarities = cosine_similarity([section_embedding], input_embeddings)[0]

        return {key: input_data[key] for key, score in zip(keys, similarities) if score > threshold}

    def create_document(self, input_text):
        """Build and return the python-docx ``Document`` for ``input_text``.

        One level-1 heading plus one paragraph is added per entry of
        ``self.document_structure``.
        """
        doc = Document()
        doc.add_heading('Carbon Credit Project Document', 0)

        input_data = self.process_input_data(input_text)

        for section in self.document_structure:
            doc.add_heading(section, level=1)
            content = self.generate_section_content(section, input_data)
            doc.add_paragraph(content)

        return doc

    def generate_document(self, input_text, output_path):
        """Create the document for ``input_text``, save it to ``output_path`` and report it."""
        doc = self.create_document(input_text)
        doc.save(output_path)
        print(f"Carbon credit document generated and saved to {output_path}")

# Usage: build the SBERT-only generator and run it on the sample submission form.
# Note: this rebinds `generator` and `input_text`, replacing any objects bound
# to those names by a previously executed cell in the same kernel.
generator = SBERTCarbonCreditDocGenerator()
# Sample form: numbered section headings followed by "key: value" lines.
input_text = """
Carbon Credit Project Submission Form
1. Project Overview
Project Title: Green Future Reforestation Initiative
Executive Summary:
The Green Future Reforestation Initiative aims to restore 500 hectares of degraded land through afforestation and sustainable land management practices. The project seeks to sequester carbon dioxide, enhance biodiversity, and improve local air quality while providing economic opportunities for surrounding communities.
Location:
Country: Brazil
Region: Amazon Rainforest
Coordinates: Latitude -3.4653, Longitude -62.2159
Project Category: Afforestation
Project Start Date: 01/15/2024
Expected Completion Date: 12/31/2028
2. Seller/Proponent Information
Organization Name: EcoSustainability Inc.
Contact Person: Maria Silva
Job Title: Project Manager
Email Address: maria.silva@ecosustainability.org
Phone Number: +55 11 98765-4321
Mailing Address:
EcoSustainability Inc.
Avenida Paulista, 1000
São Paulo, SP, Brazil
Zip Code: 01310-100
3. Carbon Credit Specifications
Expected Carbon Credits: 25,000 tonnes of CO₂ equivalent annually
Carbon Credit Standard: Verified Carbon Standard (VCS)
Carbon Credit Methodology: VCS Methodology VM0017 (Afforestation and Reforestation)
Verification Status: Not yet verified; expected verification by Q2 of 2025.
4. Financial & Pricing Information
Total Project Budget: $2,500,000
Sources of Funding:
Internal Funding: $1,000,000
External Investors: $1,200,000
Grant Support: $300,000 (from the Green Climate Fund)
Carbon Credit Pricing: $15 per tonne of CO₂ equivalent
5. Project Impact and Sustainability
Environmental Impact:
The project is expected to sequester approximately 100,000 tonnes of CO₂ over its lifetime while restoring native habitats and increasing biodiversity by planting over 200,000 indigenous trees.
Community and Social Benefits:
The initiative will create around 150 jobs in tree planting and maintenance, provide training in sustainable agriculture to local farmers, and enhance community infrastructure through improved access roads.
Sustainability and Long-term Goals:
The project will establish a community stewardship program ensuring ongoing maintenance and monitoring of the forest. Plans for future expansion include additional reforestation sites and eco-tourism initiatives.
6. Risks & Mitigation Strategies
Risk Assessment:
Key risks include deforestation due to illegal logging, climate change impacts on growth rates, and financial instability.
Mitigation Strategies:
Implementing strict monitoring protocols, engaging local communities in protection efforts, securing insurance against natural disasters, and diversifying funding sources.
7. Supporting Documentation
Project Plan: Attached (PDF)
Environmental Impact Assessment (EIA): Attached (PDF)
Verification Reports: Not applicable at this stage.
Financial Projections: Attached (PDF)
Additional Certifications or Approvals: Attached (PDF)
8. Declarations and Acknowledgements
By signing below, I certify that the information provided is accurate and complete to the best of my knowledge. I acknowledge that any false or misleading information may result in the disqualification of this submission.
Name of Authorized Signatory: Carlos Mendes
Signature: [Digital Signature]
Date: 10/06/2024
Submission Guidelines:
Review and Complete: Ensure all sections are fully completed and accurate.
Attach Documents: Attach all required supporting documentation in the appropriate format (PDF preferred).
Submit: Submit the completed form and attachments via email to submissions@ecosustainability.org or through the Online Submission Portal.
"""
# Parse the form, assemble every section and save the .docx to the given path.
generator.generate_document(input_text, '/kaggle/working/carbon_credit_document_sbert2.docx')



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Carbon credit document generated and saved to /kaggle/working/carbon_credit_document_sbert2.docx
