Merge the CSV rows into a single text file, grouped by serial number

In [None]:
import pandas as pd
import re

# Load the CSV file. Malformed rows are skipped (on_bad_lines='skip') and
# undecodable bytes are dropped (encoding_errors='ignore').
csv_file_path = 'SNET_Digitization - 1-100.csv'

try:
    df = pd.read_csv(csv_file_path, encoding_errors='ignore', on_bad_lines='skip')
except pd.errors.ParserError as e:
    print(f"Error reading the CSV file: {e}")
    # Stop with a non-zero status. `exit()` is an interactive helper added
    # by the `site` module and is not guaranteed to exist in every runtime.
    raise SystemExit(1)

# Specify the columns to extract
document_id_column = 'Serial Number'
article_text_column = 'Article Text'

# Code points stripped by clean_text: U+0000-U+001F and U+007F-U+009F.
# Mapping a code point to None makes str.translate delete it.
_NON_PRINTABLE = dict.fromkeys([*range(0x00, 0x20), *range(0x7F, 0xA0)])


def clean_text(text):
    """Return ``str(text)`` with the control characters above removed.

    Any value is accepted; it is converted with ``str()`` first, so the
    result is always a string.
    """
    as_string = str(text)
    return as_string.translate(_NON_PRINTABLE)

# Clean the 'Article Text' column. na_action='ignore' leaves missing values
# as NaN instead of passing them to clean_text, which would turn them into
# the literal string "nan" and defeat the dropna() below.
df[article_text_column] = df[article_text_column].map(clean_text, na_action='ignore')

# Initialize an output file
output_file_path = 'output_climate_finance_serialno_SNETs_1-100.txt'

# Group by serial number and write one header line plus one text line per group
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for document_id, group in df.groupby(document_id_column):
        output_file.write(f'Serial_Number: {document_id}\n')
        # Merge all non-missing text for that serial number and write it
        merged_text = ' '.join(group[article_text_column].dropna().astype(str))
        output_file.write(f'{merged_text}\n\n')

print(f"Output written to {output_file_path}")

Split the merged text file into multiple smaller chunk files

In [None]:
import os

def read_file(file_path, encoding='utf-8'):
    """Return every line of *file_path* as a list, line endings included.

    The file is opened in text mode and decoded with *encoding*.
    """
    with open(file_path, mode='r', encoding=encoding) as handle:
        lines = list(handle)
    return lines

def split_content_by_serial_number(content, max_words=2000):
    """Group lines into chunks, one or more per ``Serial_Number: `` header.

    A new chunk starts at every line containing ``"Serial_Number: "``. A chunk
    is also closed as soon as its running word count exceeds *max_words*; the
    lines that follow go into a continuation chunk, which does not repeat the
    header.

    Args:
        content: Iterable of text lines (line endings kept as-is).
        max_words: Word-count threshold above which the current chunk is
            closed. Defaults to 2000.

    Returns:
        A list of chunks, each a list of the original lines. Chunks that
        would contain only blank lines are not emitted.
    """
    chunks = []
    current_chunk = []
    current_chunk_word_count = 0

    def flush():
        """Close the current chunk, keeping it only if it has real text."""
        nonlocal current_chunk, current_chunk_word_count
        # After an overflow flush, the blank separator line that follows an
        # article would otherwise become a chunk of its own.
        if any(entry.strip() for entry in current_chunk):
            chunks.append(current_chunk)
        current_chunk = []
        current_chunk_word_count = 0

    for line in content:
        if "Serial_Number: " in line:
            flush()

        # Header lines count toward the limit like any other line.
        current_chunk.append(line)
        current_chunk_word_count += len(line.split())

        if current_chunk_word_count > max_words:
            flush()

    flush()
    return chunks

def save_chunks_to_files(chunks, output_folder):
    """Write each chunk to ``<output_folder>/chunk_<n>.txt`` (n starts at 1).

    Args:
        chunks: Sequence of chunks, each an iterable of strings that are
            written back to back without added separators.
        output_folder: Destination directory; created (with parents) when it
            does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for index, chunk in enumerate(chunks, start=1):
        chunk_filename = os.path.join(output_folder, f'chunk_{index}.txt')
        with open(chunk_filename, 'w', encoding='utf-8') as chunk_file:
            chunk_file.writelines(chunk)

def process_large_text_file(input_file, output_folder):
    """Split *input_file* into chunk files written under *output_folder*.

    The file's lines are read with ``read_file``, grouped by
    ``split_content_by_serial_number`` and handed to ``save_chunks_to_files``.
    """
    save_chunks_to_files(
        split_content_by_serial_number(read_file(input_file)),
        output_folder,
    )


if __name__ == '__main__':
    # Text file to split, and the folder that receives the chunk files.
    input_file = 'output_climate_finance_serialno_SNETs_1-100.txt'
    output_folder = 'output_chunks_folder_climate_finance_serialno_SNETs_1-100'
    process_large_text_file(input_file=input_file, output_folder=output_folder)

Generation

In [6]:
import os
import pickle
from abc import ABC, abstractmethod, abstractproperty
from dotenv import load_dotenv
import boto3
import faiss
from typing import Optional
from langchain.llms.bedrock import Bedrock
from langchain_community.chat_models import BedrockChat
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.conversational_retrieval.base import BaseConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import FAISS

from boto3 import client
from botocore.config import Config

# botocore client configuration with a long read timeout; the value is passed
# straight to botocore's Config (units per the botocore docs — TODO confirm).
config = Config(read_timeout=1000)

# Load environment variables from .env file
load_dotenv()

# Access the environment variables (None when a variable is not set)
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# Create the Bedrock client
# NOTE(review): this client is not referenced by the code visible in this
# file; the chat model below is constructed without it. Also verify that the
# "bedrock" service name is the one intended here — model invocation
# presumably goes through "bedrock-runtime".
BEDROCK_CLIENT = boto3.client(
    "bedrock",
    region_name='us-east-1',
    config=config,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

class BaseConversation(ABC):
    """Base class for building a retrieval conversation chain over a FAISS store.

    Subclasses supply the embedding model and the chain construction; this
    class handles saving the vector store to, and loading it from, the
    working directory (``docs.index`` and ``faiss_store.pkl``).
    """

    # `abstractproperty` is deprecated since Python 3.3; stacking `property`
    # on `abstractmethod` is the supported spelling.
    @property
    @abstractmethod
    def default_model(self) -> str:
        """Name of the model the implementation considers its default."""

    @property
    @abstractmethod
    def embeddings(self):
        """Embedding object handed to ``FAISS.from_texts``."""

    @abstractmethod
    def get_conversation_chain(
        self,
        store: FAISS
    ) -> BaseConversationalRetrievalChain:
        """Build a conversation chain that retrieves from *store*."""

    def create_store(self, texts: list):
        """Create a vector store from the provided texts and save it to disk.

        The FAISS index is written to ``docs.index``; the rest of the store
        is pickled to ``faiss_store.pkl`` with its index detached.
        """
        store: FAISS = FAISS.from_texts(
            texts=texts,
            embedding=self.embeddings
        )
        faiss.write_index(store.index, "docs.index")
        # The index was saved separately above, so drop it before pickling.
        store.index = None
        with open("faiss_store.pkl", "wb") as f:
            pickle.dump(store, f)

    def get_chain(self) -> BaseConversationalRetrievalChain:
        """Create a conversation chain from the stored vector store.

        Raises:
            FileNotFoundError: If ``docs.index`` or ``faiss_store.pkl`` is
                missing from the working directory.
        """
        if not (os.path.exists("docs.index") and os.path.exists("faiss_store.pkl")):
            raise FileNotFoundError("No vector store found.")

        index = faiss.read_index("docs.index")
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load a faiss_store.pkl that this program wrote itself.
        with open("faiss_store.pkl", "rb") as f:
            store = pickle.load(f)

        store.index = index
        return self.get_conversation_chain(store=store)

class HFConversation(BaseConversation):
    """Conversation that embeds with HuggingFace and chats through Bedrock."""

    def __init__(self, model_name: Optional[str] = None) -> None:
        """Store the optional embedding model name; embeddings load lazily.

        Args:
            model_name: Embedding model to load. When omitted, the
                ``HuggingFaceEmbeddings`` library default is used.
        """
        self.model_name = model_name
        self._embeddings: Optional[Embeddings] = None

    @property
    def default_model(self) -> str:
        # NOTE(review): this name is not used when building the embeddings;
        # wiring it in would change the vectors of already-saved stores.
        return 'hkunlp/instructor-large'

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding object, creating it on first access."""
        if self._embeddings is None:
            if self.model_name:
                # Honor the constructor argument, which was previously ignored.
                self._embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
            else:
                self._embeddings = HuggingFaceEmbeddings()
        return self._embeddings

    def get_conversation_chain(self, store: FAISS) -> BaseConversationalRetrievalChain:
        """Create a conversation chain from the provided vector store."""
        llm = BedrockChat(
            model_id="mistral.mistral-large-2402-v1:0",
            region_name='us-east-1',
            model_kwargs={"temperature": 0.3, "max_tokens": 4000},
        )
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        return ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=store.as_retriever(),
            memory=memory,
        )

def extract_information_from_chunk(chunk, conversation_chain):
    """Ask the conversation chain whether the articles in *chunk* use disaster data.

    The chunk text is placed inside the prompt's ``<article>`` tags. It was
    previously appended after the prompt, while the tags held an unfilled
    ``{ARTICLE_TEXT}`` placeholder.

    Args:
        chunk: Text of one chunk file (``Serial_Number:`` header plus article).
        conversation_chain: Object exposing ``run(str)``; called once with the
            complete prompt.

    Returns:
        Whatever ``conversation_chain.run`` returns.
    """
    prompt = f"""You are an AI assistant tasked with identifying articles that mention data usage in disaster contexts. Your goal is to determine whether a given article discusses the collection, analysis, or application of data in relation to disasters. Follow these instructions carefully:

1. Understanding Data in Disasters:
"Data in Disasters" refers to the various types of information collected, analyzed, and utilized before, during, and after disaster events. Key aspects include:

a) Types of Data:
- Hazard Data (e.g., frequency, intensity, location of disasters)
- Exposure Data (populations and assets in disaster-prone areas)
- Vulnerability Data (social, economic, environmental factors)
- Impact Data (casualties, economic losses, damage assessments)
- Response and Recovery Data (relief operations, recovery efforts)

b) Data Sources:
- Remote Sensing (satellite images, aerial photos, GIS data)
- Crowdsourced Data (social media, mobile apps, local reporting)
- Field Surveys (ground assessments by response teams)
- Statistical Data (censuses, surveys, economic reports)

c) Indicators and Metrics:
- Mortality rates
- Economic losses
- Affected population numbers
- Disaster risk indexes

d) Data Applications:
- Early Warning Systems
- Big Data and AI in disaster response
- Humanitarian data management
- Climate and environmental monitoring
- Geospatial mapping and analysis

e) Open Data Repositories:
- UN OCHA's Humanitarian Data Exchange (HDX)
- World Bank Open Data
- EM-DAT (Emergency Events Database)
- NASA Earth Data

2. Article Analysis:
Carefully read the provided article text:

<article>
{chunk}
</article>

3. As you read, consider the following questions:
- Does the article mention any specific types of data used in disaster contexts?
- Are there examples of data collection, analysis, or application in disaster management?
- Does the content discuss the role of data in disaster prevention, preparedness, response, or recovery?
- Are there mentions of data sources, indicators, or metrics related to disasters?
- Does the article describe any challenges or innovations in disaster-related data usage?
- Is there discussion of open data repositories or data sharing in disaster contexts?

4. Make a Decision and Provide Output:
Based on your analysis, determine whether the article mentions data usage in disaster contexts.


If the article is relevant, output the following:

YOUT OUTPUT MUST BE IN THE BELOW FORMAT

Serial Number : 1
Decision : Yes

Serial Number : 2
Decision : No

STRICTLY dont give any other filling text or reason for your decision ,output must only contain "["Yes","<Document ID>" "<Serial_Number>"]"

ALL ARTICLES MUST BE COVERED, NOT ONE ARTICLE SHOULD BE ABSENT IN THE OUTPUT.

"""
    # The article is already embedded above, so the prompt is the full input.
    response = conversation_chain.run(prompt)
    return response

def main():
    """Run every chunk file through the LLM and collect the replies in one file.

    For each ``.txt`` file in the chunk folder, a fresh conversation chain is
    built, the chunk is classified, and the reply is appended to the output
    file, followed by a separator line. The chunk's serial number is written
    before the reply the first time that number is seen.
    """
    # Initialize conversation
    conversation = HFConversation()

    # Check if vector store exists, if not create it
    if not os.path.exists("docs.index") or not os.path.exists("faiss_store.pkl"):
        # Assuming you have the texts to create the store
        texts = ["Your initial texts for creating the FAISS store go here"]
        conversation.create_store(texts)

    # Folder containing file chunks
    # NOTE(review): the chunking step in this file writes to a differently
    # named folder; confirm 'file_chunks' is the intended input.
    folder_path = 'file_chunks'
    output_file = 'output.txt'

    processed_serial_numbers = set()

    # os.listdir returns names in arbitrary order. Sorting by (length, name)
    # makes runs reproducible and puts chunk_2.txt before chunk_10.txt.
    chunk_names = sorted(
        (name for name in os.listdir(folder_path) if name.endswith('.txt')),
        key=lambda name: (len(name), name),
    )

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for filename in chunk_names:
            file_path = os.path.join(folder_path, filename)

            # Only the read can raise the decoding error handled here, so the
            # try covers nothing else.
            try:
                with open(file_path, 'r', encoding='utf-8') as infile:
                    chunk = infile.read()
            except UnicodeDecodeError:
                print(f"Error reading {file_path}. Skipping this file due to encoding issues.")
                continue

            # Reinitialize conversation chain for each chunk to clear context
            conversation_chain = conversation.get_chain()

            response = extract_information_from_chunk(chunk, conversation_chain)

            # Extract the serial number from the chunk
            if 'Serial_Number: ' in chunk:
                serial_number = chunk.split('Serial_Number: ')[1].split()[0]
            else:
                serial_number = "Not Mentioned"

            # Write the serial number only if it hasn't been processed
            if serial_number not in processed_serial_numbers:
                outfile.write(f"Serial_Number: {serial_number}\n")
                processed_serial_numbers.add(serial_number)

            # Write the response, skipping blank lines and any echoed
            # "Serial_Number: " header lines
            for line in response.split('\n'):
                if line.strip() and "Serial_Number: " not in line:
                    outfile.write(line.strip() + "\n")

            outfile.write("-" * 40 + "\n")  # Add separator line


if __name__ == '__main__':
    main()

Convert the LLM output (txt) into a CSV with two columns: Serial Number and Decision

In [None]:
import csv
import re

# Patterns for the two reply formats, compiled once at import time.
_SERIAL_NUMBER_RE = re.compile(r"Serial Number\s*:\s*(\d+|Not Mentioned)")
_DECISION_RE = re.compile(r"Decision\s*:\s*(Yes|No)")
_BRACKET_OUTPUT_RE = re.compile(r'\["(Yes|No)",,?"(\d+)"\]')


# Function to process each line and extract Serial Number and Decision
def extract_data_from_line(line):
    """Extract the serial number and/or decision found in one output line.

    Two formats are recognised:

    * ``Serial Number : <digits|Not Mentioned>`` and ``Decision : <Yes|No>``.
      They may share a line; both are then returned.
    * ``<output>["Yes","12"]`` — tried only when the first format yields
      nothing.

    Args:
        line: One line of LLM output.

    Returns:
        A ``(serial_number, decision)`` tuple of strings; an element is
        ``None`` when it was not found.
    """
    serial_number = None
    decision = None

    # First format: the two fields are looked up independently, so a line
    # holding both no longer loses its decision.
    serial_match = _SERIAL_NUMBER_RE.search(line)
    if serial_match:
        serial_number = serial_match.group(1)

    decision_match = _DECISION_RE.search(line)
    if decision_match:
        decision = decision_match.group(1)

    # Second format
    if serial_number is None and decision is None and "<output>" in line:
        bracket_match = _BRACKET_OUTPUT_RE.search(line)
        if bracket_match:
            decision = bracket_match.group(1)
            serial_number = bracket_match.group(2)

    return serial_number, decision

# Function to convert text file to CSV
def convert_txt_to_csv(txt_file_path, csv_file_path):
    """Convert the LLM output text file into a two-column CSV.

    Lines are parsed with ``extract_data_from_line``. A row is written as
    soon as both a serial number and a decision are pending, and only the
    first row per serial number is kept. A line consisting solely of dashes
    (the separator written between chunks) discards whatever is pending, so
    an unmatched value from one chunk cannot pair with the next chunk's.

    Args:
        txt_file_path: Path of the text file to read (UTF-8).
        csv_file_path: Path of the CSV file to create or overwrite.
    """
    serial_number = None
    decision = None
    processed_serial_numbers = set()  # Serial numbers already written

    with open(txt_file_path, 'r', encoding='utf-8') as txt_file, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["Serial Number", "Decision"])

        for raw_line in txt_file:
            line = raw_line.strip()
            if not line:
                continue

            # Separator between chunks: drop any half-finished pair.
            if not line.strip("-"):
                serial_number = None
                decision = None
                continue

            sn, dec = extract_data_from_line(line)

            if sn is not None:
                serial_number = sn
            if dec is not None:
                decision = dec

            # Write to CSV when both Serial Number and Decision are found
            if serial_number and decision:
                if serial_number not in processed_serial_numbers:
                    csv_writer.writerow([serial_number, decision])
                    processed_serial_numbers.add(serial_number)
                serial_number = None
                decision = None


if __name__ == '__main__':
    input_txt_file = 'output.txt'  # Replace with your text file path
    output_csv_file = 'SNETs_half_5.csv'  # Replace with your desired CSV file path
    convert_txt_to_csv(input_txt_file, output_csv_file)
