In [1]:
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_anthropic import ChatAnthropic
from environs import Env
from langsmith import Client

# --- Notebook setup: environment, LangSmith client, model and prompt chain ---
env = Env()
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API keys that Client() and
# ChatAnthropic() read below - confirm against your .env file.
load_dotenv()
client = Client()
# temperature=0 is passed to the model constructor for every call made through `chain`.
chat_model = ChatAnthropic(model='claude-3-haiku-20240307', temperature=0)
# Read the prompt template inside a context manager so the file handle is
# closed right away instead of lingering for the lifetime of the kernel.
with open('dataset_prompt.txt', 'r', encoding='utf-8') as prompt_file:
    prompt_template = prompt_file.read()
# The template is declared with a single "input" variable.
prompt = PromptTemplate(input_variables=["input"], template=prompt_template)
# Pipe the formatted prompt into the chat model.
chain = prompt | chat_model

In [1]:
import pandas as pd
# Load the JSON Lines dataset (one JSON document per line) into a DataFrame.
# The processing loop in a later cell reads a 'messages' entry from each row.
jsonObj = pd.read_json('final_dataset.jsonl', lines=True)

In [3]:
# Replacement text written into every 'system' message. The {placeholders}
# are stored verbatim; nothing in this cell formats them.
system_prompt = (
    "You are Sales Assistant who answer the questions \n"
    "about flat features. For complete information take users phone number to contact with operators.\n"
    "Be concise, as detailed as possible, but don't make up any information. send information in an ordered format.\n"
    "If you don't know an answer just say don't know and send admins phone {builder_phone} to contact.\n"
    "All answers should be in Uzbek.\n"
    "\n"
    "Here is additional questions with answers:\n"
    "Do you have apartments for mortgage? - {mortgage}\n"
    "Do you have ready apartments - {readyflat}\n"
    "Do you have cadastre-registered apartments - {cadastre}\n"
    "Can we buy an apartment with a subsidy - {subsidy}\n"
)

# Walk the DataFrame row by row, normalising roles and rewriting contents.
# Message dicts are modified in place, then the filtered list is stored back.
for row_idx, record in jsonObj.iterrows():
    kept_messages = []
    for msg in record['messages']:
        # Rename the 'aichunk' role to 'assistant'.
        if msg['role'] == 'aichunk':
            msg['role'] = 'assistant'
        role = msg['role']

        # Every system message gets the canonical prompt text.
        if role == 'system':
            msg['content'] = system_prompt

        # Drop function messages and messages that carry no 'content' key.
        if role == 'function' or 'content' not in msg:
            continue

        # Non-system contents are replaced by the chain's response text.
        # NOTE(review): what the chain does to the text depends on
        # dataset_prompt.txt, which is not visible here.
        if role != 'system':
            msg['content'] = chain.invoke(msg['content']).content
        kept_messages.append(msg)

    # Store the filtered/rewritten list back on this row.
    jsonObj.at[row_idx, 'messages'] = kept_messages

In [4]:
# Write the DataFrame back out as JSON Lines: orient='records' with lines=True
# emits one JSON object per row, one row per line.
jsonObj.to_json('updated_dataset_2part.jsonl', lines=True, orient='records')

In [58]:
import json

def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank/whitespace-only lines (e.g. a trailing empty line),
        # which json.loads would otherwise reject.
        return [json.loads(line) for line in file if line.strip()]

def save_jsonl(data, file_path):
    """Write each item of ``data`` to ``file_path`` as one JSON document per line.

    Args:
        data: Iterable of JSON-serialisable entries.
        file_path: Destination path; the file is opened in 'w' mode, so any
            existing content is replaced.
    """
    with open(file_path, 'w') as sink:
        # Each record is serialised with json.dumps and newline-terminated.
        sink.writelines(json.dumps(record) + '\n' for record in data)

def split_messages(messages, max_length):
    """Split a conversation into chunks that each end with an assistant message.

    Messages are accumulated in order. A chunk is closed when adding the next
    message would push its serialized size past ``max_length`` AND the chunk
    currently ends with an 'assistant' message. If the size limit is hit while
    the chunk ends with any other role, the chunk keeps growing until the next
    assistant message, so chunks may exceed ``max_length``.

    Args:
        messages: List of message dicts, each with a 'role' key.
        max_length: Soft limit on the summed ``len(json.dumps(message))`` of
            the messages in one chunk.

    Returns:
        A list of chunks (lists of the original message dicts). Every chunk
        ends with an 'assistant' message. Trailing messages after the last
        assistant message of the conversation are left out. The input list
        itself is not modified.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for message in messages:
        # Size of the message when serialized to a JSON string.
        message_length = len(json.dumps(message))
        would_overflow = current_length + message_length > max_length
        # Only cut on an assistant boundary so no chunk ends mid-exchange.
        if current_chunk and would_overflow and current_chunk[-1]['role'] == 'assistant':
            chunks.append(current_chunk)
            current_chunk = []
            current_length = 0

        current_chunk.append(message)
        current_length += message_length

    # Trim dangling non-assistant messages from the final chunk instead of
    # throwing the whole chunk away: the complete exchanges before them are
    # still valid data. `current_chunk` is a list built here, so popping from
    # it does not touch the caller's `messages`.
    while current_chunk and current_chunk[-1]['role'] != 'assistant':
        current_chunk.pop()
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def ensure_boundaries(chunks):
    """Validate that every chunk opens and closes with the expected roles.

    Args:
        chunks: List of dicts, each holding a non-empty list under 'messages'.

    Returns:
        The same ``chunks`` object, unchanged, when all checks pass.

    Raises:
        ValueError: If a chunk's first message is not 'system'/'user', or its
            last message is not 'assistant'.
    """
    for record in chunks:
        turns = record['messages']

        # The opening role is checked first; its error takes precedence.
        opener = turns[0]['role']
        if not (opener == 'system' or opener == 'user'):
            raise ValueError("Chunk does not start with 'system' or 'user'")

        closer = turns[-1]['role']
        if closer != 'assistant':
            raise ValueError("Chunk does not end with 'assistant'")
    return chunks

def main(input_file, output_prefix, max_length):
    """Chunk every conversation in ``input_file`` and write the chunks out.

    Args:
        input_file: Path of the source JSON Lines file; each record must have
            a 'messages' list.
        output_prefix: Path the chunked JSON Lines file is written to.
        max_length: Soft size limit per chunk, passed to ``split_messages``.

    Raises:
        ValueError: Propagated from ``ensure_boundaries`` when a chunk has
            invalid start/end roles; nothing is written in that case.
    """
    data = load_jsonl(input_file)

    # One output record per chunk, in source order.
    chunked_data = [
        {'messages': chunk}
        for entry in data
        for chunk in split_messages(entry['messages'], max_length)
    ]

    chunked_data = ensure_boundaries(chunked_data)

    # Write to the path given by the caller rather than relying on a
    # module-level `output_file` global happening to exist.
    save_jsonl(chunked_data, output_prefix)
    print(f"Saved all chunks to {output_prefix}")

# Run the chunking pipeline: read conversations from `input_file`, split them,
# and write the result to `output_file`.
input_file = 'final_dataset.jsonl'
output_file = 'output.jsonl'
max_length = 2048  # Per-chunk size limit handed to main(); adjust as required

main(input_file, output_file, max_length)

Saved all chunks to output.jsonl


In [63]:
import pandas as pd
# Load the chunked JSON Lines output for inspection.
# NOTE: this rebinds `jsonObj`, a name also used for the source dataset in an earlier cell.
jsonObj = pd.read_json('output.jsonl', lines=True)

In [64]:
# Display the chunked frame; a bare last-line expression is rendered as the cell output.
jsonObj

Unnamed: 0,messages
0,"[{'role': 'system', 'content': 'You are Sales ..."
1,"[{'role': 'system', 'content': 'You are Sales ..."
2,"[{'role': 'user', 'content': 'Podval uyga qo's..."
3,"[{'role': 'user', 'content': 'art house firmas..."
4,"[{'role': 'user', 'content': '{rooms} xonali e..."
...,...
1058,"[{'role': 'user', 'content': 'Boshlang'ich to'..."
1059,"[{'role': 'system', 'content': 'You are Sales ..."
1060,"[{'role': 'system', 'content': 'You are Sales ..."
1061,"[{'role': 'system', 'content': 'You are Sales ..."


In [61]:
import json

def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank/whitespace-only lines (e.g. a trailing empty line),
        # which json.loads would otherwise reject.
        return [json.loads(line) for line in file if line.strip()]

def check_boundaries(data):
    """Return the indices of records whose message list has invalid boundaries.

    A record is invalid when its 'messages' list is missing or empty, when the
    first message's role is neither 'system' nor 'user', or when the last
    message's role is not 'assistant'.

    Args:
        data: Sequence of dict records.

    Returns:
        List of integer positions (in ``data`` order) of the invalid records.
    """
    def _has_valid_boundaries(turns):
        # Missing/empty conversations are invalid by definition.
        if not turns:
            return False
        return turns[0]['role'] in ('system', 'user') and turns[-1]['role'] == 'assistant'

    return [
        position
        for position, record in enumerate(data)
        if not _has_valid_boundaries(record.get('messages', []))
    ]

def main(input_file):
    """Load ``input_file`` and print a summary of boundary violations.

    Prints the total number of records, how many have invalid start/end
    roles, and the indices of those records.

    Args:
        input_file: Path of the JSON Lines file to validate.
    """
    records = load_jsonl(input_file)
    offenders = check_boundaries(records)

    # One print call, one report line per argument.
    print(
        f"Total rows: {len(records)}",
        f"Rows with invalid boundaries: {len(offenders)}",
        f"Invalid row indices: {offenders}",
        sep='\n',
    )

# Validate the chunked file: every record should start with 'system'/'user'
# and end with 'assistant'.
input_file = 'output.jsonl'

main(input_file)


Total rows: 1063
Rows with invalid boundaries: 0
Invalid row indices: []
