In [None]:
import sys
import pandas as pd
import random
import os
from sklearn.metrics.pairwise import cosine_similarity
from pydantic import BaseModel
from typing import List
import numpy as np
! pip show anthropic
! pip show openai

In [None]:
import anthropic
import openai
from openai import OpenAI
import os

# Placeholder credentials. Prefer exporting ANTHROPIC_API_KEY / OPENAI_API_KEY
# in the environment over pasting real keys into a notebook that may be shared.
ANTHROPIC_API_KEY = "API_KEY"
OPENAI_API_KEY = "API_KEY"

# Set your own IP to allow access
proxy_url = 'http://----'
proxy_port = 'xxxx'

os.environ['http_proxy'] = f'{proxy_url}:{proxy_port}'
os.environ['https_proxy'] = f'{proxy_url}:{proxy_port}'


def _resolve_api_key(inline_value, env_name, placeholder="API_KEY"):
    """Pick the API key to use for a client.

    A key that was edited in the notebook (i.e. is no longer the placeholder)
    wins, so existing setups keep working. Otherwise the environment variable
    `env_name` is used; if that is unset too, the placeholder is returned
    unchanged, which matches what the clients were given before.
    """
    if inline_value and inline_value != placeholder:
        return inline_value
    return os.getenv(env_name) or inline_value


# Previously these environment lookups were performed but never passed to the
# clients, so an exported key was silently ignored.
anthropic_api_key = _resolve_api_key(ANTHROPIC_API_KEY, "ANTHROPIC_API_KEY")
openai_api_key = _resolve_api_key(OPENAI_API_KEY, "OPENAI_API_KEY")

# API clients: one for Claude chat, one for embeddings, one for OpenAI chat
client_claude = anthropic.Anthropic(api_key=anthropic_api_key)
client_emb = OpenAI(api_key=openai_api_key)
client = OpenAI(api_key=openai_api_key)

def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for `text` through the module-level `client_emb`.

    Args:
        text: The string to embed; it is sent as a single-element input list.
        model: Name of the embedding model to request.

    Returns:
        The `embedding` attribute of the first entry in the response's `data`.
    """
    response = client_emb.embeddings.create(input=[text], model=model)
    first_entry = response.data[0]
    return first_entry.embedding

# Define chatbot
def _chat_with_claude(model, prompt, system_prompt, max_tokens):
    """Send a single user message to a Claude model and return the reply text.

    Shared implementation behind the three public `chat_with_claude_*`
    wrappers, which differ only in model name and token budget.

    Args:
        model: Model identifier passed to the Messages API.
        prompt: Content of the single user turn.
        system_prompt: Passed as the `system` argument.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The concatenated text of all text blocks in the response; an empty
        string if the response carries no text block.
    """
    response = client_claude.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=0.5,
        system=system_prompt,
        messages=[{"role": "user", "content": prompt}],
    )
    # Joining the text blocks (instead of indexing content[0]) avoids an
    # IndexError when the content list is empty.
    return "".join(
        block.text
        for block in response.content
        if getattr(block, "type", "text") == "text"
    )


def chat_with_claude_sonnet(prompt, system_prompt):
    """Ask claude-3-sonnet-20240229 (max_tokens=4096) and return the reply text."""
    return _chat_with_claude("claude-3-sonnet-20240229", prompt, system_prompt, 4096)


def chat_with_claude_opus(prompt, system_prompt):
    """Ask claude-3-opus-20240229 (max_tokens=4000) and return the reply text."""
    return _chat_with_claude("claude-3-opus-20240229", prompt, system_prompt, 4000)


def chat_with_claude_haiku(prompt, system_prompt):
    """Ask claude-3-haiku-20240307 (max_tokens=4000) and return the reply text."""
    return _chat_with_claude("claude-3-haiku-20240307", prompt, system_prompt, 4000)

def chat_with_openai(prompt, system_prompt):
    """Run one system+user exchange against gpt-3.5-turbo-0125.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message.

    Returns:
        The message content of the first choice in the completion.
    """
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
from openai import OpenAI
import sys

def chat_with_Llama3unsluth(prompt, system_prompt):
    """Stream a reply from the local OpenAI-compatible server and return it.

    Each received text fragment is echoed to stdout as it arrives; a final
    newline is printed once the stream ends.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message.

    Returns:
        The concatenation of all streamed text fragments.
    """
    client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")  # Modify the address provided by Lm-studio

    stream = client.chat.completions.create(
        model="Llama3_8B/unsluth",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        stream=True  # Enable streaming output
    )

    fragments = []
    for chunk in stream:
        # A stream chunk may carry no choices at all; indexing [0] on it
        # would raise IndexError and lose the reply collected so far.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content is not None:
            print(content, end='', flush=True)
            fragments.append(content)

    print()
    return "".join(fragments)

In [None]:
# Split query paragraphs into ab_word
import csv
import json
import time

TARGET_WORDS = 50  # Words per chunk; passed as chunk_size to split_into_overlapping_chunks
OVERLAP_WORDS = 30  # Words shared by consecutive chunks; the window advances by TARGET_WORDS - OVERLAP_WORDS

def count_words(text):
    """Return the number of whitespace-separated tokens in `text`."""
    tokens = text.split()
    return len(tokens)

def split_into_overlapping_chunks(text, chunk_size, overlap):
    """Split `text` into word windows that overlap their neighbours.

    The text is split on whitespace; each chunk holds up to `chunk_size`
    words and starts `chunk_size - overlap` words after the previous one.
    Chunking stops as soon as a chunk reaches the end of the text, so no
    trailing chunk is emitted that is wholly contained in the one before it.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than `chunk_size`.

    Returns:
        A list of chunk strings (words re-joined with single spaces); empty
        if `text` contains no words.

    Raises:
        ValueError: If `chunk_size` is not positive or `overlap` is not
            smaller than `chunk_size` (the window could never advance).
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            f"chunk_size must be positive and greater than overlap "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This chunk already covers the tail; any later window would be
            # a strict subset of it.
            break
    return chunks

# Build ab_word: one record per overlapping chunk of every usable abstract.
ab_word = []
# newline='' lets the csv module do its own line-ending handling, which is
# required for quoted fields that contain line breaks to be parsed correctly.
with open('Path/Artical_inf.csv', 'r', encoding='utf-8', newline='') as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # Skip the header row (no error if the file is empty)
    for row in csv_reader:
        if len(row) < 8:
            continue  # Row is too short to contain the abstract column (index 7)
        journal = row[1].strip()
        pmid = row[4].strip()
        pmcid = row[5].strip()
        link = row[6].strip()
        title = row[3].strip()
        abstract = row[7].strip()
        if len(abstract) < 10:
            continue  # Skip abstracts shorter than 10 characters
        for chunk in split_into_overlapping_chunks(abstract, TARGET_WORDS, OVERLAP_WORDS):
            ab_word.append({
                'text': chunk,
                'metadata': {
                    'journal': journal,
                    'pmid': pmid,
                    'pmcid': pmcid,
                    'link': link
                },
                'document': title
            })

print("The number of ab_word is:", len(ab_word))
print("Number of words in the example paragraph:", [count_words(item['text']) for item in ab_word[:5]])

In [None]:
# Pass the embedded ab_word into the Json file
def get_embedding_with_retry(text, max_retries=3, delay=20):
    """Call `get_embedding(text)`, retrying on any exception.

    Args:
        text: The string to embed.
        max_retries: Total number of attempts to make.
        delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        The embedding returned by `get_embedding`, or None if every attempt
        raised (or if `max_retries` is not positive).
    """
    for attempt in range(max_retries):
        try:
            return get_embedding(text)
        except Exception as e:
            if attempt < max_retries - 1:  # If it is not the last attempt
                # Report the actual delay; the message used to say "20s"
                # regardless of the `delay` argument.
                print(f"Error getting vector: {e}. wait {delay}s...")
                time.sleep(delay)
            else:
                print(f"Failed after {max_retries} attempts. Skipping this item.")
                return None  # or return a default vector
    return None

failed_items = []  # Items whose embedding could not be obtained

# Stream the embedded items into a JSON array one element at a time.
with open('Path/ab_vector.json', 'w') as f:
    f.write('[')  # Beginning JSON array
    first_item = True
    try:
        for i, item in enumerate(ab_word):
            vector = get_embedding_with_retry(item['text'])
            if vector is None:
                failed_items.append(item)  # Remember the item for the failure report
            else:
                item['vector'] = vector  # Add the vector to the dictionary
                if not first_item:
                    f.write(',')  # Add a comma before all elements except the first one
                first_item = False
                json.dump(item, f)  # Append the dictionary to the JSON array
            print(f"Processing Items {i}", end="\r")
    finally:
        # Always terminate the array so that an interrupted run still leaves
        # the items written so far as parseable JSON.
        f.write(']')  # End the JSON array
print("\nVector generation and writing completed")

# Write the failed fields to a separate JSON file
with open('Path/fail_vector.json', 'w') as f:
    json.dump(failed_items, f)

print("The failed fields have been written to the file: Path/fail_vector.json")

In [None]:
# Load the embedded chunks back from disk. Each element written by the
# embedding cell is a dict with 'text', 'metadata', 'document' and 'vector'.
with open('Path/ab_vector.json', 'r') as file:
    sentence_vector = json.load(file)

In [None]:
# Define a function that returns a vector number
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_most_similar(query_vector, embeddings, top_n):
    """Rank `embeddings` by cosine similarity to `query_vector`.

    Args:
        query_vector: A single embedding vector.
        embeddings: A sequence of embedding vectors to compare against.
        top_n: How many of the best matches to return; values of zero or
            less yield empty results.

    Returns:
        A tuple `(indices, scores)`: the indices into `embeddings` of the
        `top_n` most similar vectors, most similar first, and their
        similarity scores in the same order.
    """
    similarities = cosine_similarity([query_vector], embeddings)[0]
    # Reverse first, then truncate. Slicing the ascending order with
    # [-top_n:] returns *every* index when top_n == 0, because -0 == 0.
    top_indices = similarities.argsort()[::-1][:max(top_n, 0)]

    return top_indices, similarities[top_indices]

def find_deceleration_point(coefficients, scores_length):
    """Locate the first in-range root of a polynomial's second derivative.

    Args:
        coefficients: Polynomial coefficients, highest power first (the
            layout expected by `np.polyder` / `np.roots`).
        scores_length: Roots are accepted only inside
            `[0, scores_length - 1]`.

    Returns:
        The smallest accepted real root truncated to an int, or None when
        the second derivative has no real root in range.
    """
    second_derivative = np.polyder(coefficients, 2)
    candidates = np.roots(second_derivative)

    # Keep roots whose imaginary part is numerically zero.
    real_candidates = candidates.real[np.abs(candidates.imag) < 1e-5]

    upper_bound = scores_length - 1
    inside = (real_candidates >= 0) & (real_candidates <= upper_bound)
    accepted = real_candidates[inside]

    if accepted.size == 0:
        return None
    return int(accepted.min())

def extract_similar_content(paragraph_vector, query, top_n=10):
    """Retrieve the stored chunks most similar to `query` as one text block.

    Args:
        paragraph_vector: List of dicts, each providing 'vector', 'text',
            'metadata' and 'document'.
        query: Query string; embedded via `get_embedding`.
        top_n: Number of matches requested from `find_most_similar`.

    Returns:
        A string with one "Text / Metadata / Document" section per match,
        in the order returned by `find_most_similar`, sections separated by
        a blank line.
    """
    stored_vectors = [entry['vector'] for entry in paragraph_vector]
    query_embedding = get_embedding(query)
    ranked_indices, _scores = find_most_similar(query_embedding, stored_vectors, top_n)

    matches = (paragraph_vector[idx] for idx in ranked_indices)
    sections = [
        f"Text: {entry['text']}\nMetadata: {entry['metadata']}\nDocument: {entry['document']}\n"
        for entry in matches
    ]
    return "\n".join(sections)

In [None]:
# Read the query interactively; `qury` is reused by the cells below
qury = input("Please enter your query: ")

print("The query you entered is:", qury)

# Requires sentence_vector and extract_similar_content from the cells above
temp_str = extract_similar_content(sentence_vector, qury)  # Text block of the stored chunks most similar to qury

#Merge repeated sentences
def remove_duplicates(temp_str):
    """Drop repeated lines from `temp_str`, keeping first occurrences in order.

    Args:
        temp_str: Newline-separated text.

    Returns:
        The text with every line that already appeared earlier removed;
        the remaining lines keep their original relative order.
    """
    lines = temp_str.split('\n')
    # dict.fromkeys de-duplicates while preserving first-seen order. A set
    # would shuffle the lines (differently on each run) and detach each
    # Text line from its Metadata/Document lines.
    unique_lines = list(dict.fromkeys(lines))
    return '\n'.join(unique_lines)
# Deduplication output is deduplicated_str
deduplicated_str = remove_duplicates(temp_str)

# The prompt used below tells the model that the first line of the input is
# the question, so the query must end with a newline rather than a space.
combined_str = qury + "\n" + deduplicated_str

print("The merged set:", combined_str)

In [None]:
# Testing: summarise the retrieved context with Claude Opus.
# prompt1 is passed as the system prompt; combined_str (query + retrieved
# chunks, built in the previous cell) is passed as the user message.
prompt1 = """Please identify the input paragraph. The first line of the paragraph is a question, which usually asks about a phenotype or gene. The remaining lines are answers related to this question. Requirements:
1. Please identify these answers and judge whether they are related to the question based on the content. Only output new answers based on these answers with a little logic, and do not use your own knowledge base to supplement the answers.
2. The output answers need to be answered in points, including (1) related gene families (2) related proteins (3) other related phenotypes (4) logical analysis (5) homologous genes in important crops (6) reference titles and DOI information
3. The first sentence of the answer should be "Based on your question, I have summarized the following relevant research information..."
4. Delete redundant sentences and make a good summary."""

part_output = chat_with_claude_opus(combined_str, prompt1)
print(part_output)

In [None]:
# Testing: summarise the retrieved context with the local Llama 3 model.
# prompt1 is redefined here so this cell can run without the Claude cell;
# it is passed as the system prompt and combined_str as the user message.
prompt1 = """Please identify the input paragraph. The first line of the paragraph is a question, which usually asks about a phenotype or gene. The remaining lines are answers related to this question. Requirements:
1. Please identify these answers and judge whether they are related to the question based on the content. Only output new answers based on these answers with a little logic, and do not use your own knowledge base to supplement the answers.
2. The output answers need to be answered in points, including (1) related gene families (2) related proteins (3) other related phenotypes (4) logical analysis (5) homologous genes in important crops (6) reference titles and DOI information
3. The first sentence of the answer should be "Based on your question, I have summarized the following relevant research information..."
4. Delete redundant sentences and make a good summary."""

part_output = chat_with_Llama3unsluth(combined_str, prompt1)
print(part_output)