In [None]:
import sys
import pandas as pd
import random
import os
from sklearn.metrics.pairwise import cosine_similarity
from pydantic import BaseModel
from typing import List
import numpy as np
! pip show anthropic
! pip show openai

In [None]:
import anthropic
import openai
from openai import OpenAI
import os

# Credentials: prefer keys exported in the environment so that real secrets
# never have to be saved inside the notebook. The "API_KEY" placeholders are
# only a fallback for when the variables are unset or empty.
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

ANTHROPIC_API_KEY = anthropic_api_key or "API_KEY"
OPENAI_API_KEY = openai_api_key or "API_KEY"

# Set your own IP to allow access
proxy_url = 'http://----'
proxy_port = 'xxxx' 

# Route HTTP(S) traffic of this process through the proxy configured above.
os.environ['http_proxy'] = f'{proxy_url}:{proxy_port}'
os.environ['https_proxy'] = f'{proxy_url}:{proxy_port}'

#API Usage
client_claude = anthropic.Anthropic(api_key= ANTHROPIC_API_KEY)
client_emb = OpenAI(api_key = OPENAI_API_KEY)
client = OpenAI(api_key = OPENAI_API_KEY)

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = client_emb.embeddings.create(model=model, input=[text])
    first_item = response.data[0]
    return first_item.embedding

# Define chatbot
def _ask_claude(model, max_tokens, prompt, system_prompt, temperature=0.5):
    """Send one single-turn request to a Claude model and return the reply text.

    Shared implementation behind the ``chat_with_claude_*`` wrappers, which
    differ only in the model id and the token budget they pass in.

    Args:
        model: Anthropic model identifier.
        max_tokens: Upper bound on the number of tokens to generate.
        prompt: Content of the single user message.
        system_prompt: System prompt sent with the request.
        temperature: Sampling temperature (every wrapper uses 0.5).

    Returns:
        The ``text`` of the first content block of the response.
    """
    response = client_claude.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system_prompt,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text


def chat_with_claude_sonnet(prompt,system_prompt):
    """Ask Claude 3 Sonnet (max 4096 output tokens) and return its reply text."""
    return _ask_claude("claude-3-sonnet-20240229", 4096, prompt, system_prompt)


def chat_with_claude_opus(prompt,system_prompt):
    """Ask Claude 3 Opus (max 4000 output tokens) and return its reply text."""
    return _ask_claude("claude-3-opus-20240229", 4000, prompt, system_prompt)


def chat_with_claude_haiku(prompt,system_prompt):
    """Ask Claude 3 Haiku (max 4000 output tokens) and return its reply text."""
    return _ask_claude("claude-3-haiku-20240307", 4000, prompt, system_prompt)

def chat_with_openai(prompt, system_prompt):
    """Send a system + user message pair to gpt-3.5-turbo-0125 and return the reply.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message placed before it.

    Returns:
        The message content of the first choice in the completion.
    """
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        messages=[system_turn, user_turn],
        model="gpt-3.5-turbo-0125",
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Arabidopsis phenotype acquisition
# Sends every line of the input file to Claude Sonnet and appends each answer
# (or a failure marker) to phenotype_output.txt.
from tqdm import tqdm
import time

# Input table paragraphs by line
with open("Path/Focus on phenotype.txt","r",encoding = "utf-8") as f:
    str_file = f.readlines()
    
systemt_prompt = """Please extract the plant phenotype based on the input content. If there is none, please reply "None" """

results = []  # List to store the results

max_retries = 3      # attempts per input line before a failure is recorded
retry_delay_s = 5    # seconds to wait after an overloaded error

# Open a new TXT file to write the results
with open("Path/phenotype_output.txt", "w", encoding="utf-8") as file:
    for content in tqdm(str_file, desc="Processing"):
        prompt = "Input:" + content
        
        retry_count = 0
        while retry_count < max_retries:
            try:
                result = chat_with_claude_sonnet(prompt, systemt_prompt)
            except anthropic.AnthropicError as e:
                # The SDK's base exception lives in the `anthropic` module; the
                # bare name `AnthropicError` was never imported, so reaching this
                # clause used to raise NameError instead of retrying.
                if 'overloaded_error' not in str(e):
                    raise  # If it is any other error, throw an exception
                retry_count += 1
                print(f"Overloaded error occurred. Retrying... (Attempt {retry_count}/{max_retries})")
                time.sleep(retry_delay_s)  # Wait and try again
                continue

            results.append(result)

            # Write each result to a TXT file.
            # `content` still carries its trailing newline from readlines().
            file.write(f"Input:{content}\n")
            file.write(f"Result:{result}\n")
            file.write("-" * 50 + "\n")  # Add separator line

            break  # If the request is successful, exit the loop
        else:
            # Runs only when the while-condition became false, i.e. every
            # attempt hit the overloaded error: record the failure in the file.
            file.write(f"Input:{content}\n")
            file.write("Result:Request failed\n")
            file.write("-" * 50 + "\n")  # Add separator line
            print(f"Failed to get response after {max_retries} retries.")

# Output results (after the output file has been closed)
for i, result in enumerate(results):
    print(f"Result {i+1}:{result}")

In [None]:
# Get phenotypic response corpus from Claude Opus
# Sends every line of phenotype_output.txt to Claude Opus and appends each
# answer (or a failure marker) to phenotype_output_answer.txt.
from tqdm import tqdm
import time

# Input table paragraphs by line
# NOTE(review): the cell that writes phenotype_output.txt emits "Input:" lines,
# "Result:" lines and dashed separator lines, and every one of those lines is
# sent below as its own prompt - confirm this is intended.
with open("Path/phenotype_output.txt","r",encoding = "utf-8") as f:
    output_file = f.readlines()

systemt_prompt = """Please complete the following tasks based on the input:

1: Return the genes in Arabidopsis that are related to the phenotype
2: List all other genes or proteins related to the gene in 1
3: Describe other phenotypes associated with the genes or proteins in 1 and 2
4: Describe the results of existing published articles related to the conclusions in 1, 2, and 3
5: Perform a logical analysis of the results in 4
6: List homologous genes in crops with important economic value such as rice, wheat, and corn

Note: Use concise language to reply, without adding any empty words! ! ! ! """

results = []  # List to store the results

max_retries = 5      # attempts per input line before a failure is recorded
retry_delay_s = 5    # seconds to wait after an overloaded error

# Open a new TXT file to write the results
with open("Path/phenotype_output_answer.txt", "w", encoding="utf-8") as file:
    for content in tqdm(output_file, desc="Processing"):
        prompt = "Input:" + content
        
        retry_count = 0
        while retry_count < max_retries:
            try:
                result = chat_with_claude_opus(prompt, systemt_prompt)
            except anthropic.AnthropicError as e:
                # The SDK's base exception lives in the `anthropic` module; the
                # bare name `AnthropicError` was never imported, so reaching this
                # clause used to raise NameError instead of retrying.
                if 'overloaded_error' not in str(e):
                    raise  # If it is any other error, throw an exception
                retry_count += 1
                print(f"Overloaded error occurred. Retrying... (Attempt {retry_count}/{max_retries})")
                time.sleep(retry_delay_s)  # Wait and try again
                continue

            results.append(result)

            # Write each result to a TXT file.
            # `content` still carries its trailing newline from readlines().
            file.write(f"Input:{content}\n")
            file.write(f"Results:{result}\n")
            file.write("-" * 50 + "\n")  # Add separator line

            break  # If the request is successful, exit the loop
        else:
            # Runs only when the while-condition became false, i.e. every
            # attempt hit the overloaded error: record the failure in the file.
            file.write(f"Input:{content}\n")
            file.write("Extract result: Request failed\n")
            file.write("-" * 50 + "\n")  # Add separator line
            print(f"Failed to get response after {max_retries} retries.")

# Output results (after the output file has been closed)
for i, result in enumerate(results):
    print(f"Result {i+1}:{result}")

In [None]:
import time
import os
import re

# System prompt that process_file passes to chat_with_claude_sonnet together
# with each non-empty input line: it asks for question-answer pairs of five
# types, emitted as "Q: ..." / "A: ..." lines.
# NOTE: this rebinds the module-level name `systemt_prompt`, which earlier
# cells also assign with different prompts.
systemt_prompt = """
Based on the following gene information, please generate the following types of question-answer pairs and answer them:

1. Basic information query - about gene ID and alias
2. Functional explanation - explain the function of the gene
3. Relationship query - ask about genes belonging to the same protein family
4. Complex query - about the relationship between genes and specific biological processes
5. Inference question - speculate on the biological process that the gene may be involved in based on known information

For each type, generate at least one question-answer pair. Make sure the answer is based only on the information provided, and do not add information that is not given. If the information is not enough to answer a certain type of question, you can skip that type.
Please output in the following format:

Q: [Question]
A: [Answer]
"""

def _request_with_retry(line, max_retries, retry_delay):
    """Ask Claude Sonnet about ``line``; return the reply, or None if every attempt failed.

    Only the API request is inside the ``try``, so a failure while writing
    results can no longer trigger a re-send of the same request.
    """
    for attempt in range(1, max_retries + 1):
        try:
            return chat_with_claude_sonnet(line, systemt_prompt)
        except Exception as e:
            print(f"Request failed, error message: {str(e)}")
            if attempt < max_retries:
                print(f"Wait {retry_delay} seconds and try again, number of retries: {attempt}")
                time.sleep(retry_delay)
            else:
                print("The number of retries exceeds the limit, skipping the request")
    return None


def process_file(input_file_path, output_file_path, max_retries=3, retry_delay=5):
    """Generate question-answer pairs for every non-empty line of a text file.

    Each stripped, non-empty line is sent to ``chat_with_claude_sonnet`` with
    the module-level ``systemt_prompt``. Successful replies are appended to the
    output file as ``Input: ...`` / ``Output:`` records separated by a blank
    line. Lines whose request fails ``max_retries`` times are reported on
    stdout and left out of the output file.

    Args:
        input_file_path: Path of the UTF-8 text file to read, one item per line.
        output_file_path: Path of the result file; ``.txt`` is appended when the
            path does not already end with it (case-insensitive). The file is
            overwritten.
        max_retries: Number of attempts made per line before it is skipped.
        retry_delay: Seconds to wait between attempts.

    Returns:
        Elapsed wall-clock time of the whole run, in seconds.
    """
    start_time = time.time()  # Record start time

    if not output_file_path.lower().endswith('.txt'):
        output_file_path += '.txt'

    # Get the total number of rows (blank lines are not counted or processed)
    with open(input_file_path, "r", encoding="utf-8") as f:
        total_lines = sum(1 for line in f if line.strip())

    processed_lines = 0  # lines answered successfully
    last_update_time = start_time

    with open(output_file_path, "w", encoding="utf-8") as output_file:
        with open(input_file_path, "r", encoding="utf-8") as input_file:
            for line in input_file:
                line = line.strip()
                if not line:
                    continue

                result = _request_with_retry(line, max_retries, retry_delay)
                if result is None:
                    continue  # every attempt failed; already reported on stdout

                output_file.write(f"Input: {line}\n")
                output_file.write(f"Output:\n{result}\n")
                output_file.write("\n")  # Add separator
                # Flush per record so an interrupted long run keeps what it finished.
                output_file.flush()

                processed_lines += 1
                current_time = time.time()

                # Update progress every 5 seconds or every 10 rows processed
                if current_time - last_update_time > 5 or processed_lines % 10 == 0:
                    progress = processed_lines / total_lines * 100
                    elapsed_time = current_time - start_time
                    time_per_line = elapsed_time / processed_lines

                    print(f"Progress: {progress:.2f}% ({processed_lines}/{total_lines}), "
                          f"Speed: {time_per_line:.2f} s/line")
                    last_update_time = current_time

    end_time = time.time()  # Record end time
    return end_time - start_time  # Processing time in seconds

input_file_path = "Path/Gene ID.txt"
output_file_path = "Path/Answer"

# Make sure the output directory exists. os.path.dirname() returns "" for a
# bare file name and os.makedirs("") raises, so a directory is only created
# when the path actually names one.
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

print(f"Processing File: {os.path.basename(input_file_path)}")
process_time = process_file(input_file_path, output_file_path)
print(f"File processing completed, time {process_time:.2f} s")