### AI-based classification of identified genes

#### .ENV setup (run in the terminal)



cp /home/aleksandr/Desktop/WORK/biotech_data_project/.env /home/aleksandr/Desktop/WORK/OLINK_suicide_PSY_project/.env

#### Module import

In [18]:
import pickle
import json
import re
import pandas as pd
import time
import os

from multiprocessing import Pool
from copy import deepcopy

import hashlib
import uuid
from datetime import datetime
import mysql.connector

# OpenAI stuff
import openai
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()


OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# SQL stuff
SQL_USER= os.getenv("SQL_USER")
SQL_PWD = os.getenv("SQL_PWD")

#### Simple test

In [54]:
def response_web_search_OAI(query: str, developer_instructions: str):
    """Send `query` to the OpenAI Responses API with web search enabled.

    The request uses the "gpt-5" model with low reasoning effort and one
    `web_search_preview` tool set to a high search context size.
    `developer_instructions` is passed as the `instructions` field and
    `query` as the `input` field.

    Notes kept from the original author: this approach is not efficient,
    performs much worse than ChatGPT and hallucinates heavily. In earlier
    experiments "gpt-4o-search-preview" was considered the optimal model,
    while "gpt-4o-mini-search-preview" sometimes failed to format its output.

    Returns:
        The response object returned by `client.responses.create`.
    """
    web_search_tool = {
        "type": "web_search_preview",
        "search_context_size": "high",
    }
    request_settings = {
        "model": "gpt-5",
        "reasoning": {"effort": "low"},
        "tools": [web_search_tool],
        "instructions": developer_instructions,
        "input": query,
    }
    return OpenAI(api_key=OPENAI_API_KEY).responses.create(**request_settings)

In [None]:
# Prompt sent as the `instructions` field of every classification request.
# NOTE: the class names listed under item 1 are duplicated in the
# `suggested_classes` list inside classify_gene_symbol; keep both in sync.
# The "RESPONSE_CLASS" keyword requested in item 4 is the marker the parsing
# code splits on, so it must not be renamed here alone.
developer_instructions = """
You are given a human gene name presented as a gene symbol. You need to do the following:

1. Classify this gene using only one the most appropriate class (separated by ;):

'Receptor (non-immune)';
'Immune receptor';
'Ion channel/Transporter';
'Cell-Surface Ligand';
'Extracellular or Secreted molecule';
'Enzyme (non-kinase)';
'Transcription factor';
'Kinase';
'Structural protein';
'Non-coding RNA';
'Pseudogene'

2. You must only use scientific literature (papers) or scientific databases (such as UniProt, GeneCards, etc.) to perform classification

3. Give only one the most suitable class for a gene

4. Your output must be first an explanation of around 100 words then keyword "RESPONSE_CLASS" followed by the selected class name (spelled exactly as in the instruction 1.)

"""

In [89]:
# Smoke test: classify a single well-known gene and keep the raw answer text
test_response = response_web_search_OAI(
    query="TLR2",
    developer_instructions=developer_instructions,
)
response_text = test_response.output_text

In [90]:
response_text

'TLR2 (Toll-like receptor 2) is a type I transmembrane pattern-recognition receptor expressed primarily on innate immune cells such as macrophages, dendritic cells, and neutrophils. It recognizes conserved microbial components—especially triacyl and diacyl lipopeptides from bacteria and certain fungal and parasitic products—often as a heterodimer with TLR1 or TLR6. Ligand engagement triggers recruitment of adaptor proteins (notably MyD88), leading to activation of NF-κB and MAPK pathways and induction of proinflammatory cytokines that orchestrate innate and subsequent adaptive immune responses. Functionally and structurally, TLR2 is best classified as an immune cell-surface receptor rather than an enzyme, kinase, transcription factor, or transporter.\n\nRESPONSE_CLASS Immune receptor'

In [91]:
# The class name is whatever follows the "RESPONSE_CLASS" keyword
response_class = response_text.split("RESPONSE_CLASS")[1].strip()
response_class

'Immune receptor'

#### SQL-backed calls: cache responses so tokens are not spent twice on the same gene

In [68]:
def uuid_from_string(input_string):
    """Return a deterministic UUID (version 5, DNS namespace) for a string.

    The same input always maps to the same UUID, which is returned in its
    canonical string form.
    """
    derived_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, input_string)
    return str(derived_uuid)

def sql_escape(value):
    """Return `str(value)` with single quotes and backslashes doubled.

    Each "'" becomes "''" and each backslash becomes two backslashes, so the
    result can be embedded inside a single-quoted SQL string literal.
    """
    # Neither replacement produces the other character, so one pass suffices.
    doubled = {ord("'"): "''", ord("\\"): "\\\\"}
    return str(value).translate(doubled)

In [92]:

def _execute_sql(statement, params=None, db_name=None, fetch=False):
    """Run one SQL statement on a short-lived MySQL connection.

    Args:
        statement: SQL text. Values must be bound through `%s` placeholders.
        params: Sequence of values for the placeholders, or None.
        db_name: Database selected with `USE` before the statement; None
            skips the `USE` step (needed for CREATE DATABASE).
        fetch: When True return `cursor.fetchall()`, otherwise commit.

    Returns:
        The fetched rows when `fetch` is True, otherwise None.

    Raises:
        Any exception raised by `mysql.connector`; the cursor and the
        connection are closed before it propagates.
    """
    connection = mysql.connector.connect(
        host="localhost",
        user=SQL_USER,
        passwd=SQL_PWD,
    )
    try:
        cursor = connection.cursor()
        try:
            if db_name is not None:
                cursor.execute(f"USE {db_name}")
            cursor.execute(statement, params)
            if fetch:
                return cursor.fetchall()
            connection.commit()
            return None
        finally:
            cursor.close()
    finally:
        connection.close()


def classify_gene_symbol(input_dict):
    """Classify one gene symbol, using a MySQL table as a response cache.

    The database and table are created on demand. If the gene symbol (keyed
    by its deterministic UUID) is already stored, the stored record is
    returned and no API call is made. Otherwise the model is queried, the
    answer is parsed, stored and returned.

    Database problems are best-effort: they are printed and the function
    carries on, so an unavailable database only disables caching.

    Args:
        input_dict: Dictionary with the keys
            "gene_symbol" (symbol to classify),
            "developer_instructions" (prompt for the model),
            "db_name" and "db_table_name" (cache location). The last two are
            interpolated into SQL as identifiers, which cannot be bound as
            parameters, so they must come from trusted code.

    Returns:
        Dictionary with the keys "gene_symbol", "uuid_str",
        "format_correctness", "explanation" and "gene_class".
        "format_correctness" is "CORRECT" when the parsed class is one of the
        allowed classes, "NEED EVALUATION" when it is not (or when the
        "RESPONSE_CLASS" marker is missing, in which case "gene_class" is an
        empty string), and "FAILED" when the API call raised (then
        "explanation" and "gene_class" are None and nothing is stored).
    """
    gene_symbol = input_dict["gene_symbol"]
    developer_instructions = input_dict["developer_instructions"]
    db_name = input_dict["db_name"]
    db_table_name = input_dict["db_table_name"]

    # Must match the class names listed in the prompt.
    suggested_classes = (
        'Receptor (non-immune)',
        'Immune receptor',
        'Ion channel/Transporter',
        'Cell-Surface Ligand',
        'Extracellular or Secreted molecule',
        'Enzyme (non-kinase)',
        'Transcription factor',
        'Kinase',
        'Structural protein',
        'Non-coding RNA',
        'Pseudogene',
    )

    # Create the SQL database if it does not exist
    try:
        _execute_sql(f"CREATE DATABASE IF NOT EXISTS {db_name}")
    except Exception as e:
        print(f"[ERROR at classify_gene_symbol at creating database!]: {e}")

    # Create the SQL table if it does not exist
    try:
        _execute_sql(
            f"""CREATE TABLE IF NOT EXISTS {db_table_name} (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    gene_symbol VARCHAR(200),
                    uuid_str VARCHAR(50),
                    format_correctness VARCHAR(50),
                    explanation MEDIUMTEXT,
                    selected_class VARCHAR(250))""",
            db_name=db_name,
        )
    except Exception as e:
        print(f"[ERROR at classify_gene_symbol at creating table!]: {e}")

    # Check whether the requested gene symbol has already been classified
    current_uuid = uuid_from_string(gene_symbol)

    try:
        # Columns are named explicitly so the indices below do not depend on
        # the physical column order of the table.
        SQL_rows = _execute_sql(
            f"""SELECT gene_symbol, uuid_str, format_correctness, explanation, selected_class
                FROM {db_table_name} WHERE uuid_str = %s""",
            (current_uuid,),
            db_name=db_name,
            fetch=True,
        )
    except Exception as e:
        print(f"[ERROR at classify_gene_symbol at database search]: gene symbol '{gene_symbol}' - {e}")
        SQL_rows = None

    if SQL_rows:
        first_record = SQL_rows[0]
        return {
            "gene_symbol": first_record[0],
            "uuid_str": first_record[1],
            "format_correctness": first_record[2],
            "explanation": first_record[3],
            "gene_class": first_record[4],
        }

    # IF WE ARE HERE -> GENE IS NOT IN SQL -> NEED TO GO TO AI
    try:
        classification = response_web_search_OAI(
            query=gene_symbol,
            developer_instructions=developer_instructions,
        )
    except Exception as e:
        print(f"[ERROR at classify_gene_symbol at AI request]: gene symbol '{gene_symbol}' - {e}")
        return {
            "gene_symbol": gene_symbol,
            "uuid_str": current_uuid,
            "format_correctness": "FAILED",
            "explanation": None,
            "gene_class": None,
        }

    # Keep the raw text: values are bound as SQL parameters below, so no
    # manual escaping is needed and the caller sees the same text that a
    # later cache hit would return.
    response_text = classification.output_text or ""

    # A missing marker is reported as "NEED EVALUATION" instead of raising.
    marker_parts = response_text.split("RESPONSE_CLASS")
    response_class = marker_parts[1].strip() if len(marker_parts) > 1 else ""

    if response_class in suggested_classes:
        format_correctness = "CORRECT"
    else:
        format_correctness = "NEED EVALUATION"

    # Save the record in the MySQL database
    try:
        _execute_sql(
            f"""INSERT INTO {db_table_name}
                (gene_symbol, uuid_str, format_correctness, explanation, selected_class)
                VALUES (%s, %s, %s, %s, %s)""",
            (gene_symbol, current_uuid, format_correctness, response_text, response_class),
            db_name=db_name,
        )
    except Exception as e:
        print(f"[ERROR at classify_gene_symbol at connection after search]: gene '{gene_symbol}' - {e}")

    return {
        "gene_symbol": gene_symbol,
        "uuid_str": current_uuid,
        "format_correctness": format_correctness,
        "explanation": response_text,
        "gene_class": response_class,
    }


In [63]:
# To inspect the cache from a MySQL shell:
#   USE gene_name_classification;
#   SELECT * from gene_name_gpt5;

# Single-gene trial run against the cache-backed classifier
input_dict = {
    "gene_symbol": "RP11-981P6.1",
    "developer_instructions": developer_instructions,
    "db_name": "gene_name_classification",
    "db_table_name": "gene_name_gpt5",
}
classify_gene_symbol(input_dict)

{'gene_symbol': 'RP11-981P6.1',
 'uuid_str': '12a61315-4f0c-5a01-b9e2-aa4ce2990c46',
 'format_correctness': 'CORRECT',
 'explanation': 'RP11-981P6.1 is an RP11-designated transcript derived from a genomic clone locus and corresponds to Ensembl gene ENSG00000258302. This locus is annotated as a long non-coding RNA (lncRNA): GeneCards indexes ENSG00000258302 within its ncRNA (GeneCaRNA) compendium and shows no RefSeq protein-coding entry, consistent with a noncoding biotype. Independent commercial assays also label RP11-981P6.1 explicitly as “lncRNA,” providing the Ensembl ID and transcript ENST00000552778, further supporting that it is not protein-coding. Together, these database annotations indicate RP11-981P6.1 is best classified as a non-coding RNA gene. ([genecards.org](https://www.genecards.org/cgi-bin/carddisp.pl?gene=ENSG00000258302&utm_source=openai), [commerce.bio-rad.com](https://commerce.bio-rad.com/en-us/prime-pcr-assays/assay/qhsaled0105820-primepcr-sybr-green-assay-rp11-98

#### Data import

In [81]:

def read_txt_to_list(file_path):
    """
    Read a text file and return its content as a list of lines.

    The content is split on "\n", so empty lines are kept as empty strings
    and a file ending with a newline yields a trailing empty string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of strings, one per line, without the newline characters.
    """
    # The context manager closes the handle, which the previous version leaked.
    with open(file_path, "r") as handle:
        return handle.read().split("\n")
    
# Load the gene symbols, dropping the empty entries produced by blank lines
genes_to_parse = [
    symbol
    for symbol in read_txt_to_list("unique_significant_genes_for_AI.txt")
    if symbol
]

In [82]:
len(genes_to_parse)

193

#### Processing genes

In [94]:
# One request payload per gene symbol; all of them share the same prompt
# and the same cache database/table.
requests_for_ai = [
    {
        "gene_symbol": gene_symbol,
        "developer_instructions": developer_instructions,
        "db_name": "gene_name_classification",
        "db_table_name": "gene_name_gpt5",
    }
    for gene_symbol in genes_to_parse
]

In [95]:
# Classify all gene symbols with 9 worker processes; results keep input order
with Pool(9) as worker_pool:
    processed_requests = worker_pool.map(classify_gene_symbol, requests_for_ai)

In [96]:
# Collect the per-gene result dictionaries into one table (one row per request)
processed_requests_merged = pd.DataFrame(processed_requests)

In [97]:
processed_requests_merged["format_correctness"].value_counts()

format_correctness
CORRECT    193
Name: count, dtype: int64

#### Output

In [99]:
processed_requests_merged.columns

Index(['gene_symbol', 'uuid_str', 'format_correctness', 'explanation',
       'gene_class'],
      dtype='object')

In [104]:
# Printing 
import textwrap

def wrap_text(text, width):
    """Print `text` wrapped so that no output line is longer than `width`.

    Nothing is printed when wrapping produces no lines (e.g. empty text).
    """
    wrapped = textwrap.wrap(text, width=width)
    if wrapped:
        print("\n".join(wrapped))

# Write every classified gene to a plain-text file for manual inspection:
# gene symbol, selected class, then the explanation wrapped at 70 columns.
# Rows whose API call failed carry None in "explanation" and "gene_class"
# (FAILED branch of classify_gene_symbol); such non-string values are written
# as "N/A" instead of crashing textwrap.fill / file.write.
# UTF-8 is requested explicitly because the explanations contain non-ASCII
# characters and the platform default encoding may not be able to encode them.
with open("gene_class_inspection.txt", "w", encoding="utf-8") as file:
    for gene, explanation, gene_class in zip(
        processed_requests_merged["gene_symbol"].tolist(),
        processed_requests_merged["explanation"].tolist(),
        processed_requests_merged["gene_class"].tolist(),
    ):
        if not isinstance(explanation, str):
            explanation = "N/A"
        if not isinstance(gene_class, str):
            gene_class = "N/A"

        wrapped_text = textwrap.fill(explanation, width=70)
        file.write(f"{gene}\n\n{gene_class}\n\n{wrapped_text}\n\n\n\n")

In [105]:
# Export the full classification table (all result columns) to CSV
processed_requests_merged.to_csv("GPT_5_classified_symbols_init.csv")