In [5]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv
import mysql.connector
from mysql.connector import Error
import config

In [6]:
# Number of logical CPUs reported by the OS (os.cpu_count() may return None if it cannot be determined)
numCores = os.cpu_count()
# Bare expression so the notebook shows the value as the cell output
numCores

16

The code below loads most of the data into a MySQL database.

In [None]:
def get_max_id(cursor, binds_value):
    """Return the largest ``id`` stored in ``compounds`` for a given ``binds`` value.

    Args:
        cursor: Database cursor used to run the query.
        binds_value: Value the ``binds`` column is filtered on.

    Returns:
        The maximum ``id`` found, or 0 when the query yields NULL or a
        connector ``Error`` is raised (the error is printed, not re-raised).
    """
    try:
        # Parameterized query: only rows with the requested 'binds' value count
        cursor.execute("SELECT MAX(id) FROM compounds WHERE binds = %s", (binds_value,))
        row = cursor.fetchone()
        highest_id = row[0]
        print("Max id for binds =", binds_value, "is:", highest_id)
        if highest_id is None:
            # MAX() over zero matching rows comes back as NULL
            return 0
        return highest_id
    except Error as e:
        print("Error fetching max id for binds =", binds_value, ":", e)
        return 0
    
def load_csv_in_chunks(filepath, batch_size, start_id, desired_binds, cursor):
    """Stream a CSV file and insert the matching rows into the database in batches.

    Rows are read one at a time. A row is kept when its first column, parsed
    as an integer ID, is greater than ``start_id`` and its ``binds`` column
    equals ``desired_binds``. Kept rows are handed to ``insert_batch`` each
    time ``batch_size`` of them have accumulated, and once more at the end
    for any remainder.

    Args:
        filepath: Path of the CSV file; its header row must contain a
            ``binds`` column.
        batch_size: Number of rows to accumulate before each insert.
        start_id: Rows whose ID is less than or equal to this value are skipped.
        desired_binds: Integer value the ``binds`` column must have.
        cursor: Database cursor forwarded to ``insert_batch``.

    Returns:
        None. Any exception is reported with ``print`` and stops the load.
    """
    # Bound before the try block so the error handler can always report it,
    # even when open() itself fails.
    current_id = 0
    try:
        # Open the CSV file
        with open(filepath, mode='r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            headers = next(reader)  # Header row: column names, not data
            binds_index = headers.index("binds")  # Position of the 'binds' column
            batch = []

            for row in tqdm(reader, desc="Processing rows"):
                if not row:
                    continue  # csv.reader yields [] for blank lines; nothing to import

                current_id = int(row[0])  # Assuming the ID is in the first column
                if current_id <= start_id:
                    continue  # Skip rows until we pass the starting ID

                if int(row[binds_index]) == desired_binds:
                    batch.append(row)
                    if len(batch) >= batch_size:
                        insert_batch(batch, headers, cursor)
                        batch = []  # Start a fresh batch after inserting

            if batch:  # Insert any remaining rows in the final batch
                insert_batch(batch, headers, cursor)

    except Exception as e:  # Best-effort load: report the failure instead of raising
        print(f"Error reading file at ID {current_id}: {e}")

def insert_batch(batch, headers, cursor, connection=None):
    """Insert a batch of rows into the ``compounds`` table and commit it.

    Args:
        batch: List of rows; each row is a sequence with one value per header.
        headers: Column names, in the same order as the values in each row.
        cursor: Database cursor used to execute the insert.
        connection: Connection that owns ``cursor``; used to commit or roll
            back. When omitted, the module-level ``conn`` is used.

    Raises:
        RuntimeError: If no connection is passed and no module-level ``conn``
            exists.

    A connector ``Error`` during the insert is printed and the transaction is
    rolled back; it is not re-raised.
    """
    if not batch:
        return  # Nothing to insert; also avoids indexing batch[-1] below

    if connection is None:
        # The original code referenced an undefined global named `connection`;
        # the notebook's driver code stores the connection in `conn`.
        connection = globals().get("conn")
        if connection is None:
            raise RuntimeError(
                "insert_batch needs a database connection: pass `connection=` "
                "or define a module-level `conn`."
            )

    # One placeholder per column instead of a hard-coded count of seven.
    placeholders = ", ".join(["%s"] * len(headers))
    # NOTE(review): column names are interpolated into the SQL text. They come
    # from the CSV header row, so the file must be trusted.
    query = f"INSERT INTO compounds ({', '.join(headers)}) VALUES ({placeholders})"
    try:
        cursor.executemany(query, batch)
        connection.commit()
        print(f"Batch inserted. Batch size: {len(batch)} Last ID: {batch[-1][0]}")
    except Error as e:
        print(f"Failed to insert batch into MySQL table. Error: {e}")
        connection.rollback()

def connect_to_database():
    """Connect to the MySQL database using settings from the config module.

    Returns:
        The open connection, or None when no usable connection was obtained:
        either the connector raised ``Error`` or the returned object reports
        that it is not connected.
    """
    try:
        print("Connecting to the MySQL database...")
        conn = mysql.connector.connect(**config.DATABASE_CONFIG)
        if not conn.is_connected():
            # Callers test the result against None, so do not hand back a
            # connection that has just been reported as failed.
            print("Connection failed.")
            return None
        print("Connection established.")
        return conn
    except Error as e:
        print(f"The error '{e}' occurred")
        return None

def close_connection(conn):
    """Close the database connection if there is one and it is still open.

    Args:
        conn: Connection object, or None (as returned by
            ``connect_to_database`` on failure), in which case nothing happens.
    """
    if conn is not None and conn.is_connected():
        conn.close()
        print("The connection is closed.")

# Driver: resume the CSV import for one 'binds' value from where it last stopped.

# User input for binds value
desired_binds = 1
conn = connect_to_database()
if conn is not None:
    # Bound up front so the finally block is safe even if conn.cursor() raises.
    cursor = None
    try:
        cursor = conn.cursor()

        # Determine the maximum ID already processed
        max_id = get_max_id(cursor, desired_binds)
        print(f"Starting import from ID {max_id + 1} using {desired_binds} binds")

        # Path to your CSV file and batch size
        csv_file_path = 'C:/Users/funkb/DataspellProjects/Chemical-Stuff/leash-BELKA/train.csv'
        batch_size = 100000  # Modify as needed

        # Start loading from the next ID; the cursor is a required fifth argument
        load_csv_in_chunks(csv_file_path, batch_size, max_id, desired_binds, cursor)
    finally:
        if cursor is not None:
            cursor.close()
        close_connection(conn)


Connecting to the MySQL database...
Connection established.


In [None]:
def batch_smiles_to_fingerprints(smiles_list):
    """Turn each SMILES string into a hex-encoded Morgan fingerprint.

    Args:
        smiles_list: Iterable of SMILES strings.

    Returns:
        A list aligned with the input. Each entry is the hex encoding of the
        UTF-8 bytes of the fingerprint's bit string (radius 2, 1024 bits), or
        None when the SMILES could not be parsed into a molecule.
    """
    encoded = []
    progress = tqdm(smiles_list, desc="Converting SMILES", leave=False)
    for smi in progress:
        molecule = Chem.MolFromSmiles(smi)
        if molecule is None:
            # Keep a placeholder so output positions still match the input
            encoded.append(None)
            continue
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=1024)
        # Hex of the bit string's text bytes, kept as-is for storage
        bit_text = bit_vector.ToBitString()
        encoded.append(bit_text.encode('utf-8').hex())
    return encoded