In [9]:
!pip install rdkit autogluon pycaret
import pandas as pd
from sklearn.model_selection import train_test_split



In [2]:
# Read the Morgan, AP and RDK5 dataset CSVs from ~/datasets, each into its own
# DataFrame. The files are read in the order listed below.
morgan_dataset, ap_dataset, rdk5_dataset = map(
    pd.read_csv,
    (
        "~/datasets/morgan_dataset.csv",
        "~/datasets/ap_dataset.csv",
        "~/datasets/rdk5_dataset.csv",
    ),
)

In [3]:
# Restrict the Morgan table to the row labels it shares with the AP table and
# then with the RDK5 table (inner join on the index). ap_dataset is not
# re-aligned after the second step, but below it is only ever indexed with
# Morgan's labels, which are a subset of its own.
morgan_dataset, ap_dataset = morgan_dataset.align(ap_dataset, axis=0, join="inner")
morgan_dataset, rdk5_dataset = morgan_dataset.align(rdk5_dataset, axis=0, join="inner")

# Hold out 20% of the Morgan rows for testing; the fixed random_state makes the
# split repeatable.
morgan_train, morgan_test = train_test_split(
    morgan_dataset,
    random_state=42,
    test_size=0.2,
)

# Select the same row labels from the other two tables so that all three
# fingerprint types share one train/test partition.
ap_train, ap_test = (
    ap_dataset.loc[part.index] for part in (morgan_train, morgan_test)
)
rdk5_train, rdk5_test = (
    rdk5_dataset.loc[part.index] for part in (morgan_train, morgan_test)
)

In [4]:
# Frames for the 'activity' target: every train/test frame with the 'pIC50'
# column removed.
morgan_train_activity, morgan_test_activity = (
    frame.drop(columns=['pIC50']) for frame in (morgan_train, morgan_test)
)
ap_train_activity, ap_test_activity = (
    frame.drop(columns=['pIC50']) for frame in (ap_train, ap_test)
)
rdk5_train_activity, rdk5_test_activity = (
    frame.drop(columns=['pIC50']) for frame in (rdk5_train, rdk5_test)
)

# Frames for the 'pIC50' target: the same train/test frames with the
# 'activity' column removed instead.
morgan_train_pic50, morgan_test_pic50 = (
    frame.drop(columns=['activity']) for frame in (morgan_train, morgan_test)
)
ap_train_pic50, ap_test_pic50 = (
    frame.drop(columns=['activity']) for frame in (ap_train, ap_test)
)
rdk5_train_pic50, rdk5_test_pic50 = (
    frame.drop(columns=['activity']) for frame in (rdk5_train, rdk5_test)
)

In [25]:
from autogluon.tabular import TabularDataset, TabularPredictor


def _fit_activity_predictor(train_frame):
    """Create a new TabularPredictor labelled on 'activity' and fit it on train_frame.

    Returns whatever ``fit`` returns, which the cells below use as the predictor.
    No presets or save path are passed, so AutoGluon's defaults apply.
    """
    return TabularPredictor(label='activity').fit(train_frame)


# One predictor per fingerprint type, fitted one after another so that an
# already-fitted predictor is kept if a later fit fails.
morgan_activity_predictor = _fit_activity_predictor(morgan_train_activity)
ap_activity_predictor = _fit_activity_predictor(ap_train_activity)
rdk5_activity_predictor = _fit_activity_predictor(rdk5_train_activity)


No path specified. Models will be saved in: "AutogluonModels/ag-20250212_010337"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.9.21
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82~20.04.1-Ubuntu SMP Thu Dec 19 05:24:09 UTC 2024
CPU Count:          4
Memory Avail:       4.83 GB / 15.34 GB (31.5%)
Disk Space Avail:   319.26 GB / 387.48 GB (82.4%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accu

In [26]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

# Score the Morgan fingerprint table with the Morgan activity predictor and
# write the predictions, one per line under a "prediction" header, to
# morgan_predictions.csv.

# Rows handed to the predictor per call
BATCH_SIZE = 1000  # Adjust based on memory availability

# Input and output both live in ~/250k_zinc
output_dir = os.path.expanduser("~/250k_zinc")
morgan_path = os.path.join(output_dir, "morgan_fingerprints.csv")
output_file = os.path.join(output_dir, "morgan_predictions.csv")

# Fail early when the working directory is missing
if not os.path.exists(output_dir):
    raise FileNotFoundError(f"Output directory '{output_dir}' does not exist. Check if fingerprint generation completed successfully.")

# Fail early when the fingerprint file is missing, otherwise load it whole
if not os.path.exists(morgan_path):
    raise FileNotFoundError(f"File not found: {morgan_path}")
print("Loading file...")
morgan_df = pd.read_csv(morgan_path)

with open(output_file, "w") as out_fh:
    out_fh.write("prediction\n")

    # Walk the table in BATCH_SIZE-row slices and append each slice's
    # predictions to the open file as soon as they are computed.
    batch_starts = range(0, len(morgan_df), BATCH_SIZE)
    for start in tqdm(batch_starts, desc="Predicting Morgan Activity"):
        chunk = morgan_df.iloc[start : start + BATCH_SIZE]
        np.savetxt(out_fh, morgan_activity_predictor.predict(chunk), fmt="%.6f")

print(f"Predictions saved to {output_file}")


Loading file...


Predicting Morgan Activity: 100%|██████████| 250/250 [01:05<00:00,  3.84it/s]

Predictions saved to /teamspace/studios/this_studio/250k_zinc/morgan_predictions.csv





In [27]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

# Score the AP fingerprint table with the AP activity predictor and write the
# predictions, one per line under a "prediction" header, to ap_predictions.csv.
# This cell uses only its own AP path and DataFrame; it does not rely on
# variables left behind by the Morgan cell.

# Rows handed to the predictor per call
BATCH_SIZE = 1000  # Adjust based on memory availability

# Expand the output path
output_dir = os.path.expanduser("~/250k_zinc")
ap_path = os.path.join(output_dir, "ap_fingerprints.csv")
output_file = os.path.join(output_dir, "ap_predictions.csv")

# Ensure the directory exists
if not os.path.exists(output_dir):
    raise FileNotFoundError(f"Output directory '{output_dir}' does not exist. Check if fingerprint generation completed successfully.")

# Load the AP fingerprint data. The existence check and the error message
# refer to the AP file itself, so a missing AP file is reported correctly.
if os.path.exists(ap_path):
    print("Loading file...")
    ap_df = pd.read_csv(ap_path)
else:
    raise FileNotFoundError(f"File not found: {ap_path}")

# Open file to write predictions
with open(output_file, "w") as f:
    # Write the header
    f.write("prediction\n")

    # Process data in batches. The range is bounded by ap_df's own length so
    # the batches cover exactly its rows: none skipped, no empty trailing batch.
    for i in tqdm(range(0, len(ap_df), BATCH_SIZE), desc="Predicting AP Activity"):
        batch = ap_df.iloc[i : i + BATCH_SIZE]

        # Perform prediction
        predictions = ap_activity_predictor.predict(batch)

        # Save to file incrementally
        np.savetxt(f, predictions, fmt="%.6f")  # Adjust precision if needed

print(f"Predictions saved to {output_file}")


Loading file...


Predicting Morgan Activity: 100%|██████████| 250/250 [02:16<00:00,  1.83it/s]

Predictions saved to /teamspace/studios/this_studio/250k_zinc/ap_predictions.csv





In [31]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

# Score the RDK5 fingerprint table with the RDK5 activity predictor and write
# the predictions, one per line under a "prediction" header, to
# rdk5_predictions.csv. This cell uses only its own RDK5 path and DataFrame;
# it does not rely on variables left behind by the Morgan cell.

# Rows handed to the predictor per call
BATCH_SIZE = 1000  # Adjust based on memory availability

# Expand the output path
output_dir = os.path.expanduser("~/250k_zinc")
rdk5_path = os.path.join(output_dir, "rdk5_fingerprints.csv")
output_file = os.path.join(output_dir, "rdk5_predictions.csv")

# Ensure the directory exists
if not os.path.exists(output_dir):
    raise FileNotFoundError(f"Output directory '{output_dir}' does not exist. Check if fingerprint generation completed successfully.")

# Load the RDK5 fingerprint data. The existence check and the error message
# refer to the RDK5 file itself, so a missing RDK5 file is reported correctly.
if os.path.exists(rdk5_path):
    print("Loading file...")
    rdk5_df = pd.read_csv(rdk5_path)
else:
    raise FileNotFoundError(f"File not found: {rdk5_path}")

# Open file to write predictions
with open(output_file, "w") as f:
    # Write the header
    f.write("prediction\n")

    # Process data in batches. The range is bounded by rdk5_df's own length so
    # the batches cover exactly its rows: none skipped, no empty trailing batch.
    for i in tqdm(range(0, len(rdk5_df), BATCH_SIZE), desc="Predicting RDK5 Activity"):
        batch = rdk5_df.iloc[i : i + BATCH_SIZE]

        # Perform prediction
        predictions = rdk5_activity_predictor.predict(batch)

        # Save to file incrementally
        np.savetxt(f, predictions, fmt="%.6f")  # Adjust precision if needed

print(f"Predictions saved to {output_file}")


Loading file...


Predicting RDK5 Activity: 100%|██████████| 250/250 [02:08<00:00,  1.94it/s]

Predictions saved to /teamspace/studios/this_studio/250k_zinc/rdk5_predictions.csv





In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from autogluon.tabular import TabularDataset, TabularPredictor

# Fit a pIC50 predictor on the Morgan training frame, then score the Morgan
# fingerprint table with it and write the predictions, one per line under a
# "prediction" header, to morgan_pic50_predictions.csv.
# Requires morgan_train_pic50 from the train/test preparation cells.

# Rows handed to the predictor per call
BATCH_SIZE = 1000  # Adjust based on memory availability

# Expand the output path
output_dir = os.path.expanduser("~/250k_zinc")
morgan_path = os.path.join(output_dir, "morgan_fingerprints.csv")
output_file = os.path.join(output_dir, "morgan_pic50_predictions.csv")

# Ensure the directory exists
if not os.path.exists(output_dir):
    raise FileNotFoundError(f"Output directory '{output_dir}' does not exist. Check if fingerprint generation completed successfully.")

# Load the Morgan fingerprint data (checked before the training step below, so
# a missing input file is reported before any model is fitted)
if os.path.exists(morgan_path):
    print("Loading file...")
    morgan_df = pd.read_csv(morgan_path)
else:
    raise FileNotFoundError(f"File not found: {morgan_path}")

# Train the Morgan-based pIC50 predictor with AutoGluon's default settings
morgan_pic50_predictor = TabularPredictor(label='pIC50').fit(morgan_train_pic50)

# Open file to write predictions
with open(output_file, "w") as f:
    # Write the header
    f.write("prediction\n")

    # Process data in batches; the progress label names the target actually
    # being predicted (pIC50, not activity)
    for i in tqdm(range(0, len(morgan_df), BATCH_SIZE), desc="Predicting Morgan pIC50"):
        batch = morgan_df.iloc[i : i + BATCH_SIZE]

        # Perform prediction
        predictions = morgan_pic50_predictor.predict(batch)

        # Save to file incrementally
        np.savetxt(f, predictions, fmt="%.6f")  # Adjust precision if needed

print(f"Predictions saved to {output_file}")


Loading file...


No path specified. Models will be saved in: "AutogluonModels/ag-20250212_012231"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.9.21
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82~20.04.1-Ubuntu SMP Thu Dec 19 05:24:09 UTC 2024
CPU Count:          4
Memory Avail:       8.47 GB / 15.34 GB (55.2%)
Disk Space Avail:   319.07 GB / 387.48 GB (82.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accu

Predictions saved to /teamspace/studios/this_studio/250k_zinc/morgan_pic50_predictions.csv





In [7]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from autogluon.tabular import TabularPredictor

# Fit a pIC50 predictor on the RDK5 training frame, then score the RDK5
# fingerprint table with it and write the predictions, one per line under a
# "prediction" header, to rdk5_pic50_predictions.csv.
# Requires rdk5_train_pic50 from the train/test preparation cells.

# Rows handed to the predictor per call
BATCH_SIZE = 1000  # Adjust based on memory availability

# Expand the output path
output_dir = os.path.expanduser("~/250k_zinc")
rdk5_path = os.path.join(output_dir, "rdk5_fingerprints.csv")
output_file = os.path.join(output_dir, "rdk5_pic50_predictions.csv")

# Ensure the directory exists
if not os.path.exists(output_dir):
    raise FileNotFoundError(f"Output directory '{output_dir}' does not exist. Check if fingerprint generation completed successfully.")

# Load the RDK5 fingerprint data (checked before the training step below, so
# a missing input file is reported before any model is fitted)
if os.path.exists(rdk5_path):
    print("Loading RDK5 fingerprints...")
    rdk5_df = pd.read_csv(rdk5_path)
else:
    raise FileNotFoundError(f"File not found: {rdk5_path}")

# Train the RDK5-based pIC50 predictor with AutoGluon's default settings
rdk5_pic50_predictor = TabularPredictor(label='pIC50').fit(rdk5_train_pic50)

# Open file to write predictions
with open(output_file, "w") as f:
    # Write the header
    f.write("prediction\n")

    # Process data in batches; the progress label names the target actually
    # being predicted (pIC50, not activity)
    for i in tqdm(range(0, len(rdk5_df), BATCH_SIZE), desc="Predicting RDK5 pIC50"):
        batch = rdk5_df.iloc[i : i + BATCH_SIZE]

        # Perform prediction
        predictions = rdk5_pic50_predictor.predict(batch)

        # Save to file incrementally
        np.savetxt(f, predictions, fmt="%.6f")  # Adjust precision if needed

print(f"Predictions saved to {output_file}")

Loading RDK5 fingerprints...


No path specified. Models will be saved in: "AutogluonModels/ag-20250212_012603"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.9.21
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82~20.04.1-Ubuntu SMP Thu Dec 19 05:24:09 UTC 2024
CPU Count:          4
Memory Avail:       3.52 GB / 15.34 GB (22.9%)
Disk Space Avail:   318.99 GB / 387.48 GB (82.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accu

Predictions saved to /teamspace/studios/this_studio/250k_zinc/rdk5_pic50_predictions.csv





In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from autogluon.tabular import TabularPredictor

# Fit a pIC50 predictor on the AP training frame, then score the AP
# fingerprint table with it and write the predictions, one per line under a
# "prediction" header, to ap_pic50_predictions.csv.
# Requires ap_train_pic50 from the train/test preparation cells.

# Rows handed to the predictor per call
BATCH_SIZE = 1000  # Adjust based on memory availability

# Expand the output path
output_dir = os.path.expanduser("~/250k_zinc")
ap_path = os.path.join(output_dir, "ap_fingerprints.csv")
output_file = os.path.join(output_dir, "ap_pic50_predictions.csv")

# Ensure the directory exists
if not os.path.exists(output_dir):
    raise FileNotFoundError(f"Output directory '{output_dir}' does not exist. Check if fingerprint generation completed successfully.")

# Load the AP fingerprint data (checked before the training step below, so a
# missing input file is reported before any model is fitted)
if os.path.exists(ap_path):
    print("Loading AP fingerprints...")
    ap_df = pd.read_csv(ap_path)
else:
    raise FileNotFoundError(f"File not found: {ap_path}")

# Train the AP-based pIC50 predictor with AutoGluon's default settings
ap_pic50_predictor = TabularPredictor(label='pIC50').fit(ap_train_pic50)

# Open file to write predictions
with open(output_file, "w") as f:
    # Write the header
    f.write("prediction\n")

    # Process data in batches; the progress label names the target actually
    # being predicted (pIC50, not activity)
    for i in tqdm(range(0, len(ap_df), BATCH_SIZE), desc="Predicting AP pIC50"):
        batch = ap_df.iloc[i : i + BATCH_SIZE]

        # Perform prediction
        predictions = ap_pic50_predictor.predict(batch)

        # Save to file incrementally
        np.savetxt(f, predictions, fmt="%.6f")  # Adjust precision if needed

print(f"Predictions saved to {output_file}")


Loading AP fingerprints...


No path specified. Models will be saved in: "AutogluonModels/ag-20250212_014504"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.9.21
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82~20.04.1-Ubuntu SMP Thu Dec 19 05:24:09 UTC 2024
CPU Count:          4
Memory Avail:       3.61 GB / 15.34 GB (23.5%)
Disk Space Avail:   318.91 GB / 387.48 GB (82.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accu

Predictions saved to /teamspace/studios/this_studio/250k_zinc/ap_pic50_predictions.csv



