In [None]:
# Colab-only setup: attach Google Drive at /content/drive. Later cells read and
# write CSV files under that mount point, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers

In [None]:
!pip install --pre deepchem
import deepchem
deepchem.__version__

In [None]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']
!pip install regex

In [None]:
import pandas as pd

# Source CSV on the mounted shared drive. Later cells read the
# 'canonical_smiles' and 'pIC50' columns from this frame.
jak3_csv_path = '/content/drive/Shareddrives/1:1 Awani Gadre/Dataset/JAK3_processed_PIC50.csv'
dataset = pd.read_csv(jak3_csv_path)

# Preview the first rows as a quick check that the load worked.
dataset.head()

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline, RobertaModel, RobertaTokenizer
from bertviz import head_view

# Hugging Face checkpoint used for both the masked-LM model and its tokenizer.
SMILES_CHECKPOINT = "seyonec/PubChem10M_SMILES_BPE_450k"
# Minimum width of the token-id feature vector (one column per token position).
MIN_TOKEN_COLUMNS = 95
# Value used to right-pad short sequences. Kept at 0 so the output matches
# previously written CSVs.
# NOTE(review): 0 may not be the tokenizer's own pad id (tokenizer.pad_token_id) -- confirm.
PAD_VALUE = 0

model = AutoModelForMaskedLM.from_pretrained(SMILES_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(SMILES_CHECKPOINT)

fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

# Tokenize every SMILES string into a plain list of token ids. Without
# `return_tensors`, `encode` already returns a Python list, so no tensor
# round-trip is needed.
token_rows = [tokenizer.encode(smiles) for smiles in dataset['canonical_smiles']]

# Pad every row to a common width. The width is at least MIN_TOKEN_COLUMNS, and
# grows if a molecule produces more tokens, so that no row is left ragged
# (ragged rows would become NaN-filled columns in the DataFrame).
row_width = max([MIN_TOKEN_COLUMNS] + [len(row) for row in token_rows])
padded_rows = [row + [PAD_VALUE] * (row_width - len(row)) for row in token_rows]

# Build the frame in one shot (DataFrame.append was removed in pandas 2.0 and
# is quadratic when called per row). Reusing dataset.index guarantees the
# column-wise concat below lines rows up one-to-one.
tokenized_vector_df = pd.DataFrame(padded_rows, index=dataset.index)

# Concatenate the token columns with the existing dataset along the columns.
# Note: this rebinds `dataset`; re-running the cell without reloading the CSV
# appends the token columns a second time.
dataset = pd.concat([dataset, tokenized_vector_df], axis=1)

# Save the updated DataFrame as a new CSV file
dataset.to_csv('/content/drive/Shareddrives/1:1 Awani Gadre/Dataset/JAK3_processed_PIC50_with_chembert_tokenized.csv', index=False)


In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split of the full frame (features and target stay together):
#   1) 70% training rows / 30% hold-out rows
#   2) the hold-out rows are halved into test and validation parts
split_seed = 42
train_df, valid_test_df = train_test_split(
    dataset, test_size=0.30, random_state=split_seed
)
holdout_halves = train_test_split(
    valid_test_df, test_size=0.50, random_state=split_seed
)
# The first returned half becomes the test set, the second the validation set.
test_df, valid_df = holdout_halves

In [None]:
# Display the test split for a visual check of its rows and columns.
test_df

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Positional column layout: column 1 is the regression target and every column
# from position 2 onward is used as a feature.
# NOTE(review): presumably column 1 is 'pIC50' and columns 2+ are the token-id
# columns -- confirm against the CSV header, since extra columns would be
# silently included as features.
train_features = train_df.iloc[:, 2:]
train_target = train_df.iloc[:, 1]

# Random forest with 100 trees; the fixed seed makes the fit repeatable.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(train_features, train_target)

In [None]:
# Score the fitted model on the test split, using the same positional layout as
# training: column 1 is the target, columns 2+ are the features.
y_true = test_df.iloc[:, 1]
y_pred = regressor.predict(test_df.iloc[:, 2:])

# Regression metrics on the held-out test rows.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
# Mean absolute percentage error, in percent. A target value of exactly 0
# would make this a division by zero.
relative_error = (y_true - y_pred) / y_true
mape = np.mean(np.abs(relative_error)) * 100

# Report the results
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r2}")
print(f"MAPE: {mape}%")

In [None]:
# Largest value in the 'pIC50' column of the full dataset.
dataset['pIC50'].max()