In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Load the data
data = pd.read_csv('output_4.csv')

# Display the first few rows of the data
print(data.head())

        CID Activity Type Qualifier  Value    MolWt  MolLogP  NumHDonors  \
0  44454916      Activity         =  1.750  446.532  4.34030           1   
1  44454916      Activity         =  0.028  446.532  4.34030           1   
2  44454916            Ki         =  0.120  446.532  4.34030           1   
3  44454916            Ki         =  0.008  446.532  4.34030           1   
4  44454878      Activity         =  0.028  473.602  4.39422           1   

   NumHAcceptors  RingCount   TPSA  ...  NumHeteroatoms  FractionCSP3  \
0              6          5  80.37  ...               8      0.208333   
1              6          5  80.37  ...               8      0.208333   
2              6          5  80.37  ...               8      0.208333   
3              6          5  80.37  ...               8      0.208333   
4              6          5  72.16  ...               8      0.269231   

   HallKierAlpha    Kappa3   LabuteASA  BalabanJ  BCUT2D_MWHI  BCUT2D_MWLOW  \
0          -3.52  4.13491

In [13]:
# Separate the data into input (X) and output (y)
X = data.drop(columns=['CID', 'Activity Type', 'Qualifier', 'Value'])

# Encode the Qualifier column
le = LabelEncoder()
data['Qualifier_encoded'] = le.fit_transform(data['Qualifier'])

# Now, split the output data
y_classifier = data['Activity Type']
y_regressor = data[['Qualifier_encoded', 'Value']]

In [14]:
# Split the data for the classification task
X_train_classifier, X_test_classifier, y_train_classifier, y_test_classifier = train_test_split(X, y_classifier, test_size=0.2, random_state=42)

# Instantiate and train a classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_classifier, y_train_classifier)

# Evaluate the classification model
y_pred_classifier = clf.predict(X_test_classifier)
accuracy_classifier = accuracy_score(y_test_classifier, y_pred_classifier)
print(f'Classification Accuracy: {accuracy_classifier * 100:.2f}%')

Classification Accuracy: 84.81%


In [15]:
# Split the data for the regression task
X_train_regressor, X_test_regressor, y_train_regressor, y_test_regressor = train_test_split(X, y_regressor, test_size=0.2, random_state=42)

# Instantiate and train a regressor
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train_regressor, y_train_regressor)

# Evaluate the regression model
y_pred_regressor = reg.predict(X_test_regressor)
mse = mean_squared_error(y_test_regressor, y_pred_regressor)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 8286194.998310305


In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
import math

# Load data
data = pd.read_csv('output_4.csv')

# Assume that molecular descriptors have been calculated and are part of the dataset

# Separate the data into input (X) and output (y)
X = data.drop(columns=['CID', 'Activity Type', 'Qualifier', 'Value'])

# Encode the Qualifier column
le = LabelEncoder()
data['Qualifier_encoded'] = le.fit_transform(data['Qualifier'])

# Now, split the output data
y_classifier = data['Activity Type']
y_regressor = data[['Qualifier_encoded', 'Value']]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data for the classification task
X_train_classifier, X_test_classifier, y_train_classifier, y_test_classifier = train_test_split(X_scaled, y_classifier, test_size=0.2, random_state=42)

# Instantiate and train a classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_classifier, y_train_classifier)

# Evaluate the classification model
y_pred_classifier = clf.predict(X_test_classifier)
accuracy_classifier = accuracy_score(y_test_classifier, y_pred_classifier)
print(f'Classification Accuracy: {accuracy_classifier * 100:.2f}%')

# Split the data for the regression task
X_train_regressor, X_test_regressor, y_train_regressor, y_test_regressor = train_test_split(X_scaled, y_regressor, test_size=0.2, random_state=42)

# Instantiate and train a regressor
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train_regressor, y_train_regressor)

# Evaluate the regression model
y_pred_regressor = reg.predict(X_test_regressor)
mse = mean_squared_error(y_test_regressor, y_pred_regressor)
mae = mean_absolute_error(y_test_regressor, y_pred_regressor)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')


Classification Accuracy: 85.09%
Mean Squared Error: 8286236.43167256
Mean Absolute Error: 110.94880666624606
