# QSAR Model Building of Acetylcholinesterase Inhibitors

## Read in data

In [None]:
import pandas as pd

In [None]:
# Load the bioactivity table from CSV into a DataFrame. Judging by the file
# name it holds pIC50 values plus PubChem fingerprint descriptors.
# The bare `dataset` expression makes the notebook render the table.
dataset = pd.read_csv("./data/acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv")
dataset

In [None]:
# Feature matrix: every column of the dataset except the "pIC50" column.
X = dataset.drop(columns=["pIC50"])
X

In [None]:
# Target vector: the last column of the dataset, selected by position.
# NOTE(review): presumably this is the "pIC50" column that is dropped from X;
# confirm the CSV column order.
Y = dataset.iloc[:, -1]
Y

# loc is short for "location"; it is used to index specific rows or columns (by label)
# iloc is short for "integer location"; it is used to index specific rows or columns by their order (position)

## Remove low variance features

In [None]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.1):
    """Return ``input_data`` restricted to the columns kept by a variance filter.

    Args:
        input_data: pandas DataFrame of numeric feature columns.
        threshold: Variance cut-off handed to ``VarianceThreshold``; columns
            the selector rejects as low-variance are dropped.

    Returns:
        A DataFrame containing only the retained columns, in their original
        order.
    """
    selection = VarianceThreshold(threshold=threshold)
    selection.fit(input_data)
    # get_support(indices=True) yields the integer positions of the columns
    # that passed the filter; map them back to column labels.
    kept_columns = input_data.columns[selection.get_support(indices=True)]
    return input_data[kept_columns]

# Replace X with its variance-filtered version (threshold 0.1) and display it.
X = remove_low_variance(X, threshold=0.1)
X

In [None]:
# Persist the filtered descriptor table without the index column, so its
# header can later be read back as the list of selected descriptors.
X.to_csv(path_or_buf="descriptor_list.csv", index=False)

In [2]:
# In the app, use the following to get this same descriptor list
# of 218 variables from the initial set of 881 variables
# Xlist = list(pd.read_csv('descriptor_list.csv').columns)
# X[Xlist]

## Random Forest Regression Model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Build a 500-tree random forest with a fixed seed and train it in one step;
# fit() returns the estimator itself, so the call can be chained.
model = RandomForestRegressor(n_estimators=500, random_state=42).fit(X, Y)

# score() is evaluated on the same X and Y used for fitting, so this is a
# training-set score rather than a held-out estimate.
r2 = model.score(X, Y)
r2

## Model Prediction

In [3]:
# Predict with the fitted model on X, the same matrix the model was fitted
# on, so these are predictions for the training samples.
Y_pred = model.predict(X)
Y_pred

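Note: the recorded output of this cell was `NameError: name 'model' is not defined`. This happens when the cell is executed before the model-fitting cell; run the notebook from top to bottom.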

## Model Performance

In [None]:
# Report error metrics for the predictions against the experimental values.
# mean_squared_error must be *called* with (Y, Y_pred); formatting the bare
# function object with %.2f raises a TypeError.
print("Mean squared error (MSE) : %.2f"
      % mean_squared_error(Y, Y_pred))
print("Coefficient of determination (R^2): %.2f"
      % r2_score(Y, Y_pred))

## Data Visualization (Experimental vs Predicted pIC50 for Training Data)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Square canvas with experimental values on x and predicted values on y.
plt.figure(figsize=(5, 5))
plt.scatter(Y, Y_pred, c="#7CAE00", alpha=0.3)

# Trendline: degree-1 least-squares fit through the points, wrapped in a
# poly1d so it can be evaluated at the experimental values.
# https://stackoverflow.com/questions/26447191/how-to-add-trendline-in-python-matplotlib-dot-scatter-graphs
z = np.polyfit(Y, Y_pred, deg=1)
p = np.poly1d(z)
plt.plot(Y, p(Y), "#F8766D")

# Axis labels (order kept so the cell's displayed last value is unchanged).
plt.ylabel('Predicted pIC50')
plt.xlabel('Experimental pIC50')

## Save Model as Pickle Object

In [None]:
import pickle

In [None]:
# Serialize the fitted model to disk. The context manager ensures the file
# handle is flushed and closed even if pickling fails; the original left the
# handle from open() unclosed.
with open("acetylcholinesterase_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)