In [1]:
%matplotlib inline
import random
import pandas as pd
import numpy as np
import cPickle as pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints.FingerprintMols import FingerprintMol as fMol



### GET TRAINING FINGERPRINTS AND GAPS

In [None]:
# Load the pickled training molecules and their target gap values.
# Pickle streams are binary data, so the files are opened in 'rb' mode
# (text mode can corrupt the stream on some platforms and fails on Python 3).
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that were produced locally / come from a trusted source.
with open('mol_data/all_mols.mol', 'rb') as f:
    train_mols = pickle.load(f)
with open('mol_data/train_gap.pkl', 'rb') as f:
    train_gap = pickle.load(f)

In [None]:
# This determines how many data points to train on
c = 100000
# Draw 25% extra on top of the c training points to serve as a validation set.
# random.sample needs an integer count; the bare product c*1.25 is a float
# and makes random.sample raise a TypeError.
n_total = int(c * 1.25)

# Dedicated, seeded generator so the split is reproducible after a kernel restart.
split_rng = random.Random(0)
# NOTE(review): the population starts at 1, so label 0 is never sampled --
# confirm this is intended.
sample_indices = split_rng.sample(range(1, 1000000), n_total)

# The first c draws are the training set (the previous 0:c-1 slice silently
# dropped one index); the remaining draws form the validation set. The draws
# are made without replacement, so the two sets are disjoint.
train_indices = sample_indices[:c]
val_indices = sample_indices[c:]

# Index the training subsets with train_indices only: indexing them with
# sample_indices also put every validation row into the training data.
# .loc replaces the deprecated .ix indexer.
# NOTE(review): assumes train_mols / train_gap carry the default integer
# index, so these labels select the same rows .ix did -- confirm.
train_mol_subset = train_mols.loc[train_indices]
train_gap_subset = train_gap.loc[train_indices]
val_mol_subset = train_mols.loc[val_indices]
val_gap_subset = train_gap.loc[val_indices]

In [None]:
%%time
# generate fingerprints: one radius-2 Morgan bit vector per training molecule
fps = []
for mol in train_mol_subset:
    fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2))

# copy every RDKit explicit bit vector into its own numpy array
np_fps = []
for bitvect in fps:
    dense = np.zeros((1,))
    # fills `dense` in place with the contents of the bit vector
    DataStructs.ConvertToNumpyArray(bitvect, dense)
    np_fps.append(dense)

### TRAIN ON 100000 DATA POINTS

#### FIT THE DATA

In [None]:
%%time
# Random forest with library-default hyperparameters, fitted on the
# fingerprint arrays (features) against the sampled gap values (targets).
RF = RandomForestRegressor()
RF.fit(X=np_fps, y=train_gap_subset)

In [2]:
# Load a previously pickled regressor; this rebinds RF.
# Pickle streams are binary data, so the file is opened in 'rb' mode.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# from a trusted source.
with open('mol_data/RF_Regressor.pkl', 'rb') as f:
    RF = pickle.load(f)

#### VALIDATE (note: this is currently wrong because I did the sampling differently; change this later)

In [None]:
# radius-2 Morgan bit vector for every validation molecule
val_fps = []
for mol in val_mol_subset:
    val_fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2))

# copy every RDKit explicit bit vector into its own numpy array
val_np_fps = []
for bitvect in val_fps:
    dense = np.zeros((1,))
    # fills `dense` in place with the contents of the bit vector
    DataStructs.ConvertToNumpyArray(bitvect, dense)
    val_np_fps.append(dense)

# Predict on the validation fingerprints and report the root-mean-squared
# error against the held-out gap values.
RF_val = RF.predict(val_np_fps)
# np.asarray strips the pandas index so the subtraction is purely positional,
# and np.mean replaces the slow builtin sum()/len() over a Series.
val_rmse = np.sqrt(np.mean((RF_val - np.asarray(val_gap_subset)) ** 2))
# Single-argument call form of print works on both Python 2 and Python 3.
print("validation err: " + str(val_rmse))

### GET TEST DATA AND TEST IT!!

In [None]:
# THIS IS HOW THESE WERE CREATED
#test_smiles = pd.read_csv('test.csv.gz', compression='gzip')['smiles']
#test_mols = test_smiles.apply(lambda x: Chem.MolFromSmiles(x))

In [4]:
%%time
# Load the pickled test molecules.
# Pickle streams are binary data, so the file is opened in 'rb' mode.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# from a trusted source.
with open('mol_data/test_mols.mol', 'rb') as f:
    test_mols = pickle.load(f)

CPU times: user 3min 5s, sys: 11.8 s, total: 3min 16s
Wall time: 3min 23s


In [5]:
%%time
# generate fingerprints: one radius-2 Morgan bit vector per test molecule
test_fps = []
for mol in test_mols:
    test_fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2))

# copy every RDKit explicit bit vector into its own numpy array
test_np_fps = []
for bitvect in test_fps:
    dense = np.zeros((1,))
    # fills `dense` in place with the contents of the bit vector
    DataStructs.ConvertToNumpyArray(bitvect, dense)
    test_np_fps.append(dense)

CPU times: user 2min 26s, sys: 28.6 s, total: 2min 54s
Wall time: 3min 2s


In [6]:
%%time
# apply the loaded RF model to the test-set fingerprint arrays
RF_test_pred = RF.predict(X=test_np_fps)

CPU times: user 23.5 s, sys: 1min 19s, total: 1min 43s
Wall time: 2min 15s


In [11]:
def write_to_file(filename, predictions):
    """Write predictions to `filename` as a two-column CSV.

    The file starts with the header line ``Id,Prediction``; every following
    line holds a 1-based running id and the ``str()`` of one prediction.
    An existing file at `filename` is overwritten.

    Args:
        filename: path of the file to create.
        predictions: iterable of values, written in iteration order.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        # ids count from 1, hence the explicit start value
        out.writelines(
            "%d,%s\n" % (row_id, str(value))
            for row_id, value in enumerate(predictions, 1)
        )

In [12]:
# dump the test-set predictions to a CSV file
write_to_file(
    filename='RF_100k_train_test_predictions_1.csv',
    predictions=RF_test_pred,
)