In [44]:
import utils as util
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error


In [36]:
# Load data
# Every record from the CSV is converted by util.patient_from_feature into a
# (feature vector, dose) pair; the pairs are then separated into X and y.
raw_data = util.load_data('warfarin_with_dose.csv')
patients = [util.patient_from_feature(point) for point in raw_data]
X = np.array([feature for feature, _ in patients])
y = np.array([dose for _, dose in patients])

# Pre-Processing
# NOTE(review): if this is re-enabled, fit the scaler on the training split
# only; fitting on all of X before splitting leaks val/test statistics.
# from sklearn import preprocessing
# scaler = preprocessing.StandardScaler()
# X = scaler.fit_transform(X)

# Split the data into train, val and test
# Two successive 80/20 splits with a fixed seed: 20% of the data is held out
# as test, then 20% of the remainder is held out as validation.
split_options = dict(test_size=0.2, random_state=0)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, **split_options
)


In [37]:
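## Features: layout of the 18-entry feature vector (index -> extractor); presumably mirrors util.patient_from_feature -- confirm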
#     feature = np.ones(18)
#     feature[0] = extract_age(x)
#     feature[1] = extract_height(x)
#     feature[2] = extract_weight(x)
#     feature[3:7] = extract_race(x)
#     feature[7] = extract_Amio(x)
#     feature[8] = extract_enzyme(x)
#     feature[9:15] = extract_CYP(x)
#     feature[15:18] = extract_VKO(x)


In [38]:
# Check dataset: show the training-set shape, then the feature matrix and
# the target vector, one item per line.
print(X_train.shape, X_train, y_train, sep='\n')

(3537, 18)
[[  5.   166.    64.   ...   0.     0.     0.  ]
 [  6.   170.18  88.   ...   0.     0.     1.  ]
 [  7.   166.12  62.3  ...   0.     1.     0.  ]
 ...
 [  8.   157.48  52.2  ...   1.     0.     0.  ]
 [  7.   175.26 113.   ...   0.     0.     0.  ]
 [  7.   180.34  90.7  ...   1.     0.     0.  ]]
[22.47 37.5  28.   ... 30.   46.9  42.  ]


In [48]:
# Linear regression
from sklearn.linear_model import LinearRegression

# Ordinary least squares with an intercept term, fitted on the training split.
lr_regressor = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# Validation-set predictions and the learned per-feature weights.
y_pred = lr_regressor.predict(X_val)
coef = lr_regressor.coef_

# Printed one per line, in this order: validation score (R^2 for sklearn
# regressors), validation MSE, mean of y_val, and the coefficient vector.
print(
    lr_regressor.score(X_val, y_val),
    mean_squared_error(y_val, y_pred),
    np.mean(y_val),
    coef,
    sep='\n',
)




0.4190950947712494
171.46222796877814
32.18242937853107
[ -2.55658836   0.08893858   0.15337975  -2.28939148   1.91831832
   0.66516065  -0.29408749  -6.68339492  19.65004685  -6.32290148
 -10.12763766 -11.01987753 -18.30847211 -21.07654829  -0.38028651
  -9.38428301 -17.55039092  -7.95263633]


In [31]:
# SVM
from sklearn.svm import SVR

# Baseline support-vector regressor: RBF kernel, gamma='auto', and every other
# hyperparameter left at the library default.
svm_regressor = SVR(kernel='rbf', gamma='auto').fit(X_train, y_train)

# Trailing expression so the notebook displays the validation score.
svm_regressor.score(X_val, y_val)

0.34885986212276465

In [32]:
# SVM with various hyperparameters
from sklearn.svm import SVR

# Candidate values for SVR's penalty parameter C, largest first.
svm_c_values = (1000, 100, 10, 1, 0.1, 0.01, 0.001)

# Initialise arrays to store the scores: one row per candidate C.
# Sized from the grid so that editing the candidates cannot leave the arrays
# too short (IndexError) or padded with stale zero rows.
svm_score_train = np.zeros((len(svm_c_values), 1))
svm_score_val = np.zeros((len(svm_c_values), 1))

for i, C in enumerate(svm_c_values):
    # Refit from scratch for each C; after the loop svm_regressor holds the
    # model for the last (smallest) C.
    svm_regressor = SVR(kernel='rbf', gamma='auto', C=C)
    svm_regressor.fit(X_train, y_train)
    # score() is evaluated on both splits so over-fitting shows up as a gap
    # between the training and validation rows.
    svm_score_train[i, :] = svm_regressor.score(X_train, y_train)
    svm_score_val[i, :] = svm_regressor.score(X_val, y_val)

print('training score \n', svm_score_train)
print('validation score \n', svm_score_val)


training score 
 [[ 0.5126063 ]
 [ 0.45474466]
 [ 0.40363197]
 [ 0.33870405]
 [ 0.20649663]
 [ 0.03167074]
 [-0.0230361 ]]
validation score 
 [[ 0.03759617]
 [ 0.37079432]
 [ 0.3993058 ]
 [ 0.34885986]
 [ 0.20404278]
 [ 0.00675302]
 [-0.05300609]]


In [33]:
# Ridge Regression
from sklearn.linear_model import Ridge

# Candidate regularisation strengths, largest first. These are passed as
# Ridge's `alpha` (larger = stronger penalty), which is the opposite
# direction to SVR's C, so the loop variable is named `alpha` rather than `C`.
ridge_alphas = (1000, 100, 10, 1, 0.1, 0.01, 0.001)

# Initialise arrays to store the scores: one row per candidate alpha.
# Sized from the grid so the arrays always match the number of candidates.
ridge_score_train = np.zeros((len(ridge_alphas), 1))
ridge_score_val = np.zeros((len(ridge_alphas), 1))

for i, alpha in enumerate(ridge_alphas):
    # Refit from scratch for each alpha; after the loop ridge_regressor holds
    # the model for the last (smallest) alpha.
    ridge_regressor = Ridge(alpha=alpha, fit_intercept=True)
    ridge_regressor.fit(X_train, y_train)
    ridge_score_train[i, :] = ridge_regressor.score(X_train, y_train)
    ridge_score_val[i, :] = ridge_regressor.score(X_val, y_val)

print('training score \n', ridge_score_train)
print('validation score \n', ridge_score_val)

training score 
 [[0.34685104]
 [0.37373007]
 [0.37466656]
 [0.37467813]
 [0.37467825]
 [0.37467825]
 [0.37467825]]
validation score 
 [[0.38743331]
 [0.41773642]
 [0.41904414]
 [0.41909108]
 [0.4190947 ]
 [0.41909506]
 [0.41909509]]


In [34]:
# Lasso Regression
from sklearn.linear_model import Lasso

# Candidate regularisation strengths, largest first. These are passed as
# Lasso's `alpha` (larger = stronger L1 penalty), so the loop variable is
# named `alpha` rather than `C`.
lasso_alphas = (1000, 100, 10, 1, 0.1, 0.01, 0.001)

# Initialise arrays to store the scores: one row per candidate alpha.
# Sized from the grid so the arrays always match the number of candidates.
lasso_score_train = np.zeros((len(lasso_alphas), 1))
lasso_score_val = np.zeros((len(lasso_alphas), 1))

for i, alpha in enumerate(lasso_alphas):
    # Refit from scratch for each alpha; after the loop lasso_regressor holds
    # the model for the last (smallest) alpha.
    lasso_regressor = Lasso(alpha=alpha, fit_intercept=True)
    lasso_regressor.fit(X_train, y_train)
    lasso_score_train[i, :] = lasso_regressor.score(X_train, y_train)
    lasso_score_val[i, :] = lasso_regressor.score(X_val, y_val)

print('training score \n', lasso_score_train)
print('validation score \n', lasso_score_val)

training score 
 [[0.        ]
 [0.        ]
 [0.        ]
 [0.30317023]
 [0.37387385]
 [0.37466976]
 [0.37467817]]
validation score 
 [[-0.00535607]
 [-0.00535607]
 [-0.00535607]
 [ 0.34205776]
 [ 0.41828633]
 [ 0.4190793 ]
 [ 0.41909437]]
