In [3]:
import data_utils as util
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load Data

In [5]:
# Load data
raw_data = util.load_data('./data/warfarin_with_dose.csv')
X = []
y = []
for point in raw_data:
    feature, dose = util.patient_from_feature(point)
    X.append(feature)
    y.append(dose)
X = np.array(X)
y = np.array(y)

# Split the data into train, val and test
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size = 0.2, random_state = 5)

# data preprocessing

In [36]:
#######################################################
# Load the feature header
#######################################################
import csv

# read the csv file
feature_names = []
with open('./data/header.csv') as f:
    reader = csv.reader(f)
    for line in reader:
        feature_names.append(line)

# Dirty tricks to clean up the feature names
feature_names = feature_names[0]
feature_names = list(map(lambda x: x.replace(' ', '_'), feature_names))
feature_names[0] = 'PharmGKB_Subject_ID' # manual fix to remove a character in f...[0]

In [37]:
####################################################################
# Load the dataframe into X; clean up some unnecessary columns 
####################################################################

# Load data and process column names
df = pd.read_csv('./data/warfarin_with_dose.csv', names = feature_names)
df.columns = [c.replace(' ', '_') for c in df.columns]

# Extract the label (Warfarin dose) from the rest of the features
y = df.Therapeutic_Dose_of_Warfarin
X = df.drop('Therapeutic_Dose_of_Warfarin',axis=1)
feature_names.remove('Therapeutic_Dose_of_Warfarin')

# Drop Subject_ID (irrelevant) and Medication (different to encode)
X = X.drop('PharmGKB_Subject_ID',axis=1)
X = X.drop('Medications',axis=1)
feature_names.remove('PharmGKB_Subject_ID')
feature_names.remove('Medications')

In [40]:
####################################################################
# Encode different features with numeric/label/onehot encodings 
####################################################################
numeric_features = ['Height_(cm)',
                    'Weight_(kg)',
                    'Target_INR',
                    'INR_on_Reported_Therapeutic_Dose_of_Warfarin']
label_features =   ['Age',
                    'Estimated_Target_INR_Range_Based_on_Indication']
categorical_features = [f for f in feature_names \
                        if f not in numeric_features and f not in label_features]

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

label_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', LabelEncoder())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('lab', label_transformer, label_features),
        ('cat', categorical_transformer, categorical_features)]
)


In [42]:
preprocessor.fit(X)

TypeError: fit_transform() takes 2 positional arguments but 3 were given

In [None]:


# dataset.fillna(dataset.mean(), inplace=True)
# dataset = dataset.dropna(axis = 1)
# print(dataset.shape)
# dataset.dropna()
# print(dataset.shape)

enc = OneHotEncoder(categorical_features = '',handle_unknown='ignore')
enc.fit(dataset)
enc.transform(dataset)

In [93]:
dataset = pd.read_csv("warfarin_with_dose.csv", header = None)
print(dataset.shape)
dataset.dropna(axis = 'columns')
print(dataset.shape)




(5528, 66)
(5528, 66)


In [43]:
#check dataset
print(X_train.shape)
print(X_train)
print(y_train)

(3537, 18)
[[  6.   166.    97.   ...   1.     0.     0.  ]
 [  6.   176.02  86.   ...   0.     0.     0.  ]
 [  8.   172.01  76.   ...   0.     0.     0.  ]
 ...
 [  7.   180.34 108.9  ...   0.     0.     0.  ]
 [  2.   168.   110.   ...   1.     0.     0.  ]
 [  8.   153.67  73.   ...   1.     0.     0.  ]]
[30.03 42.5  28.   ... 84.   77.   30.  ]


In [44]:
# Linear regression
from sklearn.linear_model import LinearRegression

lr_regressor = LinearRegression(fit_intercept = True)
lr_regressor.fit(X_train, y_train)
print(lr_regressor.score(X_val, y_val))

y_pred = lr_regressor.predict(X_val)
print(mean_squared_error(y_val,y_pred))
print(np.mean(y_val))
coef = lr_regressor.coef_
print(coef)




0.3938308838129855
182.1669176159891
31.52241807909605
[ -2.62093731   0.07702587   0.16954881  -1.81663937   1.4910022
   1.38656902  -1.06093184  -6.41899367  11.18164824  -5.79829658
 -10.09348135  -9.22958937 -19.87868406 -21.63813941  -4.36744505
  -9.4548067  -17.66790743  -7.08903945]


In [45]:
# SVM
from sklearn.svm import SVR

svm_regressor = SVR(kernel='rbf', gamma='auto')
svm_regressor.fit(X_train, y_train)
svm_regressor.score(X_val, y_val)

0.13329921820187896

In [46]:
# SVM with various hyperparameters
from sklearn.svm import SVR

#initialise arrays to store the scores 
svm_score_train = np.zeros((7,1))
svm_score_val = np.zeros((7,1))

for i, C in enumerate((1000,100,10,1, 0.1, 0.01,0.001)):
    svm_regressor = SVR(kernel='rbf', gamma='auto', C=C)
    svm_regressor.fit(X_train,y_train)
    svm_score_train[i,:] = svm_regressor.score(X_train,y_train)
    svm_score_val[i,:] = svm_regressor.score(X_val,y_val)
    
print('training score \n', svm_score_train)
print('validation score \n', svm_score_val)




training score 
 [[ 0.85439884]
 [ 0.6486505 ]
 [ 0.39173633]
 [ 0.18051436]
 [ 0.02837724]
 [-0.02581761]
 [-0.03313661]]
validation score 
 [[-0.16914487]
 [ 0.18075103]
 [ 0.21441668]
 [ 0.13329922]
 [ 0.01565171]
 [-0.03543393]
 [-0.04245123]]


In [47]:
# Ridge Regression
from sklearn.linear_model import Ridge

#initialise arrays to store the scores 
ridge_score_train = np.zeros((7,1))
ridge_score_val = np.zeros((7,1))

for i, C in enumerate((1000,100,10,1, 0.1, 0.01,0.001)):
    ridge_regressor = Ridge(alpha=C, fit_intercept = True)
    ridge_regressor.fit(X_train,y_train)
    ridge_score_train[i,:] = ridge_regressor.score(X_train,y_train)
    ridge_score_val[i,:] = ridge_regressor.score(X_val,y_val)
    
print('training score \n', ridge_score_train)
print('validation score \n', ridge_score_val)

best_ridge = Ridge(alpha = 1, fit_intercept = True)
best_ridge.fit(X_train,y_train)
y_pred = best_ridge.predict(X_val)
print(mean_squared_error(y_val,y_pred))
print(np.mean(y_val))

training score 
 [[0.28313693]
 [0.36697994]
 [0.39537848]
 [0.39781377]
 [0.39786211]
 [0.39786264]
 [0.39786265]]
validation score 
 [[0.28077951]
 [0.36252903]
 [0.39041267]
 [0.39362977]
 [0.39381411]
 [0.39382924]
 [0.39383072]]
182.22735732001104
31.52241807909605


In [48]:
# Lasso Regression
from sklearn.linear_model import Lasso

#initialise arrays to store the scores 
lasso_score_train = np.zeros((7,1))
lasso_score_val = np.zeros((7,1))

for i, C in enumerate((1000,100,10,1, 0.1, 0.01,0.001)):
    lasso_regressor = Lasso(alpha=C, fit_intercept = True)
    lasso_regressor.fit(X_train,y_train)
    lasso_score_train[i,:] = lasso_regressor.score(X_train,y_train)
    lasso_score_val[i,:] = lasso_regressor.score(X_val,y_val)
    
print('training score \n', lasso_score_train)
print('validation score \n', lasso_score_val)

training score 
 [[0.        ]
 [0.06638883]
 [0.1539467 ]
 [0.26538526]
 [0.37596028]
 [0.39758491]
 [0.39785987]]
validation score 
 [[-0.00081468]
 [ 0.06759243]
 [ 0.15332604]
 [ 0.26487317]
 [ 0.36701346]
 [ 0.39325069]
 [ 0.39379806]]


In [51]:
# non-nested cross validation using RBF SVM
from sklearn.model_selection import GridSearchCV

rbf = SVR(kernel='rbf')
gammas = np.logspace(-6, 0, 7)
params = {'gamma': gammas}
gridcv = GridSearchCV(estimator=rbf, param_grid=params, cv=5)
gridcv.fit(X_trainval, y_trainval)
scores_mean = gridcv.cv_results_['mean_test_score']
scores_sd = gridcv.cv_results_['std_test_score']
print('highest accuracy score is',gridcv.best_score_)
model = gridcv.best_estimator_

highest accuracy score is 0.17605920453271334


In [55]:
# non-nested cross validation using Ridge
from sklearn.model_selection import GridSearchCV

ridge_regressor = Ridge(fit_intercept = True)
params={'alpha': [25,10,4,2,1.0,0.8,0.5,0.3,0.2,0.1,0.05,0.02,0.01]}

gridcv = GridSearchCV(estimator=ridge_regressor, param_grid=params, cv=10)
gridcv.fit(X_trainval, y_trainval)
scores_mean = gridcv.cv_results_['mean_test_score']
scores_sd = gridcv.cv_results_['std_test_score']
print('highest accuracy score is',gridcv.best_score_)
model = gridcv.best_estimator_

highest accuracy score is 0.4007028547482021
