In [1]:
import numpy as np
import helpers as hp
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
import pandas as pd

# Load the volumetry spreadsheet. An integer sheet_name is a zero-based
# position, so 1 selects the second worksheet of the workbook.
df = pd.read_excel(
    'Volumetry_Timepoint_1_2022-11-23_mlm.xlsx',
    sheet_name=1,
)

# Raw cell values as a NumPy array; later cells slice it by row/column position.
sample_data = df.to_numpy()

In [3]:
# Split the raw array into patient metadata, feature names, inputs and labels.
# Row 0 of sample_data supplies the feature identifiers; every later row is
# read as one patient record (assumed layout: column 1 = id, column 2 = age,
# column 3 = sex, columns 4+ = volumes -- TODO confirm against the sheet).
num_rows, num_cols = df.shape

# Patient metadata has to start at the same row as inputs/labels. Starting at
# row 2 (as before) skipped the first patient and left ids/sex one element
# shorter than, and shifted against, the data rows.
patient_ids = sample_data[1:num_rows, 1]
patient_sex = sample_data[1:num_rows, 3]
feature_ids = sample_data[0, 4:num_cols]

# the inputs are the volumes for each brain area:
inputs = np.asarray(sample_data[1:num_rows, 4:num_cols], dtype=float)

# the labels correspond to the age of each patient:
labels = np.asarray(sample_data[1:num_rows, 2], dtype=float)

# Every per-patient array must describe the same rows.
assert len(patient_ids) == len(patient_sex) == len(labels) == inputs.shape[0]

In [4]:
# Rank the brain-area volumes by their Pearson correlation with age and keep
# only the strongly correlated ones.
# TODO (not implemented in this cell): remove features containing negative
# values for volume and features with only zeros (or mostly zeros).
# NOTE(review): the correlations are computed on all samples, before the
# train/validation split, so the selection has already seen the validation rows.

# Minimum absolute correlation a feature needs in order to be kept.
CORR_THRESHOLD = 0.3

# Correlation between age and brain area volume for each area.
# Zero-variance columns make np.corrcoef divide 0 by 0 and yield NaN; that case
# is handled right below, so the floating-point warnings are silenced here.
with np.errstate(divide='ignore', invalid='ignore'):
    corr = np.array([np.corrcoef(inputs[:, i], labels)[0, 1]
                     for i in range(inputs.shape[1])])

# Drop the features whose correlation is undefined (NaN).
# NOTE: feature_ids is not filtered here, so it no longer lines up with the
# columns of inputs once any column has been removed.
undefined = np.isnan(corr)
remove_features = np.where(undefined)
inputs = np.delete(inputs, remove_features[0], axis=1)
corr = corr[~undefined]

# find the minimum and maximum absolute correlation values:
abs_corr = np.abs(corr)
max_corr = np.max(abs_corr)
min_corr = np.min(abs_corr)
print(max_corr)
print(min_corr)

# select the features with the highest correlation values:
feature_inds = np.where(abs_corr >= CORR_THRESHOLD)

# new input matrix with only the most relevant features (16 features were
# kept in the original run); inputs is already float, so no re-cast is needed
new_inputs = np.take(inputs, feature_inds[0], axis=1)

0.3796934332415149
0.0007140706846795975


  c /= stddev[:, None]
  c /= stddev[None, :]


In [5]:
# Apply one seeded random permutation to the rows of the feature matrix and
# to the labels, so that both stay aligned.
# NOTE: labels is overwritten in place. Re-running only this cell permutes the
# already shuffled labels a second time while shuffled_inputs is rebuilt from
# new_inputs, so rows and labels stop matching; re-run from the cell that
# defines labels instead.
np.random.seed(1)
n_samples = new_inputs.shape[0]
inds = np.random.permutation(n_samples)

shuffled_inputs = new_inputs[inds]
labels = labels[inds]

In [6]:
# Hold-out split of the shuffled data into a training part and a validation
# (test) part. hp.slice_data is a project helper: the third argument is
# presumably the split fraction and the return order is taken as written
# here -- verify both against helpers.py.
split_ratio = 0.5
y_val, y_tr, x_val, x_tr = hp.slice_data(labels, shuffled_inputs, split_ratio)

In [14]:
# Ordinary least squares: fit a linear model that minimises the residual sum
# of squares, using the correlation-selected features (16 in the original run).
# The pipeline standardises the inputs before the regression step.
regLS = make_pipeline(StandardScaler(), LinearRegression())
regLS.fit(x_tr, y_tr)

# Predictions are rounded to the nearest integer before scoring.
y_pred_LS = np.rint(regLS.predict(x_val))

# hp.accuracy is a project helper; presumably the share of predictions equal
# to the label -- verify in helpers.py.
acc_LS = hp.accuracy(y_pred_LS, y_val)
print(acc_LS)

0.12121212121212122


In [15]:
# Stochastic GD: fit a linear regression model by stochastic gradient descent.
# x_tr / x_val come from the correlation-selected feature matrix (new_inputs),
# not from the full set of features.

# A fixed random_state makes the fit reproducible when this cell is re-run on
# its own; without it the sample shuffling draws from NumPy's global random
# state, which depends on whatever ran before.
regSGD = make_pipeline(
    StandardScaler(),
    SGDRegressor(max_iter=1000, tol=1e-3, random_state=1),
)
regSGD.fit(x_tr, y_tr)

# Predictions are rounded to the nearest integer before scoring.
y_pred = np.rint(regSGD.predict(x_val))

# hp.accuracy is a project helper; presumably the share of predictions equal
# to the label -- verify in helpers.py.
acc_SGD = hp.accuracy(y_pred, y_val)
print(acc_SGD)

0.10606060606060606


In [18]:
# TODO: cross validation -> divide the data into random sets to have one training set and one test set