In [None]:
#Imports
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

import os
from pathlib import Path

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import platform
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library warnings about undefined or degenerate
# statistics further down the notebook — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Print NumPy arrays with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)

In [None]:
# CSV that is loaded into data_df below.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
input_csv_path = (
    '/Users/cu135/Dropbox (Partners HealthCare)/studies/atrophy_seeds_2023'
    '/metadata/correlation_to_baseline_scores.csv'
)

In [None]:
# Directory that the save cells below write their CSV results into.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
out_dir = (
    '/Users/cu135/Dropbox (Partners HealthCare)/studies/atrophy_seeds_2023'
    '/Figures/correlation_to_baseline_scores/regress_to_total'
)

In [None]:
from calvin_utils.permutation_analysis_utils.statsmodels_palm import CalvinStatsmodelsPalm
# Instantiate CalvinStatsmodelsPalm with the input CSV and the output directory
cal_palm = CalvinStatsmodelsPalm(input_csv_path=input_csv_path, output_dir=out_dir, sheet=None)
# Read the data through the helper and keep the returned object as data_df
data_df = cal_palm.read_and_display_data()

In [None]:
# List the available column names (useful for choosing the filter column below).
data_df.columns

In [None]:
# Row-filter settings handed to cal_palm.drop_rows_based_on_value in the next cell.
# NOTE(review): how 'not' is interpreted (keep vs. drop the matching rows) is decided
# inside that helper — confirm before relying on the filtered cohort.
column = 'City'  # The column you'd like to evaluate
condition = 'not'  # The condition to check ('equal', 'above', 'below', 'not')
value = 'Wurzburg' # The value each row of `column` is compared against

In [None]:
# Apply the row filter configured above. The helper returns two objects; the first
# replaces data_df. NOTE(review): other_df presumably holds the complementary rows — confirm.
data_df, other_df = cal_palm.drop_rows_based_on_value(column, condition, value)
display(data_df)

In [None]:
# Columns passed to cal_palm.standardize_columns as the exclusion list.
# NOTE(review): the second name ends with an underscore — confirm it matches the CSV header exactly.
cols_not_to_standardize = ['Z_Scored_Percent_Cognitive_Improvement_By_Origin_Group', 'Z_Scored_Subiculum_T_By_Origin_Group_'] #['Age']

In [None]:
# Standardize through the helper, passing the exclusion list; data_df is rebound to the result.
data_df = cal_palm.standardize_columns(cols_not_to_standardize)
data_df

# Run Kernel Ridge

In [None]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import r2_score

def krr_run(df, dependent, predictors, alpha=1.0):
    """Fit a kernel ridge regression on the full data and report in-sample R-squared.

    Parameters
    ----------
    df : DataFrame holding the dependent and predictor columns.
    dependent : name of the column to predict.
    predictors : list of column names used as regressors.
    alpha : regularisation strength forwarded to ``KernelRidge`` (all other
        ``KernelRidge`` settings are left at their defaults).

    Returns
    -------
    float
        R-squared of the fitted model evaluated on the same rows it was trained
        on (i.e. not a cross-validated estimate). The value is also printed.
    """
    krr = KernelRidge(alpha=alpha)
    Y = df.loc[:, [dependent]]
    X = df.loc[:, predictors]
    krr_model = krr.fit(X, Y)
    y_pred = krr_model.predict(X)
    r_squared = r2_score(Y, y_pred)
    print("R-squared: ", r_squared)
    # Previously the score was only printed; return it so callers can store or compare it.
    return r_squared

In [None]:
# List the column names available for choosing the dependent variable and predictors.
data_df.columns

In [None]:
# Outcome column and regressor columns passed to krr_run in the next cell.
dependent ='TOTAL11'
predictors = ['FrontalSurface' , 'OccipitalSurface', 'ParietalSurface', 'temp_ins_surface', 'MTLSurface']

In [None]:
# In-sample kernel ridge fit; the R-squared is printed by krr_run.
krr_run(data_df, dependent, predictors, alpha=1)

## Kernel Ridge Regression LOOCV

In [None]:
import pandas as pd
import numpy as np
from sklearn.kernel_ridge import KernelRidge

def kernel_ridge_loo(df:pd.DataFrame, target:str, alpha:float, gamma:float, degree:int, kernel:str):
    """Leave-one-out cross-validation of a kernel ridge regression.

    Each row of ``df`` is held out once. A ``KernelRidge`` model is fitted on the
    remaining rows, using every column except ``target`` as a predictor, and
    the held-out row is then predicted.

    Parameters
    ----------
    df : DataFrame containing the target column plus the predictor columns
        (every non-target column is used as a predictor).
    target : name of the column to predict.
    alpha, gamma, degree, kernel : forwarded unchanged to ``KernelRidge``.

    Returns
    -------
    pd.DataFrame
        One row per sample with 'actuals' and 'predictions', plus summary
        columns that carry the same value on every row:
        'loocv_prediction_r', 'loocv_prediction_p', 'loocv_prediction_r2',
        'loocv_mean_squared_error', 'loocv_root_mean_squared_error' and
        'loocv_mean_absolute_error'. The summary columns are omitted (and a
        message is printed) when the correlation cannot be computed.
    """
    n_samples = len(df)
    positions = np.arange(n_samples)
    features = df.drop(target, axis=1)
    outcome = df[target]

    y_pred = []
    for i in range(n_samples):
        # Select the held-out row by POSITION. The previous label-based
        # df.loc[[i]] picked the wrong row (or raised KeyError) whenever the
        # index was not 0..n-1, e.g. after rows had been dropped.
        held_out = positions == i

        model = KernelRidge(kernel=kernel, alpha=alpha, gamma=gamma, degree=degree)
        model.fit(features.iloc[~held_out], outcome.iloc[~held_out])
        y_pred.append(model.predict(features.iloc[held_out])[0])

    outcome_dictionary = {}
    outcome_dictionary['actuals'] = df[target].values.tolist()
    outcome_dictionary['predictions'] = y_pred

    # Arrays (not lists) so the element-wise arithmetic below is valid; the
    # former list - list subtraction raised TypeError, which a bare except hid,
    # so the error metrics were never produced.
    actuals = np.asarray(outcome_dictionary['actuals'], dtype=float)
    predictions = np.asarray(y_pred, dtype=float)
    try:
        r, p = pearsonr(actuals, predictions)
        errors = actuals - predictions
        mse = np.mean(np.square(errors))
        outcome_dictionary['loocv_prediction_r'] = r
        outcome_dictionary['loocv_prediction_p'] = p
        outcome_dictionary['loocv_prediction_r2'] = r**2
        outcome_dictionary['loocv_mean_squared_error'] = mse
        outcome_dictionary['loocv_root_mean_squared_error'] = np.sqrt(mse)
        # Key spelling corrected from 'lootcv_...'; that column never reached
        # the output before because the computation above it always failed.
        outcome_dictionary['loocv_mean_absolute_error'] = np.mean(np.abs(errors))
    except ValueError as error:
        # pearsonr rejects inputs with fewer than two samples; keep the
        # per-sample predictions and skip the summary metrics.
        print('Error generating outcome metrics:', error)
    outcome_df = pd.DataFrame(outcome_dictionary)

    return outcome_df

In [None]:
# Keep only the outcome and the CSF predictors. kernel_ridge_loo treats every
# non-target column as a predictor, so the frame must contain nothing else.
# NOTE: this rebinds data_df — every other column is unavailable to later cells.
loocv_columns = ['TOTAL11', 'FrontalCSF' , 'OccipitalCSF', 'ParietalCSF', 'temp_ins_csf']
data_df = data_df.loc[:, loocv_columns]

In [None]:
#Reminder: kernels = ['additive_chi2', 'polynomial', 'poly', 'rbf', 'cosine', 'laplacian', 'precomputed', 'sigmoid', 'linear', 'chi2']
kernel = 'rbf'
target = 'TOTAL11'
#----------------------------------------------------------------USER INPUTS ABOVE----------------------------------------------------------------
# Leave-one-out predictions for the chosen kernel and fixed hyperparameters.
loo_results = kernel_ridge_loo(df=data_df, target=target, kernel=kernel, alpha=0.001, gamma=.0001, degree=4)
from scipy.stats import pearsonr, spearmanr

#Calculate metrics
loo_results.dropna(inplace=True)

# Pearson r between predictions and actuals.
# NOTE: 'loocv_pearson_p' is the ONE-SIDED p-value (alternative='greater'),
# i.e. the same quantity as 'loocv_pearson_p_greater' below.
pearson_r, one_sided_p = pearsonr(loo_results['predictions'], loo_results['actuals'], alternative='greater')
loo_results['loocv_pearson_r'] = pearson_r
loo_results['loocv_pearson_p'] = one_sided_p
loo_results['loocv_r_squared'] = np.square(loo_results['loocv_pearson_r'])

# p-value of the same correlation under each alternative hypothesis.
alternative_by_column = {
    'loocv_pearson_p_greater': 'greater',
    'loocv_pearson_p_lesser': 'less',
    'loocv_pearson_p_two-sided': 'two-sided',
}
for p_column, alternative in alternative_by_column.items():
    _, loo_results[p_column] = pearsonr(loo_results['predictions'], loo_results['actuals'], alternative=alternative)

# Error metrics; each is a single number repeated on every row.
residuals = loo_results['actuals'] - loo_results['predictions']
n_rows = len(loo_results['actuals'])
loo_results['loocv_mse'] = np.sum(np.square(residuals)) / n_rows
loo_results['loocv_rmse'] = np.sqrt(loo_results['loocv_mse'])
loo_results['mean_absolute_error'] = np.sum(np.absolute(residuals)) / n_rows

display(loo_results)

In [None]:
#Save if desired
save=True
if save:
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir, exist_ok=True)
        print('Made directory')

    # Generic fallback name, used when the descriptive name cannot be built or written.
    fallback_path = os.path.join(out_dir, 'ols_leave_one_out_full_formula.csv')
    try:
        # Descriptive name: kernel, predictor columns and the LOOCV mean absolute error.
        # (The old expression subscripted the `tolist` method itself, so it always
        # raised and the fallback name was always the one written.)
        predictor_tag = '-'.join(str(name) for name in data_df.columns.values.tolist()[1:])
        mae = float(loo_results["mean_absolute_error"].iloc[0])
        saved_path = os.path.join(out_dir, f'loocv_{kernel}_kernel_ridge_regression_{predictor_tag}_mae_{mae:.4f}.csv')
        loo_results.to_csv(saved_path)
    except (KeyError, IndexError, OSError) as error:
        # Missing/empty metric column, or a name the filesystem rejects.
        print('Could not save under the descriptive name:', error)
        saved_path = fallback_path
        loo_results.to_csv(saved_path)

    # Report only when something was actually written, and say where.
    print('Saved to: ' + saved_path)

## Assess various Kernels

In [None]:
# Define the list of kernel functions
def linear_kernel(X1, X2, **kwargs):
    """Linear kernel: the matrix product of ``X1`` with the transpose of ``X2``.

    Extra keyword arguments are accepted and ignored so that every kernel in
    this notebook can be called with the same keyword set.
    """
    X2_transposed = X2.transpose()
    return np.matmul(X1, X2_transposed)

def polynomial_kernel(x1, x2, degree, c=1, **kwargs):
    """Polynomial kernel: ``(x1 . x2 + c) ** degree``.

    Extra keyword arguments are accepted and ignored.
    """
    inner_product = np.dot(x1, x2)
    shifted = inner_product + c
    return shifted ** degree

def rbf_kernel(x1, x2, gamma, **kwargs):
    """RBF kernel: ``exp(-gamma * ||x1 - x2||^2)`` using ``np.linalg.norm``.

    Extra keyword arguments are accepted and ignored.
    """
    distance = np.linalg.norm(x1 - x2)
    squared_distance = np.power(distance, 2)
    return np.exp(-gamma * squared_distance)

def gaussian_kernel(x1, x2, sigma, **kwargs):
    """Gaussian kernel: ``exp(-||x1 - x2||^2 / (2 * sigma^2))``.

    Extra keyword arguments are accepted and ignored.
    """
    squared_distance = np.power(np.linalg.norm(x1 - x2), 2)
    bandwidth = 2 * sigma ** 2
    return np.exp(-squared_distance / bandwidth)

def sigmoid_kernel(x1, x2, k=1, c=1, **kwargs):
    """Sigmoid kernel: ``tanh(k * (x1 . x2) + c)``.

    Extra keyword arguments are accepted and ignored.
    """
    scaled_inner_product = k * np.dot(x1, x2)
    return np.tanh(scaled_inner_product + c)

def laplacian_kernel(x1, x2, gamma, **kwargs):
    """Kernel ``exp(-gamma * ||x1 - x2||)`` with the norm from ``np.linalg.norm``.

    As written this computes the same value as ``exponential_kernel``.
    NOTE(review): scikit-learn's built-in 'laplacian' kernel is defined on the
    L1 distance instead — confirm which definition is intended.
    Extra keyword arguments are accepted and ignored.
    """
    distance = np.linalg.norm(x1 - x2)
    exponent = -gamma * distance
    return np.exp(exponent)

def exponential_kernel(x1, x2, gamma, **kwargs):
    """Exponential kernel: ``exp(-gamma * ||x1 - x2||)`` (norm, not squared).

    Extra keyword arguments are accepted and ignored.
    """
    difference = x1 - x2
    return np.exp(-gamma * np.linalg.norm(difference))

## Assess Transformation of a Kernel Upon the Data

In [None]:
# Pairwise RBF kernel values computed from a single feature: the column at
# position 1 of data_df. (`new_df` was never defined in this notebook; the
# loop already reads from data_df, so its row count is the right size.)
n = data_df.shape[0]
kernel = np.zeros((n,n))
for i in range(n):
    for j in range(n):
        kernel[i,j] = rbf_kernel(data_df.iloc[i, 1], data_df.iloc[j, 1], gamma=1)

# Flatten first: given a 2-D array, plt.hist draws one histogram per column
# rather than a single distribution of all kernel values.
plt.hist(kernel.ravel());

## Compare Kernels via Precomputed Kernel Ridge (in-sample fit, not LOOCV)

In [None]:
import pandas as pd
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from scipy.stats import pearsonr, spearmanr
# Kernels to compare, looked up by name (explicit mapping instead of eval()).
kernel_functions = {
    "linear_kernel": linear_kernel,
    "polynomial_kernel": polynomial_kernel,
    "rbf_kernel": rbf_kernel,
    "gaussian_kernel": gaussian_kernel,
    "sigmoid_kernel": sigmoid_kernel,
    "laplacian_kernel": laplacian_kernel,
    "exponential_kernel": exponential_kernel,
}
kernel_list = list(kernel_functions)

# data_df was reduced to 'TOTAL11' plus the CSF predictors earlier in the
# notebook, so the former 'percent_change_adascog11' target raised KeyError.
target = 'TOTAL11'
Y = data_df[target]
X = data_df.drop(target, axis=1)
n = data_df.shape[0]
feature_rows = X.to_numpy(dtype=float)
observed = Y.to_numpy(dtype=float)

# One record per kernel. All metrics are IN-SAMPLE: the model is scored on the
# same rows it was fitted on.
comparison_records = []
for kernel_string in kernel_list:
    kernel_function = kernel_functions[kernel_string]

    #Generate Kernelized data; each function ignores the keywords it does not use
    kernel = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            kernel[i,j] = kernel_function(feature_rows[i], feature_rows[j], gamma=1, sigma=1, k=1, degree=2)

    #Train a ridge regression on kernelized data
    model = KernelRidge(kernel='precomputed')
    results = model.fit(kernel, observed)

    # With a precomputed kernel, predict(K) is K @ dual_coef_; compute it once.
    predicted = model.predict(kernel)
    residuals = observed - predicted
    r, p = spearmanr(predicted, observed)
    mse = np.mean(np.square(residuals))
    comparison_records.append({
        'kernel': kernel_string,
        'r2': results.score(kernel, observed),
        'r': r,   # Spearman rank correlation
        'p': p,
        'mse': mse,
        'rmse': np.sqrt(mse),
        'mae': np.mean(np.absolute(residuals)),
    })

kernel_comparison_kernel_ridge_results = pd.DataFrame(
    comparison_records, columns=['kernel', 'r2', 'r', 'p', 'mse', 'rmse', 'mae'])
display(kernel_comparison_kernel_ridge_results)

In [None]:
#Save Results if desired
save=True
if save:
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir, exist_ok=True)
        print('Made directory')
    comparison_csv_path = os.path.join(out_dir, 'kernel_comparison_kernel_ridge_results.csv')
    kernel_comparison_kernel_ridge_results.to_csv(comparison_csv_path)
    # Report only when a file was actually written (previously printed even with save=False).
    print('Saved to: ' + out_dir)

## Grid Search for Optimal Hyperparameters

In [None]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import pandas as pd
## Reference
#--- kernels = ['additive_chi2', 'polynomial', 'poly', 'rbf', 'cosine', 'laplacian', 'precomputed', 'sigmoid', 'linear', 'chi2']

# Work on a copy so the grid search cannot modify data_df
df = data_df.copy()

# Assign the independent variables and dependent variable
Y = df['TOTAL11']
X = df.loc[:, ['FrontalCSF' , 'OccipitalCSF', 'ParietalCSF', 'temp_ins_csf']]

# Define the kernel ridge model
kr = KernelRidge()

# Define evaluation metric.
# Each leave-one-out fold scores a single held-out sample, and R^2 is undefined
# for fewer than two samples (the scorer yields NaN), so 'r2' cannot rank the
# candidates. Negative squared error is defined per sample, and its mean over
# the folds is the (negated) LOOCV mean squared error.
scoring = 'neg_mean_squared_error'
# Define the parameter grid for the grid search
# (gamma has no effect on the 'linear' kernel; those grid points are redundant but harmless)
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'kernel': ['linear', 'rbf']}

# Perform leave-one-out cross-validation
loo = LeaveOneOut()

# Perform a grid search using the leave-one-out splitter defined above
grid_search = GridSearchCV(kr, param_grid, scoring=scoring, cv=loo)
grid_search.fit(X, Y)

# The optimal hyperparameters (the score is a negated MSE: closer to 0 is better)
print("Numer of splits:", grid_search.n_splits_)
print("Optimal parameters:", grid_search.best_params_)
print("Optimal score:", grid_search.best_score_)
# print("Overall results:", grid_search.cv_results_)



## Perform Manual Kernelization and Manual Ridge Regression

In [None]:
# List the columns currently held in data_df before building the kernel matrix by hand.
data_df.columns

In [None]:
# Build the n x n matrix of exponential-kernel values between every pair of rows.
feature_columns = ['FrontalCSF' , 'OccipitalCSF', 'ParietalCSF', 'temp_ins_csf']
Y = data_df['TOTAL11']
X = data_df.loc[:, feature_columns]
n = data_df.shape[0]
sigma = 1000  # not used by the kernel call below
kernel = np.zeros((n,n))
for i, j in np.ndindex(n, n):
    # k and degree are swallowed by exponential_kernel's **kwargs
    kernel[i,j] = exponential_kernel(X.iloc[i], X.iloc[j], gamma=1, k=1, degree=1)

In [None]:
#Generate Kernelized data
# data_df was reduced to 'TOTAL11' plus the CSF predictors earlier in the
# notebook, so the former 'percent_change_adascog11' target raised KeyError.
target = 'TOTAL11'
n = data_df.shape[0]
Y = data_df[target]
X = data_df.drop(target, axis=1)
feature_rows = X.to_numpy(dtype=float)
kernel = np.zeros((n,n))
for i in range(n):
    for j in range(n):
        kernel[i,j] = exponential_kernel(feature_rows[i], feature_rows[j], gamma=1)

#Train a ridge regression on kernelized data
model = KernelRidge(kernel='precomputed')
results = model.fit(kernel, Y)

# All metrics below are IN-SAMPLE: predictions are made for the training rows.
results_dict = {}
# With a precomputed kernel, predict(K) is K @ dual_coef_
results_dict['predictions'] = model.predict(kernel)
results_dict['observations'] = Y
results_dict['r2'] = results.score(kernel, Y)
results_dict['pearson_r'], results_dict['pearson_p'] = pearsonr(results_dict['predictions'], results_dict['observations'])
residuals = results_dict['observations'] - results_dict['predictions']
results_dict['mse'] = np.sum(np.square(residuals)) / len(results_dict['observations'])
results_dict['rmse'] = np.sqrt(results_dict['mse'])
results_dict['mae'] = np.sum(np.absolute(residuals)) / len(results_dict['predictions'])

# Scalars are broadcast, so every row repeats the same summary values.
manual_kernel_ridge_results = pd.DataFrame(results_dict)
display(manual_kernel_ridge_results)

## Run K-Folds on Kernel Ridge (Useful for Precomputed Kernels)
### If using kernels, must make sure the folds are symmetric
### The ridge regression will expect a symmetric Kernel Matrix from training to testing

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.kernel_ridge import KernelRidge
from scipy.stats import pearsonr
import numpy as np

def kernel_ridge_kfold(df:pd.DataFrame, target:str, k:int, sigma:float, random_state=None):
    """K-fold cross-validated RBF kernel ridge regression.

    The rows of ``df`` are shuffled into ``k`` folds. For each fold a
    ``KernelRidge(kernel='rbf')`` model is fitted on the other folds, using
    every column except ``target`` as a predictor, and the held-out fold is
    predicted. Predictions from all folds are pooled and correlated with the
    observed values.

    Parameters
    ----------
    df : DataFrame containing the target column plus the predictor columns.
    target : name of the column to predict.
    k : number of folds.
    sigma : kernel width; converted to ``gamma = 1 / (2 * sigma**2)``.
    random_state : optional seed for the fold shuffling. The default ``None``
        keeps the previous behaviour (a different split on every call); pass an
        integer to make the result reproducible.

    Returns
    -------
    (R, P) : Pearson correlation between pooled predictions and observations,
        and its p-value.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    features = df.drop(target, axis=1)
    outcome = df[target]
    gamma = 1/(2*sigma**2)

    y_pred = []
    y_test_all = []
    for train_index, test_index in kf.split(df):
        model = KernelRidge(kernel='rbf', gamma=gamma)
        model.fit(features.iloc[train_index], outcome.iloc[train_index])
        y_pred.append(model.predict(features.iloc[test_index]))
        y_test_all.append(outcome.iloc[test_index])

    # Pool the folds (in the same order for both) before correlating
    y_test_all = np.concatenate(y_test_all)
    y_pred = np.concatenate(y_pred)
    R, P = pearsonr(y_pred, y_test_all)
    return R, P

# data_df was reduced to 'TOTAL11' plus the CSF predictors earlier in the
# notebook, so the former 'percent_change_adascog11' target raised KeyError.
# NOTE: the folds are shuffled, so R and P vary between runs.
R, P = kernel_ridge_kfold(df=data_df, target='TOTAL11', k=2, sigma=1)
results_df = pd.DataFrame({'R': R, 'P': P}, index=[0])
display(results_df)

save=False
if save:
    # makedirs (as in the other save cells): os.mkdir fails when parent folders are missing
    os.makedirs(out_dir, exist_ok=True)
    results_df.to_csv(os.path.join(out_dir, 'kernel_ridge_regression_k_folds_results.csv'))