# Regularization
This notebook follows the scikit-learn guide on interpreting the coefficients of linear models: <https://scikit-learn.org/stable/auto_examples/inspection/plot_linear_model_coefficient_interpretation.html>.

## Load Data

In [None]:
import glob
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def get_subject_from_path(path):
    """Derive a subject ID from the directory that contains a file.

    The path is normalized and split on ``os.sep``; the second-to-last
    component (the file's parent directory) is selected and its first four
    characters are dropped.

    NOTE(review): the four-character strip presumably removes a fixed
    prefix such as ``sub-`` from the directory name -- confirm against the
    on-disk layout.

    Parameters
    ----------
    path : str
        Path to a file that has at least one parent directory component.

    Returns
    -------
    str
        The parent directory name without its first four characters.

    Raises
    ------
    IndexError
        If the normalized path has fewer than two components.
    """
    prefix_length = 4
    parent_dir_name = os.path.normpath(path).split(os.sep)[-2]
    return parent_dir_name[prefix_length:]

In [None]:
# Find every power_fc.npy file below fc_path and store, per subject ID, the
# entries above the main diagonal of the loaded array (k=1 excludes the
# diagonal itself). The result is a dict of arrays keyed by subject ID.
fc_path = '/imaging3/owenlab/bpho/python_power_fc'
# glob.glob returns matches in arbitrary order; sorting makes the run
# reproducible, including which file wins if two paths map to one subject ID.
fc_paths = sorted(glob.glob(fc_path + '/**/power_fc.npy', recursive=True))

# The upper-triangle index arrays are the same for every subject, so build
# them once instead of once per file.
# NOTE(review): assumes every file holds a 264 x 264 matrix -- confirm.
n_nodes = 264
upper_triangle = np.triu_indices(n_nodes, k=1)

fcs = {}
for path in fc_paths:
    subject_id = get_subject_from_path(path)
    subject_fc = np.load(path)
    fcs[subject_id] = subject_fc[upper_triangle]
print("Number of functional connectivities:", len(fcs))

In [None]:
# Report the length of one subject's feature vector. The subject looked up
# here is a hard-coded example ID.
example_fc = fcs["NDARAP912JK3"]
print("Number of features (connections):", example_fc.shape[0])

In [None]:
# Read the WISC label table and index it by the 'assessment WISC,EID'
# column so that rows can be looked up by subject ID.
wisc_label_path = "/imaging3/owenlab/bpho/Biobank Labels/Subjects with WISC.csv"
wisc_labels = pd.read_csv(wisc_label_path).set_index('assessment WISC,EID')
display(wisc_labels)

In [None]:
# Pair each labelled subject's connectivity features with that subject's
# WISC FSIQ sum. Subjects that have a label row but no loaded connectivity
# entry in `fcs` are skipped.
fsiq_column = 'assessment WISC,WISC_FSIQ_Sum'
label_subject_ids = wisc_labels.index
subject_ages = wisc_labels["assessment Basic_Demos,Age"].to_numpy()

subjects_with_wisc = {}
for subject_id in label_subject_ids:
    if subject_id in fcs:
        subjects_with_wisc[subject_id] = (
            fcs[subject_id],
            wisc_labels.at[subject_id, fsiq_column],
        )

# Spot-check the (features, score) pair of one hard-coded subject.
print(subjects_with_wisc['NDARAC331VEH'])

In [None]:
# Split the (features, score) pairs into two parallel lists, then convert
# them to arrays: X holds the per-subject feature vectors, y the scores.
subject_records = list(subjects_with_wisc.values())
fc_matrices = [fc_features for fc_features, _ in subject_records]
wisc_measure = [score for _, score in subject_records]

X = np.array(fc_matrices)
y = np.array(wisc_measure)

print("X shape:", X.shape, "y shape:", y.shape)

## Train Ridge Model

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import median_absolute_error

In [None]:
# Fit a ridge regression with the estimator's default settings.
ridge = Ridge().fit(X, y)

# NOTE(review): the predictions and both metrics below are computed on the
# same X, y that the model was fitted on, so they describe in-sample fit
# only. "MAE" here is the *median* absolute error.
y_ridge = ridge.predict(X)
mae = median_absolute_error(y, y_ridge)
r_squared = ridge.score(X, y)
print("MAE:", mae)
print("Ridge r^2:", r_squared)

In [None]:
# Scatter of the ridge predictions (y_ridge) against the target values (y).
# Axis labels and a title are set so the figure can be read on its own, and
# plt.show() keeps the artist repr out of the cell output.
fig, ax = plt.subplots()
ax.scatter(y, y_ridge)
ax.set_xlabel("Measured WISC FSIQ sum")
ax.set_ylabel("Ridge-predicted WISC FSIQ sum")
ax.set_title("Ridge predictions vs. measured scores (in-sample)")
plt.show()

In [None]:
# Print the smallest and largest fitted coefficient, then show all
# coefficients as a DataFrame.
coef_values = ridge.coef_
print(coef_values.min(), coef_values.max())
coefs = pd.DataFrame(coef_values)
display(coefs)