# Project 1: Sex Differences in brain organization

### Main Script: Sex classification

## Load packages

In [1]:
# General
import os
import sys
import numpy as np
import pandas as pd
import csv
import math
from math import isnan
import statistics
import pingouin as pg
import pickle

# Computing
import scipy.io  # loadmat
from scipy import stats
import sklearn 
from brainstat.stats.terms import FixedEffect
from brainstat.stats.SLM import SLM

# Visualisation
import matplotlib.pyplot as plt 
import seaborn as sns
import vtk
from IPython.display import display
import matplotlib.collections as clt
import ptitprince as pt

# Neuroimaging
import nibabel
import nilearn
from brainstat.datasets import fetch_parcellation
from enigmatoolbox.permutation_testing import spin_test, shuf_test

# Gradients
import brainspace
from brainspace.datasets import load_parcellation, load_conte69
from brainspace.plotting import plot_hemispheres
from brainspace.gradient import GradientMaps
from brainspace.utils.parcellation import map_to_labels

## Define directories

In [2]:
codedir = os.path.abspath('')  # obtain current direction from which script is runnning

datadir = '/data/p_02667/sex_diff_gradients/data/'

resdir_gsp = '/data/p_02667/sex_diff_gradients/results/GSP/'
resdir_hcp = '/data/p_02667/sex_diff_gradients/results/HCP/'

## Define functions

#### Confound removal
https://academic.oup.com/gigascience/article/doi/10.1093/gigascience/giac014/6547681#339860007
https://github.com/darya-chyzhyk/confound_prediction

In [65]:
from sklearn.base import BaseEstimator, TransformerMixin

class ConfoundRegressor(BaseEstimator, TransformerMixin):
    """ Fits a confound onto each feature in X and returns their residuals."""

    def __init__(self, confound, X, cross_validate=True, precise=False,
                 stack_intercept=True):
        """ Regresses out a variable (confound) from each feature in X.

        Parameters
        ----------
        confound : numpy array
            Array of length (n_samples, n_confounds) to regress out of each
            feature; May have multiple columns for multiple confounds.
        X : numpy array
            Array of length (n_samples, n_features), from which the confound
            will be regressed. This is used to determine how the
            confound-models should be cross-validated (which is necessary
            to use in in scikit-learn Pipelines).
        cross_validate : bool
            Whether to cross-validate the confound-parameters (y~confound)
            estimated from the train-set to the test set (cross_validate=True)
            or whether to fit the confound regressor separately on the test-set
            (cross_validate=False). Setting this parameter to True is equivalent
            to "foldwise confound regression" (FwCR) as described in our paper
            (https://www.biorxiv.org/content/early/2018/03/28/290684). Setting
            this parameter to False, however, is NOT equivalent to "whole
            dataset confound regression" (WDCR) as it does not apply confound
            regression to the *full* dataset, but simply refits the confound
            model on the test-set. We recommend setting this parameter to True.
        precise: bool
            Transformer-objects in scikit-learn only allow to pass the data
            (X) and optionally the target (y) to the fit and transform methods.
            However, we need to index the confound accordingly as well. To do so,
            we compare the X during initialization (self.X) with the X passed to
            fit/transform. As such, we can infer which samples are passed to the
            methods and index the confound accordingly. When setting precise to
            True, the arrays are compared feature-wise, which is accurate, but
            relatively slow. When setting precise to False, it will infer the index
            by looking at the sum of all the features, which is less accurate, but much
            faster. For dense data, this should work just fine. Also, to aid the
            accuracy, we remove the features which are constant (0) across samples.
        stack_intercept : bool
            Whether to stack an intercept to the confound (default is True)

        Attributes
        ----------
        weights_ : numpy array
            Array with weights for the confound(s).
        """

        self.confound = confound
        self.cross_validate = cross_validate
        self.X = X
        self.precise = precise
        self.stack_intercept = stack_intercept
        self.weights_ = None
        self.nonzero_X_ = None


    def fit(self, X, y=None):
        """ Fits the confound-regressor to X.

        Parameters
        ----------
        X : numpy array
            An array of shape (n_samples, n_features), which should correspond
            to your train-set only!
        y : None
            Included for compatibility; does nothing.
        """

        if self.stack_intercept:
            icept = np.ones(self.confound.shape[0])
            self.confound = np.c_[icept, self.confound]

        # Find nonzero voxels (i.e., voxels which have not all zero
        # values across samples)
        if self.nonzero_X_ is None:
            self.nonzero_X_ = np.sum(self.X, axis=0) != 0
            self.X = self.X[:, self.nonzero_X_]

        X_nz = X[:, self.nonzero_X_]
        confound = self.confound

        if self.precise:
            tmp = np.in1d(self.X, X_nz).reshape(self.X.shape)
            fit_idx = tmp.sum(axis=1) == self.X.shape[1]
        else:
            fit_idx = np.in1d(self.X.sum(axis=1), X_nz.sum(axis=1))

        confound_fit = confound[fit_idx, :]

        # Vectorized implementation estimating weights for all features
        self.weights_ = np.linalg.lstsq(confound_fit, X_nz, rcond=None)[0]
        return self


    def transform(self, X):
        """ Regresses out confound from X.

        Parameters
        ----------
        X : numpy array
            An array of shape (n_samples, n_features), which should correspond
            to your train-set only!

        Returns
        -------
        X_new : ndarray
            ndarray with confound-regressed features
        """

        if not self.cross_validate:
            self.fit(X)

        X_nz = X[:, self.nonzero_X_]

        if self.precise:
            tmp = np.in1d(self.X, X_nz).reshape(self.X.shape)
            transform_idx = tmp.sum(axis=1) == self.X.shape[1]
        else:
            transform_idx = np.in1d(self.X.sum(axis=1), X_nz.sum(axis=1))

        confound_transform = self.confound[transform_idx]
        X_new = X - confound_transform.dot(self.weights_)
        X_corr = np.zeros_like(X)
        X_corr[:, self.nonzero_X_] = X_new
        return X_corr

## Import data

In [121]:
# demographics data
GSP_demographics_cleaned = pd.read_csv(resdir_gsp+'demographics_cleaned.csv')
HCP_demographics_cleaned = pd.read_csv(resdir_hcp+'demographics_cleaned.csv')

# CT data
GSP_ct_schaefer400 = pd.read_csv(resdir_gsp+'ct_schaefer400.csv')
HCP_ct_schaefer400 = pd.read_csv(resdir_hcp+'ct_schaefer400.csv')

# functional gradient (visual-heteromodal) data
GSP_fc_grad = pd.read_csv(resdir_gsp+'array_aligned_fc_G2.csv')  # G2
HCP_fc_grad = pd.read_csv(resdir_hcp+'array_aligned_fc_G1.csv')  # G1

In [132]:
# rename some data columns to make them same across samples
GSP_demographics_cleaned = GSP_demographics_cleaned.rename(columns={'Sex': 'sex', 'Age_Bin': 'age', 'ICV': 'icv'})
HCP_demographics_cleaned = HCP_demographics_cleaned.rename(columns={'Gender': 'sex', 'Age_in_Yrs': 'age', 'FS_IntraCranial_Vol': 'icv'})

## Classification with cross validation

### local structure

In [152]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, KFold 

In [113]:
##### define predictor and target variables

### target: sex
y_GSP = GSP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)

x_GSP = GSP_ct_schaefer400

## confound removal

# make a preprocessor confound regressor
confound_regressor = ConfoundRegressor(confound = np.array(GSP_demographics_cleaned[['age', 'global_ct']]), X = np.array(x_GSP))

# fit it to the predictor data
_ = confound_regressor.fit(X = np.array(x_GSP))

# transform the predictor data with confound variables regressed out
x_GSP_unconf = confound_regressor.transform(X = np.array(x_GSP))



##### create model via pipeline

# Support Vector Machine algorithms are not scale invariant, so it is highly recommended to scale your data
model = make_pipeline(StandardScaler(), SVC(kernel='rbf'))



##### within sample cross valiation 
cv_results = cross_validate(model, x_GSP_unconf, y_GSP, cv = 10)
scores = cv_results['test_score']

print(f"the mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

the mean cross-validation accuracy is: 0.840 +/- 0.020


### functional gradient


if I need to be able to distinguish the func/struct data (parcels): df = df.add_suffix('_some_suffix')

In [133]:
##### define predictor and target variables

### target: sex
y_GSP = GSP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)

x_GSP = GSP_fc_grad

## confound removal

# make a preprocessor confound regressor
confound_regressor = ConfoundRegressor(confound = np.array(GSP_demographics_cleaned[['age', 'icv']]), X = np.array(x_GSP))

# fit it to the predictor data
_ = confound_regressor.fit(X = np.array(x_GSP))

# transform the predictor data with confound variables regressed out
x_GSP_unconf = confound_regressor.transform(X = np.array(x_GSP))



##### create model via pipeline

# Support Vector Machine algorithms are not scale invariant, so it is highly recommended to scale your data
model = make_pipeline(StandardScaler(), SVC(kernel='rbf'))



##### within sample cross valiation 
cv_results = cross_validate(model, x_GSP_unconf, y_GSP, cv = 10)
scores = cv_results['test_score']

print(f"the mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

the mean cross-validation accuracy is: 0.587 +/- 0.038


### local structure and functional gradient

In [148]:
##### define predictor and target variables

### target: sex
y_GSP = GSP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)

x_GSP_struct = GSP_ct_schaefer400
x_GSP_func = GSP_fc_grad


## confound removal (removing structural and functional data's confounding variables respectively)

# make a preprocessor confound regressor
confound_regressor_struct = ConfoundRegressor(confound = np.array(GSP_demographics_cleaned[['age', 'global_ct']]), X = np.array(x_GSP_struct))
confound_regressor_func = ConfoundRegressor(confound = np.array(GSP_demographics_cleaned[['age', 'icv']]), X = np.array(x_GSP_func))

# fit it to the predictor data
_ = confound_regressor_struct.fit(X = np.array(x_GSP_struct))
_ = confound_regressor_func.fit(X = np.array(x_GSP_func))

# transform the predictor data with confound variables regressed out
x_GSP_struct_unconf = confound_regressor_struct.transform(X = np.array(x_GSP_struct))
x_GSP_func_unconf = confound_regressor_func.transform(X = np.array(x_GSP_func))

# concatenate the two arrays stored in one predictor variable (800 parcels (400 structural, 400 functional)
x_GSP_unconf = np.concatenate((x_GSP_struct_unconf, x_GSP_func_unconf), axis = 1)



##### create model via pipeline

# Support Vector Machine algorithms are not scale invariant, so it is highly recommended to scale your data
model = make_pipeline(StandardScaler(), SVC(kernel='rbf'))



##### within sample cross valiation 
cv_results = cross_validate(model, x_GSP_unconf, y_GSP, cv = 10)
scores = cv_results['test_score']

print(f"the mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

the mean cross-validation accuracy is: 0.818 +/- 0.027


## Classification out of sample

**STILL NEED TO DO: REGRESS OUT FAMILY AND TWIN IN HCP PART** https://techoverflow.net/2019/05/22/how-to-fix-numpy-typeerror-cannot-cast-ufunc-subtract-output-from-dtypefloat64-to-dtypeint64-with-casting-rule-same_kind/

### local CT

In [181]:
# TRAIN

##### define predictor and target variables

### target: sex
y_GSP = GSP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)
x_GSP = GSP_ct_schaefer400

## confound removal

# make a preprocessor confound regressor
confound_regressor = ConfoundRegressor(confound = np.array(GSP_demographics_cleaned[['age', 'global_ct']]), X = np.array(x_GSP))

# fit it to the predictor data
_ = confound_regressor.fit(X = np.array(x_GSP))

# transform the predictor data with confound variables regressed out
x_GSP_unconf = confound_regressor.transform(X = np.array(x_GSP))


##### preprocess

scaler = StandardScaler()
_ = scaler.fit(x_GSP_unconf)
x_GSP_unconf_scaled = scaler.transform(x_GSP_unconf)


##### model

model = SVC(kernel='rbf')

_ = model.fit(x_GSP_unconf_scaled, y_GSP)




# TEST

##### define predictor and target variables

### target: sex
y_HCP = HCP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)
x_HCP = HCP_ct_schaefer400

## confound removal

# make a preprocessor confound regressor # ADD Twin stat and Family ID (but categorical so make own preprocessing
confound_regressor = ConfoundRegressor(confound = np.array(HCP_demographics_cleaned[['age', 'global_ct']]), X = np.array(x_HCP))

# fit it to the predictor data
_ = confound_regressor.fit(X = np.array(x_HCP))

# transform the predictor data with confound variables regressed out
x_HCP_unconf = confound_regressor.transform(X = np.array(x_HCP))


##### preprocess - scale using prefitted scaler on GSP data (correct?) by simply transforming because this is test data 
    # this is what is instructed at https://stackoverflow.com/questions/57775530/why-does-calling-transform-on-test-data-return-an-error-that-the-data-is-not-f?rq=1
x_HCP_unconf_scaled = scaler.transform(x_HCP_unconf)


##### compute prediction accuracy
score = model.score(x_HCP_unconf_scaled, y_HCP)

print(f"the model's out of sample prediction accuracy is: {score:.3f}")

the model's out of sample prediction accuracy is: 0.736


### TRYING TO INCLUDE CATEGORICAL CONFOUND REGRESSION BELOW: ERROR
### TROUBLESHOOT https://techoverflow.net/2019/05/22/how-to-fix-numpy-typeerror-cannot-cast-ufunc-subtract-output-from-dtypefloat64-to-dtypeint64-with-casting-rule-same_kind/

In [186]:
# TRAIN

##### define predictor and target variables

### target: sex
y_GSP = GSP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)
x_GSP = GSP_ct_schaefer400

## confound removal

# make a preprocessor confound regressor
confound_regressor = ConfoundRegressor(confound = np.array(GSP_demographics_cleaned[['age', 'global_ct']]), X = np.array(x_GSP))

# fit it to the predictor data
_ = confound_regressor.fit(X = np.array(x_GSP))

# transform the predictor data with confound variables regressed out
x_GSP_unconf = confound_regressor.transform(X = np.array(x_GSP))


##### preprocess

scaler = StandardScaler()
_ = scaler.fit(x_GSP_unconf)
x_GSP_unconf_scaled = scaler.transform(x_GSP_unconf)


##### model

model = SVC(kernel='rbf')

_ = model.fit(x_GSP_unconf_scaled, y_GSP)




# TEST

##### define predictor and target variables

### target: sex
y_HCP = HCP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)
x_HCP = HCP_ct_schaefer400

## confound removal

# make a preprocessor confound regressor # ADD Twin stat and Family ID (but categorical so make own preprocessing
confound_regressor = ConfoundRegressor(confound = np.array(HCP_demographics_cleaned[['age', 'global_ct', 'Family_ID', 'TwinStatus']]), X = np.array(x_HCP))

# fit it to the predictor data
_ = confound_regressor.fit(X = np.array(x_HCP))

# transform the predictor data with confound variables regressed out
x_HCP_unconf = confound_regressor.transform(X = np.array(x_HCP))


##### preprocess - scale using prefitted scaler on GSP data (correct?) by simply transforming because this is test data 
    # this is what is instructed at https://stackoverflow.com/questions/57775530/why-does-calling-transform-on-test-data-return-an-error-that-the-data-is-not-f?rq=1
x_HCP_unconf_scaled = scaler.transform(x_HCP_unconf)


##### compute prediction accuracy
score = model.score(x_HCP_unconf_scaled, y_HCP)

print(f"the model's out of sample prediction accuracy is: {score:.3f}")

UFuncTypeError: Cannot cast ufunc 'lstsq_n' input 0 from dtype('O') to dtype('float64') with casting rule 'same_kind'

### functional gradient

In [183]:
# TRAIN

##### define predictor and target variables

### target: sex
y_GSP = GSP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)
x_GSP = GSP_fc_grad

## confound removal

# make a preprocessor confound regressor
confound_regressor = ConfoundRegressor(confound = np.array(GSP_demographics_cleaned[['age', 'icv']]), X = np.array(x_GSP))

# fit it to the predictor data
_ = confound_regressor.fit(X = np.array(x_GSP))

# transform the predictor data with confound variables regressed out
x_GSP_unconf = confound_regressor.transform(X = np.array(x_GSP))


##### preprocess

scaler = StandardScaler()
_ = scaler.fit(x_GSP_unconf)
x_GSP_unconf_scaled = scaler.transform(x_GSP_unconf)


##### model

model = SVC(kernel='rbf')

_ = model.fit(x_GSP_unconf_scaled, y_GSP)




# TEST

##### define predictor and target variables

### target: sex
y_HCP = HCP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)
x_HCP = HCP_fc_grad

## confound removal

# make a preprocessor confound regressor # ADD Twin stat and Family ID (but categorical so make own preprocessing
confound_regressor = ConfoundRegressor(confound = np.array(HCP_demographics_cleaned[['age', 'icv']]), X = np.array(x_HCP))

# fit it to the predictor data
_ = confound_regressor.fit(X = np.array(x_HCP))

# transform the predictor data with confound variables regressed out
x_HCP_unconf = confound_regressor.transform(X = np.array(x_HCP))


##### preprocess - scale using prefitted scaler on GSP data (correct?) by simply transforming because this is test data 
    # this is what is instructed at https://stackoverflow.com/questions/57775530/why-does-calling-transform-on-test-data-return-an-error-that-the-data-is-not-f?rq=1
x_HCP_unconf_scaled = scaler.transform(x_HCP_unconf)


##### compute prediction accuracy
score = model.score(x_HCP_unconf_scaled, y_HCP)

print(f"the model's out of sample prediction accuracy is: {score:.3f}")

the model's out of sample prediction accuracy is: 0.598


### local structure and functional gradient

In [185]:
# TRAIN

##### define predictor and target variables

### target: sex
y_GSP = GSP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)
x_GSP_struct = GSP_ct_schaefer400
x_GSP_func = GSP_fc_grad


## confound removal (removing structural and functional data's confounding variables respectively)

# make a preprocessor confound regressor
confound_regressor_struct = ConfoundRegressor(confound = np.array(GSP_demographics_cleaned[['age', 'global_ct']]), X = np.array(x_GSP_struct))
confound_regressor_func = ConfoundRegressor(confound = np.array(GSP_demographics_cleaned[['age', 'icv']]), X = np.array(x_GSP_func))

# fit it to the predictor data
_ = confound_regressor_struct.fit(X = np.array(x_GSP_struct))
_ = confound_regressor_func.fit(X = np.array(x_GSP_func))

# transform the predictor data with confound variables regressed out
x_GSP_struct_unconf = confound_regressor_struct.transform(X = np.array(x_GSP_struct))
x_GSP_func_unconf = confound_regressor_func.transform(X = np.array(x_GSP_func))

# concatenate the two arrays stored in one predictor variable (800 parcels (400 structural, 400 functional)
x_GSP_unconf = np.concatenate((x_GSP_struct_unconf, x_GSP_func_unconf), axis = 1)


##### preprocess

scaler = StandardScaler()
_ = scaler.fit(x_GSP_unconf)
x_GSP_unconf_scaled = scaler.transform(x_GSP_unconf)


##### model

model = SVC(kernel='rbf')

_ = model.fit(x_GSP_unconf_scaled, y_GSP)




# TEST

##### define predictor and target variables

### target: sex
y_HCP = HCP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)
x_HCP_struct = HCP_ct_schaefer400
x_HCP_func = HCP_fc_grad


## confound removal (removing structural and functional data's confounding variables respectively)

# make a preprocessor confound regressor
confound_regressor_struct = ConfoundRegressor(confound = np.array(HCP_demographics_cleaned[['age', 'global_ct']]), X = np.array(x_HCP_struct))
confound_regressor_func = ConfoundRegressor(confound = np.array(HCP_demographics_cleaned[['age', 'icv']]), X = np.array(x_HCP_func))

# fit it to the predictor data
_ = confound_regressor_struct.fit(X = np.array(x_HCP_struct))
_ = confound_regressor_func.fit(X = np.array(x_HCP_func))

# transform the predictor data with confound variables regressed out
x_HCP_struct_unconf = confound_regressor_struct.transform(X = np.array(x_HCP_struct))
x_HCP_func_unconf = confound_regressor_func.transform(X = np.array(x_HCP_func))

# concatenate the two arrays stored in one predictor variable (800 parcels (400 structural, 400 functional)
x_HCP_unconf = np.concatenate((x_HCP_struct_unconf, x_HCP_func_unconf), axis = 1)



##### preprocess - scale using prefitted scaler on GSP data (correct?) by simply transforming because this is test data 
    # this is what is instructed at https://stackoverflow.com/questions/57775530/why-does-calling-transform-on-test-data-return-an-error-that-the-data-is-not-f?rq=1
x_HCP_unconf_scaled = scaler.transform(x_HCP_unconf)


##### compute prediction accuracy
score = model.score(x_HCP_unconf_scaled, y_HCP)

print(f"the model's out of sample prediction accuracy is: {score:.3f}")


the model's out of sample prediction accuracy is: 0.738


## Classification out of sample

**STILL NEED TO DO: REGRESS OUT FAMILY AND TWIN IN HCP PART**

### local CT

In [181]:
# TRAIN

##### define predictor and target variables

### target: sex
y_GSP = GSP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)
x_GSP = GSP_ct_schaefer400

## confound removal

# make a preprocessor confound regressor
confound_regressor = ConfoundRegressor(confound = np.array(GSP_demographics_cleaned[['age', 'global_ct']]), X = np.array(x_GSP))

# fit it to the predictor data
_ = confound_regressor.fit(X = np.array(x_GSP))

# transform the predictor data with confound variables regressed out
x_GSP_unconf = confound_regressor.transform(X = np.array(x_GSP))


##### preprocess

scaler = StandardScaler()
_ = scaler.fit(x_GSP_unconf)
x_GSP_unconf_scaled = scaler.transform(x_GSP_unconf)


##### model

model = SVC(kernel='rbf')

_ = model.fit(x_GSP_unconf_scaled, y_GSP)




# TEST

##### define predictor and target variables

### target: sex
y_HCP = HCP_demographics_cleaned['sex']


### predictor: brain data (400 parcels)
x_HCP = HCP_ct_schaefer400

## confound removal

# make a preprocessor confound regressor # ADD Twin stat and Family ID (but categorical so make own preprocessing
confound_regressor = ConfoundRegressor(confound = np.array(HCP_demographics_cleaned[['age', 'global_ct']]), X = np.array(x_HCP))

# fit it to the predictor data
_ = confound_regressor.fit(X = np.array(x_HCP))

# transform the predictor data with confound variables regressed out
x_HCP_unconf = confound_regressor.transform(X = np.array(x_HCP))


##### preprocess - scale using prefitted scaler on GSP data (correct?) by simply transforming because this is test data 
    # this is what is instructed at https://stackoverflow.com/questions/57775530/why-does-calling-transform-on-test-data-return-an-error-that-the-data-is-not-f?rq=1
x_HCP_unconf_scaled = scaler.transform(x_HCP_unconf)


##### compute prediction accuracy
score = model.score(x_HCP_unconf_scaled, y_HCP)

print(f"the model's out of sample prediction accuracy is: {score:.3f}")

the model's out of sample prediction accuracy is: 0.736


### functional gradient

In [14]:
from sklearn import svm
X = [[0, 0], [1, 1]]
y = [0, 1]
clf = svm.SVC(kernel='rbf')
clf.fit(X, y)


SVC()

In [15]:
clf.predict([[2., 2.]])

array([1])

In [16]:
# get support vectors (=training data)
print(clf.support_vectors_)

# get indices of support vectors
print(clf.support_)

# get number of support vectors for each class
print(clf.n_support_)

[[0. 0.]
 [1. 1.]]
[0 1]
[1 1]


In [131]:
x_GSP

Unnamed: 0,struct_1struct_,struct_ 2struct_,struct_ 3struct_,struct_ 4struct_,struct_ 5struct_,struct_ 6struct_,struct_ 7struct_,struct_ 8struct_,struct_ 9struct_,struct_ 10struct_,...,struct_ 391struct_,struct_ 392struct_,struct_ 393struct_,struct_ 394struct_,struct_ 395struct_,struct_ 396struct_,struct_ 397struct_,struct_ 398struct_,struct_ 399struct_,struct_ 400struct_
0,-0.093285,0.014340,-0.090736,-0.106147,-0.094011,-0.098007,-0.025013,-0.098085,-0.088955,-0.046987,...,0.084511,0.053425,0.052758,0.025645,0.109971,0.100447,0.083106,0.083238,0.102053,-0.013530
1,0.024041,0.105338,-0.071556,-0.059716,-0.104843,-0.086958,0.065324,-0.078393,-0.091654,-0.077376,...,0.083916,0.056192,-0.060981,-0.021545,0.133430,0.072175,-0.015382,0.053279,0.108403,0.005609
2,-0.049901,0.059752,-0.125352,-0.064487,-0.093566,-0.110654,-0.019181,-0.132276,-0.071831,-0.114914,...,0.070498,0.056943,0.056084,0.029230,0.083126,0.078022,0.079190,0.038425,0.073943,0.012058
3,-0.015274,0.054603,-0.075639,-0.127220,-0.123355,-0.117995,0.012339,-0.071264,-0.010436,-0.116792,...,0.096095,0.061396,0.057976,-0.001887,0.122786,0.123089,0.078660,0.071941,0.117249,0.074399
4,-0.017485,0.035287,-0.089764,-0.059690,-0.096601,-0.084487,-0.052344,-0.090220,-0.095404,-0.074597,...,0.076119,0.048153,-0.039816,-0.015104,0.094465,0.077439,0.037047,0.036123,0.072920,0.005451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1563,-0.120128,0.044156,-0.137043,-0.146324,-0.122973,-0.094081,0.008933,-0.062940,-0.113368,-0.122586,...,0.061912,0.033051,0.004043,0.032202,0.063845,0.055230,0.047795,0.054769,0.080993,0.021291
1564,0.009092,0.059904,-0.057393,-0.095516,-0.087502,-0.062153,-0.012518,-0.064893,-0.079708,-0.096062,...,0.078555,0.044381,-0.048281,-0.081537,0.110326,0.090343,0.013574,-0.060747,0.108213,-0.020079
1565,-0.065135,-0.008746,-0.088779,-0.092067,-0.082901,-0.047635,-0.061776,-0.088769,-0.014669,-0.081091,...,0.066905,-0.005900,-0.004692,0.031398,0.053522,0.053951,0.072585,0.069221,0.086213,0.037602
1566,0.020420,0.025629,-0.129486,-0.121391,-0.141431,-0.131816,0.071019,-0.030575,-0.127403,-0.066715,...,0.033368,0.069310,0.044957,0.063577,0.088689,0.090828,0.074498,0.066097,0.085203,0.068996


In [130]:
x_GSP = x_GSP.add_prefix('struct_')

Sofie's script

In [None]:
'''
This is a python script for running supervised machine learning to predict compassion score

data input : foldname

data output $np.save..... 
'''

import numpy as np
import pandas as pd
import scipy.stats as ss
import sklearn.linear_model as slm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import warnings
warnings.filterwarnings("ignore")

df_data = pd.read_csv ('../data/attention_data.csv')
sex_code = df_data['sex1'].unique()
if type(sex_code[0]) == str:
    sex_dict = {k: idx for idx, k in enumerate(sex_code)}
    df_data['sex1'] = df_data['sex1'].replace(sex_dict)  


def model_elasticnet(m): # l1_ratio
  print('m', str(m))

  dic = {}
  for i in range(sample): 
    lr = slm.ElasticNetCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1], l1_ratio=m, cv=5)
    sfs = SFS(lr, k_features=7, forward=True, floating=False, n_jobs=-1,
              scoring='neg_mean_absolute_error', cv=False)
    model = sfs.fit(x_correct[i], y_train[i])
    x_1 = model.transform(x_correct[i])
    x_2 = model.transform(x_test_corr[i])
    model2 = lr.fit(x_1, y_train[i])
    y_pred = lr.predict(x_2)
    corr = ss.pearsonr(y_pred, y_test[i])
    a = model.get_metric_dict()[7]
    a['importance'] = model.estimator.coef_
    a['intercept'] = model.estimator.intercept_
    a['alpha_best'] = model.estimator.alpha_
    a['predict_test_r_p'] = np.array(corr)
    a['mean_lr_mse']  = model2.mse_path_.mean()
    dic['train_test_'+str(i+1)] = a
    print('finish model......', str(i+1))
    print(a)
  np.save('../results/'+foldname+'feature_20_l1ratio_' + str(m)+ 'zscore.npy', dic)
  return print('feature_' + str(20) + '_l1ratio_' + str(m)+'   finished')

foldname=['attention']
for foldname in foldname:

  # IMPORT sample train_test iterations
  sample = 100

  x_train = [None] * sample
  y_train = [None] * sample
  x_test = [None] * sample
  y_test = [None] * sample
  x_correct = [None]  * sample
  x_test_corr = [None] * sample
  for i in range(sample):
    Y_col = 'val1'
    X_cols = df_data.loc[:, df_data.columns != Y_col].columns

    x_train[i], x_test[i], y_train[i], y_test[i] = train_test_split(        
          df_data[X_cols], df_data.iloc[:, 1], test_size=0.3, random_state=i)
	
    x_conf = x_train[i].iloc[:,[2,3]]
    y_conf = x_train[i].iloc[:,4:39]
    x, y   = np.array(x_conf), np.array(y_conf)
    model_conf = LinearRegression().fit(x, y)
    y_pred = model_conf.predict(x)
    x_correct[i] = y  - y_pred
    x_conf = x_test[i].iloc[:,[2,3]]
    y_conf = x_test[i].iloc[:,4:39]
    x, y   = np.array(x_conf), np.array(y_conf)
    model_conf = LinearRegression().fit(x, y)
    y_pred = model_conf.predict(x)
    x_test_corr[i] = y  - y_pred


  regulation = [1.0]

  for j in regulation:
    model_elasticnet(m=j)