# Notebook to generate templates for Transformer-based models

In [1]:
import os
import yaml
import numpy as np
import itertools

from utils import check_folder, read_yaml, save_yaml, write

In [2]:
path_to_main = "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/code/fMRI/fast_main.py"

In [3]:
subject_dict = {'english': [57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
                    72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 91, 92, 93,
                    94, 95, 96, 97, 98, 99, 100, 101, 103, 104, 105, 106, 108, 109, 110, 113, 114, 115],
                'french':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                          21, 22, 23, 24, 25, 26, 27, 29, 30
                         ]
               }

In [4]:
hrf_list = [
    'spm', # hrf model used in SPM
    'spm + derivative', # SPM model plus its time derivative (2 regressors)
    'spm + derivative + dispersion', # idem, plus dispersion derivative (3 regressors)
    'glover', # this one corresponds to the Glover hrf
    'glover + derivative', # the Glover hrf + time derivative (2 regressors)
    'glover + derivative + dispersion' # idem + dispersion derivative
]
hrf = 'spm'

In [5]:
language = 'english'
temporal_shifting = 0

In [6]:
template = {
  # Shared General parameters
  'subject': None,
  'parallel': False,
  'cuda': True,
  'seed': 1111,
  'language': None,
  'path_to_root': "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/",
  'path_to_fmridata': "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/fMRI",
  'output': "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/derivatives/fMRI/maps/"  ,
  'input': "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/" ,
  'detrend': True, 
  'standardize': True, 
  'high_pass': None, 
  'low_pass': None, 
  'mask_strategy': 'background', 
  #'dtype': 'float32', 
  'memory_level': 0, 
  'smoothing_fwhm': None , 
  'verbose': 0, 


  # Shared Splitter parameters
  'nb_runs': 9,
  'nb_runs_test': 1,

  # Shared Compression parameters
  'manifold_method': None,
  'manifold_args': {'n_neighbors':4, 'random_state':1111, 'min_dist':0.0, 'metric':'cosine'},

  # Shared Transformation parameters (includes the making of regressor and scaling)
  'tr': 2.,
  'scaling_mean': True,
  'scaling_var': True,
  'scaling_axis': 0,
  'hrf': None,
  'offset_path': "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/onsets-offsets/",
  'duration_path': "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/",
  'temporal_shifting': 0,
  'oversampling': 10,
  'add_noise_to_constant': True,

  # Shared Estimator model parameters
  'base': 10.0,
  'voxel_wise': True,
  'alpha_percentile': 99.9,
  'alpha': 100,
  'alpha_min_log_scale': 2,
  'alpha_max_log_scale': 3,
  'nb_alphas': 1,
  'optimizing_criteria': 'R2',
  'estimator_model': 'Ridge()',
  'save_all_weights': False, 

  # Maps creation parameters
  'atlas': 'cort-prob-2mm',
  'masker_path': "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/derivatives/fMRI/ROI_masks/global_masker_english",
  'smoothed_masker_path': "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/derivatives/fMRI/ROI_masks/smoothed_global_masker_english",


  # Models
  'models': None, 
  'model_name': None
}

In [7]:
def write_commands(command_lines, path_to_sh, job_to_launch_path, queue='Nspin_bigM'):
    """Emit the shell script and the `qsub` submission line for every command.

    Args:
        command_lines: commands to store, one per job.
        path_to_sh: script paths, aligned index-by-index with `command_lines`.
        job_to_launch_path: file receiving one `qsub` line per job; its first
            path component also replaces the first component of each script
            path when the script itself is written.
        queue: name of the queue given to `qsub -q`.

    Note:
        All writing is delegated to the project helper `write(path, text)`
        (presumably appending -- confirm in `utils`).
    """
    walltime = '99:00:00'
    for index, command in enumerate(command_lines):
        script_path = path_to_sh[index]
        # Same file, but addressed through the root used by `job_to_launch_path`.
        local_root = job_to_launch_path.split('/')[1]
        script_tail = '/'.join(script_path.split('/')[2:])
        write(os.path.join('/', local_root, script_tail), command)

        # Log files are numbered by the position of the job in the list.
        output_log = '/home/ap263679/logs/log_o_{}'.format(index)
        error_log = '/home/ap263679/logs/log_e_{}'.format(index)
        # Job name: script file name up to its first dot.
        job_name = os.path.basename(script_path).split('.')[0]
        write(
            job_to_launch_path,
            f"qsub -q {queue} -N {job_name} -l ncpus=6 -l walltime={walltime} -o {output_log} -e {error_log} {path_to_sh[index]}",
        )


In [8]:
def get_BF_template(
    model_name,
    column_names,
    surname,
    data_compression,
    ncomponents,
    offset_type,
    duration_type,
    shift_surprisal,
    centering,
    order,
    scaling_type,
    input_template='activations'):
    """Describe one baseline-feature model as a plain dictionary.

    Every argument is stored unchanged under the key of the same name, except
    `column_names`, which is stored as its `str()` representation under
    `'columns_to_retrieve'`.

    Returns:
        dict: the model description, keys in a fixed insertion order.
    """
    description = {}
    description['model_name'] = model_name
    description['columns_to_retrieve'] = str(column_names)
    description['surname'] = surname
    description['data_compression'] = data_compression
    description['ncomponents'] = ncomponents
    description['offset_type'] = offset_type  # word / word+punctuation / ...
    description['duration_type'] = duration_type
    description['shift_surprisal'] = shift_surprisal
    description['input_template'] = input_template  # activations
    description['centering'] = centering
    description['order'] = order
    description['scaling_type'] = scaling_type
    return description

In [9]:
def get_model_template(
    model_name, 
    layers, 
    hidden_states, 
    attention_heads, 
    heads,
    surname,
    data_compression, 
    ncomponents,
    nb_hidden_states=768,
    offset_type='word+punctuation',
    duration_type=None,
    centering=False,
    shift_surprisal=False,
    scaling_type=None,
    order=None,
    input_template='activations',
    attention_head_size=64):
    """Describe one Transformer model and the activation columns to load for it.

    Args:
        model_name: stored under `'model_name'`.
        layers: layer indices used to build the column names.
        hidden_states: if true, select the `hidden_state-layer-<l>-<i>` columns
            for every layer in `layers` and `i` in `1..nb_hidden_states`.
        attention_heads: if true, also select the
            `attention-layer-<l>-head-<h>-<i>` columns for every layer in
            `layers`, head in `heads` and `i` in `1..attention_head_size`.
        heads: head indices; only iterated when `attention_heads` is true.
        surname, data_compression, ncomponents, offset_type, duration_type,
        centering, shift_surprisal, scaling_type, order, input_template:
            stored unchanged under the key of the same name.
        nb_hidden_states: number of units per hidden layer.
        attention_head_size: number of units per attention head. Defaults to
            64, the value that used to be hard-coded.

    Returns:
        dict: the model description; `'columns_to_retrieve'` holds the `str()`
        of the list of selected column names.
    """
    columns_to_retrieve = []
    if hidden_states:
        columns_to_retrieve = ['hidden_state-layer-{}-{}'.format(layer, i) for layer in layers for i in range(1, nb_hidden_states + 1)]
    if attention_heads:
        columns_to_retrieve += ['attention-layer-{}-head-{}-{}'.format(layer, head, i) for layer in layers for head in heads for i in range(1, attention_head_size + 1)]
    result = { 
        'model_name': model_name,
        'columns_to_retrieve': str(columns_to_retrieve),
        'surname': surname,
        'data_compression': data_compression,
        'ncomponents': ncomponents,
        'offset_type': offset_type, # word / word+punctuation / ...,
        'duration_type': duration_type,
        'shift_surprisal': shift_surprisal,
        'centering': centering,
        'order': order,
        'scaling_type': scaling_type,
        'input_template': input_template # cls / sep / activations
      }
    return result
    

## Here we start the generation

In [125]:
# Destination folders for the YAML templates and the shell scripts, written with
# the "/neurospin" root; these are the paths embedded in the generated commands.
templates_folder = "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/tmp_bert_units_768_8/templates/"
sh_folder = "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/tmp_bert_units_768_8/shell_commands/"
# File that collects the qsub lines, written with the "/Volumes" root
# (presumably the same share as mounted on the machine running this notebook -- confirm).
job_to_launch_path = "/Volumes/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/tmp_bert_units_768_8/jobs.txt"
# Swap the first path component of each folder for the one of `job_to_launch_path`
# before handing it to `check_folder` (presumably creates the folder if missing -- see utils).
check_folder(os.path.join('/', job_to_launch_path.split('/')[1], '/'.join(templates_folder.split('/')[2:])))
check_folder(os.path.join('/', job_to_launch_path.split('/')[1], '/'.join(sh_folder.split('/')[2:])))


In [126]:
def fill_template_folder(
    templates_folder,
    sh_folder,
    job_to_launch_path,
    model_names, 
    language, 
    template, 
    hidden_layer_list,
    attention_layer_list,
    attention_layer_head_list,
    nb_hidden_states,
    centering,
    order,
    scaling_type,
    input_template,
    data_compression,
    ncomponents,
    temporal_shifting,
    path_to_main=path_to_main,
    extra=None,
    extra_name='',
    heads=None
):
    """Write one YAML template per (model, subject, feature selection) and list the jobs.

    For every model in `model_names` and every subject of `subject_dict[language]`
    (notebook-level dictionary), three families of templates are generated:
    one per entry of `hidden_layer_list`, one per entry of
    `attention_layer_list`, and one per `(layer, head)` pair of
    `attention_layer_head_list`.

    Args:
        templates_folder: folder whose path is embedded in the YAML file names.
        sh_folder: folder whose path is embedded in the shell script names.
        job_to_launch_path: only its first path component is used, to rewrite
            the root of each YAML path before saving.
        model_names: models to process; its index selects the entry of every
            per-model list below.
        language: key into `subject_dict`.
        template: shared configuration dictionary. It is MUTATED in place
            (`'subject'`, `'models'`, `'model_name'`) and saved after each update.
        hidden_layer_list: groups of hidden layers, one template per group.
        attention_layer_list: groups of attention layers, one template per group.
        attention_layer_head_list: `(layer, head)` pairs, one template per pair.
        nb_hidden_states, centering, order, scaling_type, data_compression,
        ncomponents: per-model lists, indexed like `model_names`.
        input_template: forwarded to `get_model_template`.
        temporal_shifting: only used in the generated names.
        path_to_main: script invoked by each generated command.
        extra: additional model descriptions appended after the main model
            in `template['models']` (default: none).
        extra_name: suffix appended to every generated name.
        heads: attention heads used for the `attention_layer_list` templates.
            When omitted, the notebook-level variable `heads` is used, as in
            earlier versions of this function.

    Returns:
        tuple: `(path_to_sh, command_lines)`, two lists aligned index-by-index.

    Raises:
        ValueError: if attention layers are requested and no heads are available.
    """
    # A fresh list per call instead of a shared mutable default.
    extra = [] if extra is None else list(extra)
    if heads is None:
        # Backward compatibility with the former implicit global.
        heads = globals().get('heads')
    if len(attention_layer_list) > 0 and heads is None:
        raise ValueError("`heads` must be provided when `attention_layer_list` is not empty.")

    # With a single group of hidden layers the historical "hidden-all-layers"
    # name is kept. With several groups every group needs its own name,
    # otherwise all of them target the same YAML/script and overwrite each other.
    one_name_per_layer_group = len(hidden_layer_list) > 1

    command_lines = []
    path_to_sh = []
    stem_format = '{}_norm-{}_temporal-shifting-{}_{}_{}'

    def _to_local_path(path):
        """Replace the first component of `path` by the one of `job_to_launch_path`."""
        return os.path.join('/', job_to_launch_path.split('/')[1], '/'.join(path.split('/')[2:]))

    def _register_job(model, stem):
        """Store `model` in the template, save the YAML and record command + script path."""
        template['models'] = [model] + extra
        template['model_name'] = stem.replace('np.', '') + extra_name
        yaml_path = os.path.join(templates_folder, '{}{}.yml'.format(stem, extra_name)).replace('np.', '')
        save_yaml(template, _to_local_path(yaml_path))
        command_lines.append("python {} --yaml_file {}".format(path_to_main, yaml_path))
        path_to_sh.append(os.path.join(sh_folder, '{}{}.sh'.format(stem, extra_name).replace('np.', '')))

    for index, model_name in enumerate(model_names):
        for subject in subject_dict[language]:
            template['subject'] = subject

            # hidden layers comparison
            for hidden_layers in hidden_layer_list:
                model = get_model_template(model_name=model_name,
                                           layers=hidden_layers,
                                           hidden_states=True,
                                           attention_heads=False,
                                           heads=None,
                                           nb_hidden_states=nb_hidden_states[index],
                                           surname="{}_hidden-layer-{}".format(model_name, hidden_layers),
                                           data_compression=data_compression[index],
                                           ncomponents=ncomponents[index],
                                           offset_type="word+punctuation",
                                           duration_type=None,
                                           centering=centering[index],
                                           order=order[index],
                                           shift_surprisal=False,
                                           scaling_type=scaling_type[index],
                                           input_template=input_template
                                          )
                additional = '_{}_{}'.format(data_compression[index], ncomponents[index]) if data_compression[index] is not None else ''
                if one_name_per_layer_group:
                    layer_tag = 'hidden-layer-{}'.format('-'.join([str(i) for i in hidden_layers]))
                else:
                    layer_tag = 'hidden-all-layers'
                stem = stem_format.format(model_name, order[index], temporal_shifting, subject, layer_tag + additional)
                _register_job(model, stem)

            # attention layers comparison
            for attention_layers in attention_layer_list:
                model = get_model_template(model_name=model_name,
                                           layers=attention_layers,
                                           hidden_states=False,
                                           attention_heads=True,
                                           heads=heads,
                                           surname="{}_norm-{}_attention-layer-{}".format(model_name, order[index], attention_layers),
                                           data_compression=data_compression[index],
                                           ncomponents=ncomponents[index],
                                           offset_type="word+punctuation",
                                           duration_type=None,
                                           centering=centering[index],
                                           order=order[index],
                                           shift_surprisal=False,
                                           scaling_type=scaling_type[index],
                                           input_template=input_template
                                          )
                # Only the first layer of the group appears in the name.
                stem = stem_format.format(model_name, order[index], temporal_shifting, subject,
                                          'attention-layer-{}'.format(attention_layers[0]))
                _register_job(model, stem)

            # specific attention heads comparison
            for (layer, head) in attention_layer_head_list:
                model = get_model_template(model_name=model_name,
                                           layers=[layer],
                                           hidden_states=False,
                                           attention_heads=True,
                                           heads=[head],
                                           surname="{}_norm-{}_attention-layer-{}-head-{}".format(model_name, order[index], layer, head),
                                           data_compression=None,
                                           ncomponents=None,
                                           offset_type="word+punctuation",
                                           duration_type=None,
                                           centering=centering[index],
                                           order=order[index],
                                           shift_surprisal=False,
                                           scaling_type=scaling_type[index],
                                           input_template=input_template
                                          )
                stem = stem_format.format(model_name, order[index], temporal_shifting, subject,
                                          'attention-layer-{}_head-{}'.format(layer, head))
                _register_job(model, stem)
    return path_to_sh, command_lines


### Layer-wise

In [81]:
model_names = [

#"bert-base-uncased_L-2_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-2_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-2_H-512_A-8_pre-20_norm-2",
#"bert-base-uncased_L-2_H-768_A-12_pre-20_norm-2",

#"bert-base-uncased_L-4_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-4_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-4_H-512_A-8_pre-20_norm-2",
#"bert-base-uncased_L-4_H-768_A-12_pre-20_norm-2",
#    
#"bert-base-uncased_L-6_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-6_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-6_H-512_A-8_pre-20_norm-2",
#"bert-base-uncased_L-6_H-768_A-12_pre-20_norm-2",
#    
#"bert-base-uncased_L-8_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-8_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-8_H-512_A-8_pre-20_norm-2",
#"bert-base-uncased_L-8_H-768_A-12_pre-20_norm-2",
#    
#"bert-base-uncased_L-10_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-10_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-10_H-512_A-8_pre-20_norm-2",
#"bert-base-uncased_L-10_H-768_A-12_pre-20_norm-2",
#    
"bert-base-uncased_L-12_H-128_A-2_pre-20_norm-2",
"bert-base-uncased_L-12_H-256_A-4_pre-20_norm-2",
"bert-base-uncased_L-12_H-512_A-8_pre-20_norm-2",
"bert-base-uncased_L-12_H-768_A-12_pre-20_norm-2",




]

In [82]:
#model_names = ['bert-base-cased', 'gpt2_scaled', 'roberta-base']
hidden_layer_list =  [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12]] # [[i] for i in range(13)] # 
attention_layer_list = [] # [[i] for i in range(1, 13)]
heads = np.arange(1, 13)
attention_layer_head_list = [] # list(itertools.product(np.arange(1, 13), np.arange(1, 13)))
#[[7, 6], [4, 10], [8, 1], [8,2], [6,7], [8, 10], [8, 11], [9, 6]]
command_lines = []
data_compression = [None] * 60
ncomponents = [None] * 60
order = [None] * 60
centering = ['True'] * 60
scaling_type = [None] * 60
input_template = 'activations'
scaling_axis = 0
temporal_shifting = 0
nb_hidden_states = [128, 256, 512, 768] * 60

In [14]:
BF_model_names = ['rms_chris']
BF_offset_types = ["rms_chris"] 

In [89]:
# One baseline-feature description per entry of `BF_model_names`,
# reading the 'amplitude' column and using the model name as surname.
extra = []
for index, model_name in enumerate(BF_model_names):
    extra.append(
        get_BF_template(
            model_name=model_name,
            column_names=['amplitude'],
            surname=model_name,
            data_compression=None,
            ncomponents=None,
            offset_type=BF_offset_types[index],
            duration_type=None,
            shift_surprisal=False,
            centering=True,
            order=None,
            scaling_type=None,
            input_template='activations',
        )
    )

In [83]:
template['scaling_axis'] = scaling_axis
template['language'] = language
template['temporal_shifting'] = temporal_shifting
template['hrf'] = hrf

In [84]:
# Generate the layer-wise templates and collect, for each of them, the command
# to run and the shell script that will hold it.
# NOTE(review): an empty list is passed as `extra`, so the notebook-level `extra`
# list of baseline-feature models is NOT added to the templates -- confirm this is intended.
path_to_sh, command_lines = fill_template_folder(
                    templates_folder,
                    sh_folder,
                    job_to_launch_path,
                    model_names, 
                    language, 
                    template, 
                    hidden_layer_list,
                    attention_layer_list,
                    attention_layer_head_list,
                    nb_hidden_states,
                    centering,
                    order,
                    scaling_type,
                    input_template,
                    data_compression,
                    ncomponents,
                    temporal_shifting,
                    path_to_main=path_to_main,
                    extra=[],
                    extra_name=''
                )

In [None]:
write_commands(command_lines, path_to_sh, job_to_launch_path)

In [None]:
template

In [39]:
len(command_lines)

408

### Full Model

In [127]:
model_names = [
    
#"bert-base-uncased_L-2_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-2_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-2_H-512_A-8_pre-20_norm-2",
#"bert-base-uncased_L-2_H-768_A-12_pre-20_norm-2",

#"bert-base-uncased_L-4_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-4_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-4_H-512_A-8_pre-20_norm-2",
#"bert-base-uncased_L-4_H-768_A-12_pre-20_norm-2",
#    
#"bert-base-uncased_L-6_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-6_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-6_H-512_A-8_pre-20_norm-2",
#"bert-base-uncased_L-6_H-768_A-12_pre-20_norm-2",
#    
#"bert-base-uncased_L-8_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-8_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-8_H-512_A-8_pre-20_norm-2",
"bert-base-uncased_L-8_H-768_A-12_pre-20_norm-2",
#    
#"bert-base-uncased_L-10_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-10_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-10_H-512_A-8_pre-20_norm-2",
#"bert-base-uncased_L-10_H-768_A-12_pre-20_norm-2",
#    
#"bert-base-uncased_L-12_H-128_A-2_pre-20_norm-2",
#"bert-base-uncased_L-12_H-256_A-4_pre-20_norm-2",
#"bert-base-uncased_L-12_H-512_A-8_pre-20_norm-2",
#"bert-base-uncased_L-12_H-768_A-12_pre-20_norm-2",

]

In [128]:
len(model_names)

1

In [129]:
# Full-model analysis settings: a single template using all hidden layers at once.
nb_layers = 8
hidden_layer_list = [[i for i in range(1, nb_layers + 1)]] # careful: check whether layer indices start at 0 or 1!
attention_layer_list = [] # no attention-layer template; alternative: [[i for i in range(1, 13)]]
attention_layer_head_list = [] # no single-head template; alternative: np.arange(1, 13)
command_lines = []
# The lists below are indexed by the position of the model in `model_names`.
data_compression = [None] * 65
ncomponents = [0] * 65
order = [None] * 65 # alternative: 'np.inf'
centering = ['True'] * 65  # stored as the string 'True', not the boolean
#scaling_type = ['standardize'] * 8 + ['normalize'] * 40
scaling_type = [None] * 65
input_template = 'activations'
scaling_axis = 0
temporal_shifting = 0
nb_hidden_states = [768] * 65


In [22]:
BF_model_names = ['wordrate', 'rms', 'log_frequency']
BF_offset_types = ["word", "rms_0.01", "word"] * len(model_names) # ["word", "rms_0.01"]

In [23]:
# One baseline-feature description per entry of `BF_model_names`;
# each model reads the column carrying its own name.
extra = []
for index, model_name in enumerate(BF_model_names):
    extra.append(
        get_BF_template(
            model_name=model_name,
            column_names=[model_name],
            surname=model_name,
            data_compression=None,
            ncomponents=None,
            offset_type=BF_offset_types[index],
            duration_type=None,
            shift_surprisal=False,
            centering=True,
            order='np.inf',
            scaling_type=0,
            input_template='activations',
        )
    )

In [24]:
#for index, model_name in enumerate(model_names):
#    additional = '_{}_{}'.format(data_compression[index], ncomponents[index]) if data_compression[index] is not None else ''
#    print('{}_norm-{}_{}_hidden-all-layers{}'.format(model_name, order[index], '{}',additional).replace('np.', ''))

In [25]:
template['estimator_model'] = 'B2B_reg()'
template['save_all_weights'] = True

In [130]:
template['scaling_axis'] = scaling_axis
template['language'] = language
template['temporal_shifting'] = temporal_shifting
template['hrf'] = hrf

In [131]:
# Generate the full-model templates; every generated name gets the '_alpha-100' suffix.
# NOTE(review): an empty list is passed as `extra`, so the notebook-level `extra`
# list of baseline-feature models is NOT added to the templates -- confirm this is intended.
path_to_sh, command_lines = fill_template_folder(
                    templates_folder,
                    sh_folder,
                    job_to_launch_path,
                    model_names, 
                    language, 
                    template, 
                    hidden_layer_list,
                    attention_layer_list,
                    attention_layer_head_list,
                    nb_hidden_states,
                    centering,
                    order,
                    scaling_type,
                    input_template,
                    data_compression,
                    ncomponents,
                    temporal_shifting,
                    path_to_main=path_to_main,
                    extra=[],
                    extra_name='_alpha-100'
                )

In [132]:
write_commands(command_lines, path_to_sh, job_to_launch_path)

In [116]:
template

{'subject': 115,
 'parallel': False,
 'cuda': True,
 'seed': 1111,
 'language': 'english',
 'path_to_root': '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/',
 'path_to_fmridata': '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/fMRI',
 'output': '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/derivatives/fMRI/maps/',
 'input': '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/',
 'detrend': True,
 'standardize': True,
 'high_pass': None,
 'low_pass': None,
 'mask_strategy': 'background',
 'memory_level': 0,
 'smoothing_fwhm': None,
 'verbose': 0,
 'nb_runs': 9,
 'nb_runs_test': 1,
 'manifold_method': None,
 'manifold_args': {'n_neighbors': 4,
  'random_state': 1111,
  'min_dist': 0.0,
  'metric': 'cosine'},
 'tr': 2.0,
 'scaling_mean': True,
 'scaling_var': True,
 'scaling_axis': 0,
 'hrf': 'spm',
 'offset_path': '/neurospin/unicog/protocols/IRMf/L

In [48]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [42]:
data= pd.read_csv('/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/english/bert-base-uncased_pre-2_1_post-0_norm-None/activations_run1.csv')

In [44]:
# Keep the first 13*768 entries of `columns` (presumably 13 layers x 768 units -- confirm).
# NOTE(review): `columns` is not defined in any cell visible in this notebook;
# this line fails on a fresh kernel unless it is defined first (perhaps `data.columns` -- confirm).
d = data[columns[:13*768]]

In [46]:
def ortho_proj(Y, M):
    """Return the component of Y orthogonal to the columns of M and to the constant vector.

    Y is regressed by least squares on `[1, M]` (an intercept column followed
    by the columns of M) and the fitted part is subtracted.

    Args:
        Y: array-like whose first dimension has the same length as M.
        M: array-like, either 1-D (a single regressor) or 2-D
            (one regressor per column).

    Returns:
        numpy.ndarray: `Y` minus its least-squares fit, same shape as `Y`.
    """
    Y = np.asarray(Y)
    M = np.asarray(M)
    if M.ndim == 1:   # M is a vector but needs to be a 2-D matrix
        M = M[:, np.newaxis]
    intercept = np.ones((len(M), 1))
    design = np.hstack((intercept, M))  # adding the constant
    # rcond=None selects the machine-precision based cutoff explicitly; leaving it
    # out triggers a FutureWarning on NumPy < 2.0 and makes the cutoff version-dependent.
    betas, _, _, _ = np.linalg.lstsq(design, Y, rcond=None)
    fitted = np.dot(design, betas)  # component of Y explained by [1, M]
    return Y - fitted  # residual, orthogonal to [1, M]

In [49]:
# Sequentially orthogonalise the columns of `d`: the first column is mean-centred,
# every later column keeps only the part orthogonal to all previous (raw) columns
# and to the constant. Each step refits against all earlier columns, so the cost
# grows quickly with the number of columns.
columns_d = d.columns
result = []
# `enumerate(tqdm(...))` rather than `tqdm(enumerate(...))`: tqdm can then read the
# length of `columns_d` and display the total and the remaining time.
for index, c in enumerate(tqdm(columns_d)):
    if index==0:
        result.append(d[c].values-np.mean(d[c].values))
    else:
        result.append(ortho_proj(d[c].values, d[columns_d[:index]].values))

  
551it [03:14,  1.16s/it]

KeyboardInterrupt: 

In [56]:
matrix = np.vstack(result).T

In [57]:
matrix.shape

(1894, 551)