In [6]:
from dataset import get_path, get_subjects, epoch_data
from utils import decod, correlate, match_list
import mne_bids
from pathlib import Path
import pandas as pd
import numpy as np
import mne
import spacy

nlp = spacy.load('fr_core_news_sm')

from sklearn.model_selection import KFold, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import RidgeCV
from wordfreq import zipf_frequency
from Levenshtein import editops
import matplotlib.pyplot as plt
import matplotlib

matplotlib.use("Agg")
mne.set_log_level(False)

In [12]:
run = 1

def format_meta(meta,run_id):
    model = 'fr_core_news_sm'
    if not spacy.util.is_package(model):
        spacy.cli.download(model)

    nlp = spacy.load(model)

    CHAPTERS = {
    1: "1-3",
    2: "4-6",
    3: "7-9",
    4: "10-12",
    5: "13-14",
    6: "15-19",
    7: "20-22",
    8: "23-25",
    9: "26-27",
    }
    txt_file = f'~/code/data/syntax/ch{CHAPTERS[run]}.syntax.txt'
    with open(txt_file, 'r') as f:
        txt = f.read().replace('\n', '')

    # parse text file
    doc = self.nlp(txt)

    # add parse information to metadata
    parse_annots = []
    for sent_id, sent in enumerate(doc.sents):
        # HERE ADD ERIC DE LA CLERGERIE parser instead
        closings = parse(sent)
        assert len(closings) == len(sent)
        for word, closing in zip(sent, closings):
            parse_annots.append(dict(
                word_index=word.i - sent[0].i,
                sequence_id=sent_id,
                sequence_uid=str(sent),
                closing=closing,
                match_token=word.text,
            ))

    # align text file and meg metadata
    def format_text(text):
        for char in ('jlsmtncd'):
            text = text.replace(f"{char}'", char)
        text = text.replace('œ', 'oe')
        return text.lower()

    meg_words = meta.word.fillna('######').values
    text_words = [format_text(w.text) for w in doc]

    i, j = utils.match_list(meg_words, text_words)

    # deal with missed tokens (e.g. wrong spelling, punctuation)
    assert len(parse_annots) == len(text_words)
    parse_annots = pd.DataFrame(parse_annots)
    parse_annots.closing = parse_annots.closing.fillna(0)
    parse_annots['closing_'] = 0
    parse_annots['missed_closing'] = 0
    missing = np.setdiff1d(range(len(parse_annots)), j)
    for missed in missing:
        current_closing = parse_annots.iloc[missed].closing
        prev_word = parse_annots.iloc[[missed-1]].index
        if prev_word[0] >=0:
            parse_annots.loc[prev_word, 'missed_closing'] = current_closing
    parse_annots.closing_ = parse_annots.closing + parse_annots.missed_closing

    # Add new columns to original mne.Epochs.metadata
    # fill columns
    columns = ('word_index', 'sequence_id', 'sequence_uid', 'closing_', 'match_token')
    for column in columns:
        meta[column] = None
        meta.loc[meta.iloc[i].index, column] = parse_annots[column].iloc[j].values
    return meta

In [8]:
report = mne.Report()
path = get_path('LPP_read')
subjects = get_subjects(path)
RUN = 1

print("\nSubjects for which the decoding will be tested: \n")
print(subjects)

for subject in subjects:  # Ignore the first one

    print(f"Subject {subject}'s decoding started")
    epochs = []
    for run_id in range(1, RUN + 1):
        print(".", end="")
        epo = epoch_data(subject, "%.2i" % run_id, task='listen', path=path)
        epo.metadata["label"] = f"run_{run_id}"
        epochs.append(epo)

    # Quick fix for the dev_head: has to be
    # fixed before doing source reconstruction
    for epo in epochs:
        epo.info["dev_head_t"] = epochs[0].info["dev_head_t"]
        # epo.info['nchan'] = epochs[0].info['nchan']

    epochs = mne.concatenate_epochs(epochs)

    # Get the evoked potential averaged on all epochs for each channel
    evo = epochs.average(method="median")
    evo.plot(spatial_colors=True)

    # Handling the data structure
    epochs.metadata["kind"] = epochs.metadata.trial_type.apply(
        lambda s: eval(s)["kind"]
    )
    epochs.metadata["word"] = epochs.metadata.trial_type.apply(
        lambda s: eval(s)["word"]
    )
    # TODO : re-epoch
    print(format_meta(epochs.metadata, run_id))
    epochs.metadat
    epochs.metadata['closing'] = epochs.metadata.closing_.fillna(0)
    # Run a linear regression between MEG signals
    # and word frequency classification
    X = epochs.get_data()

    embeddings = epochs.metadata.word.apply(lambda word: nlp(word).vector).values
    embeddings = np.array([emb for emb in embeddings])

    y = embeddings

    R_vec = decod(X, y)
    R_vec_avg = np.mean(R_vec,axis = 1)

    fig, ax = plt.subplots(1, figsize=[6, 6])
    dec = plt.fill_between(epochs.times, R_vec_avg)
    # plt.show()
    report.add_evokeds(evo, titles=f"Evoked for sub {subject} ")
    report.add_figure(fig, title=f"decoding for subject {subject}")
    # report.add_figure(dec, subject, tags="word")
    report.save("./figures/reading_decoding_embeddings.html", open_browser=False, overwrite=True)

    print("Finished!")



Subjects for which the decoding will be tested: 

['1', '2', '3', '4', '5', '6']
Subject 1's decoding started
.Running the script on RAW data:
run 01, subject: 1


  raw = mne_bids.read_raw_bids(bids_path)
  raw = mne_bids.read_raw_bids(bids_path)
  raw = mne_bids.read_raw_bids(bids_path)
  epochs = mne.Epochs(
  epochs = mne.concatenate_epochs(epochs)


NameError: name 'self' is not defined

In [13]:
print(format_meta(epochs.metadata, run_id))

FileNotFoundError: [Errno 2] No such file or directory: '~/code/data/syntax/ch1-3.syntax.txt'