# Exploring the visual data

In [54]:
import mne
import logging
import mne_bids
import pandas as pd
import numpy as np
from pathlib import Path

# Set the logger level to WARNING to reduce verbosity
logger = logging.getLogger('mne')
logger.setLevel(logging.ERROR)

#path = Path("/home/co/data/neuralset/LPP_copy/pallierlisten2023/download")
path = Path("/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download")

def testing(subject, run_id):
    """Compare the time span of the events file with the span of the MEG triggers.

    Reads one run of the 'read' task (session "01") for ``subject`` from the
    module-level ``path``, detects events on the "STI101" channel, and loads
    the matching ``*_events.tsv`` file.

    Parameters
    ----------
    subject : str
        BIDS subject label, e.g. "2".
    run_id : str
        BIDS run label, e.g. "01".

    Returns
    -------
    tuple
        ``(total_time_meta, total_time_triggers, ratio)`` where
        ``total_time_meta`` is the last minus the first ``onset`` among the
        rows that carry a word, ``total_time_triggers`` is the last minus the
        first value in the first column of the ``mne.find_events`` output, and
        ``ratio`` is ``len(triggers) / len(meta)``.

    Raises
    ------
    AssertionError
        If the expected events file does not exist.
    ValueError
        If there is no trigger or no word row, so no span can be computed.
    """
    import ast  # function-scope import: literal_eval replaces eval below

    def _word_or_nan(description):
        # Parse the literal once; only dict entries carry a "word" key.
        parsed = ast.literal_eval(description)
        return parsed["word"] if isinstance(parsed, dict) else np.nan

    task = 'read'
    bids_path = mne_bids.BIDSPath(
        subject=subject,
        session="01",
        task=task,
        datatype="meg",
        root=path,
        run=run_id,
    )

    raw = mne_bids.read_raw_bids(bids_path)
    triggers = mne.find_events(raw, stim_channel="STI101", shortest_event=1)

    # Events file stored next to the recording:
    # <root>/sub-X/ses-Y/meg/sub-X_ses-Y_task-T_run-R_events.tsv
    event_file = (
        path
        / f"sub-{bids_path.subject}"
        / f"ses-{bids_path.session}"
        / "meg"
        / (
            f"sub-{bids_path.subject}"
            f"_ses-{bids_path.session}"
            f"_task-{bids_path.task}"
            f"_run-{bids_path.run}_events.tsv"
        )
    )
    assert event_file.exists(), f"Missing events file: {event_file}"

    meta = pd.read_csv(event_file, sep="\t")

    # trial_type is text read from disk: literal_eval only accepts Python
    # literals, so it cannot execute arbitrary code the way eval can.
    meta["word"] = meta["trial_type"].apply(_word_or_nan)

    # Blank words are placeholders: mark them missing, then drop every row
    # without a word. The explicit copy keeps the column assignment below from
    # targeting a slice of the original frame.
    meta.loc[meta['word'] == ' ', 'word'] = None
    meta = meta.dropna(subset=['word']).copy()

    meta['start'] = meta.onset

    if len(triggers) == 0 or meta.empty:
        raise ValueError(
            f"Cannot compare spans for sub-{subject} run-{run_id}: "
            f"{len(triggers)} triggers, {len(meta)} word rows"
        )

    # Span covered by the events file (first to last word onset)
    onsets = meta["onset"].to_numpy()
    total_time_meta = onsets[-1] - onsets[0]

    # Span covered by the triggers (first column of the find_events output)
    total_time_triggers = triggers[-1][0] - triggers[0][0]

    return total_time_meta, total_time_triggers, (len(triggers) / len(meta))

In [56]:
# Recording to inspect
subject, run = "2", "01"

# testing() returns (events-file span, trigger span, triggers-per-word ratio)
total_time_meta, total_time_triggers, perc = testing(subject, run)

# NOTE(review): the division by 1000 presumably converts the trigger span from
# samples to seconds, i.e. it assumes a 1000 Hz recording - confirm against
# raw.info["sfreq"].
shift = total_time_meta - total_time_triggers / 1000
(shift, perc)

(50.35699999999997, 1.0013651877133105)

In [29]:
total_time_meta, total_time_triggers

(509.7, 459343)

In [51]:
import ast  # cell-local import: literal_eval replaces eval on text read from disk

# Same steps as testing(), run inline so that raw, triggers and meta stay
# available to the cells that follow.
subject = "40"
run = '01'

task = 'read'
bids_path = mne_bids.BIDSPath(
    subject=subject,
    session="01",
    task=task,
    datatype="meg",
    root=path,
    run=run,
)

raw = mne_bids.read_raw_bids(bids_path)
triggers = mne.find_events(raw, stim_channel="STI101", shortest_event=1)

# Events file stored next to the recording:
# <root>/sub-X/ses-Y/meg/sub-X_ses-Y_task-T_run-R_events.tsv
event_file = str(
    path
    / f"sub-{bids_path.subject}"
    / f"ses-{bids_path.session}"
    / "meg"
    / (
        f"sub-{bids_path.subject}"
        f"_ses-{bids_path.session}"
        f"_task-{bids_path.task}"
        f"_run-{bids_path.run}_events.tsv"
    )
)
assert Path(event_file).exists(), f"Missing events file: {event_file}"

meta = pd.read_csv(event_file, sep="\t")

# Parse each trial_type literal once; only dict entries carry a "word" key.
meta["word"] = meta["trial_type"].map(ast.literal_eval).map(
    lambda parsed: parsed["word"] if isinstance(parsed, dict) else np.nan
)

# Blank words are placeholders: mark them missing, then drop every row without
# a word. The explicit copy keeps the column assignment below from targeting a
# slice of the original frame.
meta.loc[meta['word'] == ' ', 'word'] = None
meta = meta.dropna(subset=['word']).copy()

meta['start'] = meta.onset

# Span covered by the events file (first to last word onset)
total_time_meta = np.array(meta.onset)[-1] - np.array(meta.onset)[0]

# Span covered by the triggers (first column of the find_events output)
total_time_triggers = triggers[-1][0] - triggers[0][0]


In [52]:
# pyplot is imported here because, in notebook order, no earlier cell imports
# it; without this the cell fails with NameError on a fresh kernel.
import matplotlib.pyplot as plt

# Difference between consecutive values of the first trigger column
plt.plot(np.diff(triggers[:, 0]))
plt.show(block=True)

In [60]:
# Plot the raw data
import matplotlib
matplotlib.use('Qt5Agg')

import matplotlib.pyplot as plt

# Read only the trigger channel, instead of copying the recording object and
# picking the channel from the copy.
plt.plot(raw.get_data(picks=['STI101'])[0])
# Word length for every row of meta (used by the optional overlay below)
meta['wlength'] = meta['word'].apply(len)
# plt.plot(meta.wlength, 'r')
plt.show(block=True)

In [3]:
from utils import match_list

import mne_bids
import mne
import pandas as pd
from pathlib import Path
path = Path("/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download")


subject = "30"
run = '01'

task = 'read'
bids_path = mne_bids.BIDSPath(
    subject=subject,
    session="01",
    task=task,
    datatype="meg",
    root=path,
    run=run,
)

raw = mne_bids.read_raw_bids(bids_path)

all_triggers = mne.find_events(raw, stim_channel="STI101", shortest_event=1)

# Triggers are simpler for this modality: no need to get the step function / offsets
# Work on a copy: the in-place subtraction below would otherwise also rewrite
# all_triggers, because plain assignment only creates a second name for the
# same array.
word_triggers = all_triggers.copy()
if word_triggers[:, 2].max() > 2048:
    # NOTE(review): 2048 is removed from every row as soon as one value exceeds
    # it; rows that were already below 2048 would become negative - confirm
    # that affected recordings carry the offset on all triggers.
    word_triggers[:, 2] -= 2048


# NOTE(review): the events file is the sub-1 one although subject is "30" -
# presumably intentional (shared stimulus list); confirm.
eventsile = '/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sub-1/ses-01/meg/sub-1_ses-01_task-read_run-01_events.tsv'
words = pd.read_csv(eventsile, sep="\t")
# file = "/home/co/code/LPP_experiment/formatting/v2/run1_v2_0.25_0.5.tsv"
# words = pd.read_csv(file, sep="\t")
# words['wlength'] = words['word'].apply(len)
# i, j = match_list(word_triggers[:, 2], words.wlength)

Opening raw data file /media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sub-30/ses-01/meg/sub-30_ses-01_task-read_run-01_meg.fif...
    Read a total of 13 projection items:
        grad_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v6 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v7 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v8 (1 x 306)  idle
    Range : 23000 ... 494999 =     23.000 ...   494.999 secs
Ready.


  raw = mne_bids.read_raw_bids(bids_path)


Reading events from /media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sub-30/ses-01/meg/sub-30_ses-01_task-read_run-01_events.tsv.
Reading channel info from /media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sub-30/ses-01/meg/sub-30_ses-01_task-read_run-01_channels.tsv.
Using 4 HPI coils: 293 307 314 321 Hz
Not fully anonymizing info - keeping his_id, sex, and hand info


  raw = mne_bids.read_raw_bids(bids_path)
  raw = mne_bids.read_raw_bids(bids_path)
  raw = mne_bids.read_raw_bids(bids_path)


1466 events found
Event IDs: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


In [9]:
mne.find_events(raw)

1466 events found
Event IDs: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


array([[ 30167,      0,      1],
       [ 30787,      1,      7],
       [ 31113,      0,      7],
       ...,
       [489006,      0,      5],
       [489273,      0,      4],
       [489548,      0,     11]])

In [10]:
raw.annotations

<Annotations | 1355 segments: {'kind': 'word', 'word': '1'} (3), {'kind': ...>

In [80]:
len(i) / len(words)

0.7147124719940254

In [77]:
word_triggers[~np.isin(np.arange(word_triggers.shape[0]), i)][:,2]

array([ 1,  7, 14, 12,  9,  9, 12,  8, 13, 11,  7,  8, 13,  6, 11,  8, 11,
        8, 10,  6, 13, 12, 14,  6, 11,  9, 11,  7, 14, 13, 10, 12, 13, 10,
       10,  6, 12,  9, 11,  7, 13,  8,  8,  8, 11,  6, 11, 15, 11,  8, 10,
        6,  8,  9, 11])

# Check Neuralset

In [83]:
import ast  # cell-local import: literal_eval replaces eval on text read from files

# One record per annotation. Each description is parsed as a dict literal,
# e.g. "{'kind': 'word', 'word': 'javais'}".
events = []
for annot in raw.annotations:
    description = annot.pop("description")
    if "BAD_ACQ_SKIP" in description:
        continue
    event = ast.literal_eval(description)
    event["condition"] = "sentence"
    event["type"] = event.pop("kind").capitalize()
    event["start"] = annot["onset"]
    event["duration"] = annot["duration"]
    event["stop"] = annot["onset"] + annot["duration"]
    event["language"] = "french"
    events.append(event)

events_df = pd.DataFrame(events).rename(columns=dict(word="text"))

# Remove empty words that were included in the metadata files...
events_df.loc[events_df["text"] == " ", "text"] = None
# Drop the rows containing NaN values in the text column
events_df = events_df.dropna(subset=["text"]).reset_index(drop=True)

# Match the events with the metadata
metadata = pd.read_csv('/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sourcedata/task-read_run-01_extra_info.tsv')

# TODO: this hack doesn't work because, in read, "j" and "avais" have been merged
# It is thus needed to think about how to find this information again
# Small data augmentation because some columns don't exist in the read metadata
# metadata_listen = pd.read_csv(self.path / "sourcedata/task-listen_run-{self.run}_extra_info.tsv")
# # Add to metadata the missing columns from the listen metadata: n_closing, is_last_word, pos, content_word
# metadata = metadata.merge(metadata_listen[["word", "n_closing", "is_last_word", "pos", "content_word"]], on="word")

rows_events, rows_metadata = match_list(
    [str(word) for word in events_df["text"].values],
    [str(word) for word in metadata["word"].values],
)

# Exactly two values on the right: a third element makes the two-name
# unpacking raise ValueError.
events_idx, metadata_idx = (
    events_df.index[rows_events],
    metadata.index[rows_metadata],
)

# Adding the information about sequence_id and n_closing
events_df["word"] = events_df["text"]
# for col in ["sequence_id", "n_closing", "is_last_word", "pos"]:
for col in ["sequence_id"]:
    # .to_numpy() makes the assignment positional: assigning a Series through
    # .loc would realign it on index labels and break the pairing produced by
    # match_list whenever the two index sequences differ.
    events_df.loc[events_idx, col] = metadata.loc[metadata_idx, col].to_numpy()

# get the correct words (pb with apostrophes)
eventsile = '/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sub-1/ses-01/meg/sub-1_ses-01_task-read_run-01_events.tsv'

correct_words_df = pd.read_csv(eventsile, delimiter="\t")
# Replace each trial_type literal by the word it contains
correct_words_df["trial_type"] = correct_words_df["trial_type"].map(
    lambda literal: ast.literal_eval(literal)["word"]
)
rows_events, rows_metadata = match_list(
    events_df["text"].values.astype(str),
    correct_words_df["trial_type"].values.astype(str),
)

events_idx, metadata_idx = (
    events_df.index[rows_events],
    correct_words_df.index[rows_metadata],
)
# Positional assignment again (see the note in the loop above)
events_df.loc[events_idx, "text"] = correct_words_df.loc[metadata_idx, "word"].to_numpy()


all_triggers = mne.find_events(raw, stim_channel="STI101", shortest_event=1)

# Triggers are simpler for this modality: no need to get the step function / offsets
word_triggers = all_triggers[all_triggers[:, 2] > 1]

# Explicit copy so that adding wlength does not write into a slice of events_df
words = events_df.loc[events_df["type"] == "Word"].copy()
words["wlength"] = words["text"].apply(len)
if word_triggers[:, 2].max() > 2048:
    word_triggers[:, 2] = (
        word_triggers[:, 2] - 2048
    )  # HACK because of a bug in the word_triggers for 2 subjects that have particularly high word_triggers
# Pair trigger values with word lengths
i, j = match_list(word_triggers[:, 2], words.wlength)
print(f"Matched: {len(i) / len(word_triggers)}")

true_indices = words.iloc[j].index

# Matched words take their start from the trigger sample index divided by the
# sampling frequency
events_df.loc[true_indices, "start"] = word_triggers[i, 0] / raw.info["sfreq"]


# sort by start
events_df = events_df.sort_values(by="start").reset_index(drop=True)

Matched: 0.8837047353760445


In [13]:
import ast  # cell-local import: literal_eval replaces eval on text read from files

# One record per annotation. Each description is parsed as a dict literal,
# e.g. "{'kind': 'word', 'word': 'javais'}".
events = []
for annot in raw.annotations:
    description = annot.pop("description")
    if "BAD_ACQ_SKIP" in description:
        continue
    event = ast.literal_eval(description)
    event["condition"] = "sentence"
    event["type"] = event.pop("kind").capitalize()
    event["start"] = annot["onset"]
    event["duration"] = annot["duration"]
    event["stop"] = annot["onset"] + annot["duration"]
    event["language"] = "french"
    events.append(event)

events_df = pd.DataFrame(events).rename(columns=dict(word="text"))

# Remove empty words that were included in the metadata files...
events_df.loc[events_df["text"] == " ", "text"] = None
# Drop the rows containing NaN values in the text column
events_df = events_df.dropna(subset=["text"]).reset_index(drop=True)

# Match the events with the metadata
metadata = pd.read_csv('/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sourcedata/task-read_run-01_extra_info.tsv')

# TODO: this hack doesn't work because, in read, "j" and "avais" have been merged
# It is thus needed to think about how to find this information again
# Small data augmentation because some columns don't exist in the read metadata
# metadata_listen = pd.read_csv(self.path / "sourcedata/task-listen_run-{self.run}_extra_info.tsv")
# # Add to metadata the missing columns from the listen metadata: n_closing, is_last_word, pos, content_word
# metadata = metadata.merge(metadata_listen[["word", "n_closing", "is_last_word", "pos", "content_word"]], on="word")

rows_events, rows_metadata = match_list(
    [str(word) for word in events_df["text"].values],
    [str(word) for word in metadata["word"].values],
)

events_idx, metadata_idx = (
    events_df.index[rows_events],
    metadata.index[rows_metadata],
)

# Adding the information about sequence_id and n_closing
events_df["word"] = events_df["text"]
# for col in ["sequence_id", "n_closing", "is_last_word", "pos"]:
for col in ["sequence_id"]:
    # .to_numpy() makes the assignment positional: assigning a Series through
    # .loc would realign it on index labels and break the pairing produced by
    # match_list whenever the two index sequences differ.
    events_df.loc[events_idx, col] = metadata.loc[metadata_idx, col].to_numpy()

# get the correct words (pb with apostrophes)
eventsile = '/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sub-1/ses-01/meg/sub-1_ses-01_task-read_run-01_events.tsv'

correct_words_df = pd.read_csv(eventsile, delimiter="\t")
# Replace each trial_type literal by the word it contains
correct_words_df["trial_type"] = correct_words_df["trial_type"].map(
    lambda literal: ast.literal_eval(literal)["word"]
)
rows_events, rows_metadata = match_list(
    events_df["text"].values.astype(str),
    correct_words_df["trial_type"].values.astype(str),
)

# Refresh the index pairs so they describe the match against correct_words_df;
# otherwise later cells would index correct_words_df with indices that were
# computed for `metadata`.
events_idx, metadata_idx = (
    events_df.index[rows_events],
    correct_words_df.index[rows_metadata],
)


In [38]:
# Copy the matched trial_type strings of correct_words_df into a "clean_text"
# column of events_df; .values makes the assignment positional.
# NOTE(review): events_idx / metadata_idx must come from the match against
# correct_words_df - confirm they are not left over from the match against
# `metadata`.
events_df.loc[events_idx, "clean_text"] = correct_words_df.loc[metadata_idx, "trial_type"].values.astype(str)

In [39]:
events_df

Unnamed: 0,text,condition,type,start,duration,stop,language,word,sequence_id,clean_text
0,Lorsque,sentence,Word,23.7,0.25,23.95,french,Lorsque,0,Lorsque
1,j'avais,sentence,Word,24.0,0.25,24.25,french,javais,0,javais
2,six,sentence,Word,24.3,0.25,24.55,french,six,0,six
3,"ans,",sentence,Word,24.6,0.25,24.85,french,ans,0,ans
4,j'ai,sentence,Word,24.9,0.25,25.15,french,jai,0,jai
...,...,...,...,...,...,...,...,...,...,...
1350,avec,sentence,Word,493.7,0.25,493.95,french,avec,114,avec
1351,la,sentence,Word,494.0,0.25,494.25,french,la,114,la
1352,caisse,sentence,Word,494.3,0.25,494.55,french,caisse,114,caisse
1353,que,sentence,Word,494.6,0.25,494.85,french,que,114,que


In [23]:
# Pair every event text with the words stored in correct_words_df
correct_words_match = correct_words_df["trial_type"].values.astype(str)
rows_events, rows_metadata = match_list(
    events_df["text"].values.astype(str),
    correct_words_match,
)

events_idx, metadata_idx = (
    events_df.index[rows_events],
    correct_words_df.index[rows_metadata],
)
# Both assignments are positional (arrays, not Series): a Series assigned
# through .loc is realigned on index labels, which would break the pairing
# from match_list whenever events_idx and metadata_idx differ.
events_df.loc[events_idx, "text"] = correct_words_df.loc[metadata_idx, "word"].to_numpy()
events_df.loc[events_idx, "clean_text"] = correct_words_df.loc[metadata_idx, "trial_type"].values.astype(str)

OrderedDict([('onset', 24.0),
             ('duration', 0.25),
             ('description', "{'kind': 'word', 'word': 'javais'}"),
             ('orig_time',
              datetime.datetime(2023, 5, 2, 14, 52, 47, 347192, tzinfo=datetime.timezone.utc))])

In [52]:
import ast  # cell-local import: literal_eval replaces eval on annotation text

# Collect the word stored in each annotation description
words = []
for annot in raw.annotations:
    description = annot["description"]
    # Same guard as the cells that build events_df: these descriptions are
    # skipped instead of being parsed.
    if "BAD_ACQ_SKIP" in description:
        continue
    words.append(ast.literal_eval(description)['word'])

# Match it with the metadata
word_meta = correct_words_df.trial_type
i, j = match_list(words, word_meta)

In [53]:
# Print the words in word_meta not matched
import numpy as np
word_meta[~np.isin(np.arange(len(word_meta)), j)]

1355       mas
1356    donnée
1357      cest
1358       que
1359        la
         ...  
1460      peut
1461       pas
1462     aller
1463      bien
1464      loin
Name: trial_type, Length: 110, dtype: object

In [51]:
events_df.shape

(1355, 10)

In [12]:
events_df

Unnamed: 0,text,condition,type,start,duration,stop,language
0,Lorsque,sentence,Word,23.7,0.25,23.95,french
1,javais,sentence,Word,24.0,0.25,24.25,french
2,six,sentence,Word,24.3,0.25,24.55,french
3,ans,sentence,Word,24.6,0.25,24.85,french
4,jai,sentence,Word,24.9,0.25,25.15,french
...,...,...,...,...,...,...,...
1350,avec,sentence,Word,493.7,0.25,493.95,french
1351,la,sentence,Word,494.0,0.25,494.25,french
1352,caisse,sentence,Word,494.3,0.25,494.55,french
1353,que,sentence,Word,494.6,0.25,494.85,french


In [86]:
rows_events.shape[0] / events_df.shape[0]

1.0

In [108]:
triggers.shape

(1466, 3)

In [76]:
# Print the words of not matched words
words.loc[~words.index.isin(j)][:50]


Unnamed: 0,word,onset,duration,wlength
41,"""les",14.5,0.25,4
67,"digestion.""",22.8,0.25,11
123,"""pourquoi",41.6,0.25,9
127,"peur?""",42.8,0.25,6
383,"""c'est",129.1,0.25,6
385,"chapeau.""",129.7,0.25,9
554,"""s'il",188.9,0.25,5
563,"mouton...""",193.1,0.25,10
738,"""mais...",252.1,0.25,8
743,"là?""",254.1,0.25,4
