# Raw.annotations + order of things test

In [None]:
import mne
import logging
import mne_bids
import pandas as pd
import numpy as np
from pathlib import Path
from utils import match_list

# Only let MNE messages of level ERROR or above through, to reduce verbosity
logger = logging.getLogger('mne')
logger.setLevel(logging.ERROR)

# Alternative dataset root, kept for reference:
# path = Path("/home/co/data/neuralset/LPP_copy/pallierlisten2023/download")
path = Path("/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download")

# Subject / run labels used to build the BIDS path in the following cells
subject = '30'
run = '01'

In [None]:
# Load the MEG recording of the reading task for the selected subject / run.
task = 'read'
bids_path = mne_bids.BIDSPath(
    root=path,
    subject=subject,
    session="01",
    task=task,
    run=run,
    datatype="meg",
)
raw = mne_bids.read_raw_bids(bids_path)

# Turn every annotation (except the acquisition-skip markers) into an event dict.
events = []
for annot in raw.annotations:
    description = annot.pop("description")
    if "BAD_ACQ_SKIP" in description:
        continue
    # NOTE(review): eval() executes the annotation text; only use on trusted recordings.
    event = eval(description)
    event.update(
        condition="sentence",
        type=event.pop("kind").capitalize(),
        start=annot["onset"],
        duration=annot["duration"],
        stop=annot["onset"] + annot["duration"],
        language="french",
    )
    events.append(event)

# events_df gets one row per kept annotation, so when raw.annotations holds fewer
# entries than the events file, events_df is shorter than the number of events.
events_df = pd.DataFrame(events).rename(columns={"word": "text"})

# Load the sub-1 / run-01 events TSV into the 'words' DataFrame.
eventsfile = '/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sub-1/ses-01/meg/sub-1_ses-01_task-read_run-01_events.tsv'
words = pd.read_csv(eventsfile, sep="\t")

# Build 'events_df2' from the TSV rows, with the same fields as the
# annotation-based events plus the 'text' taken from the 'word' column.
events = []
for _, row in words.iterrows():
    description = row["trial_type"]
    if "BAD_ACQ_SKIP" in description:
        continue
    # NOTE(review): eval() executes the file content; only use on trusted files.
    event = eval(description)
    event.update(
        condition="sentence",
        type=event.pop("kind").capitalize(),
        start=row["onset"],
        duration=row["duration"],
        stop=row["onset"] + row["duration"],
        language="french",
        text=row["word"],
    )
    events.append(event)

events_df2 = pd.DataFrame(events)

In [None]:
# Rows of the events TSV as read from disk
words

In [None]:
# Events built from raw.annotations
events_df

In [None]:
# Events built from the TSV rows, for comparison with events_df
events_df2

In [None]:

# Align the annotation-based events with the ground-truth words, the STI101
# triggers and the syntactic metadata.
# NOTE(review): this cell looks pasted from a method body: `self`,
# `error_msg_prefix`, `_enrich_metadata` and `set_sentence_split` are not defined
# in the visible notebook and must be provided before it can run.

# Open the ground truth for words that were used in the STIM setup
correct_words_df = pd.read_csv(self._get_word_info_path(), delimiter="\t")

# In order to match with the events_df, we need the clean words from correct_words_df
# NOTE(review): eval() executes the file content; only use on trusted files.
correct_words_df.trial_type = correct_words_df.trial_type.apply(
    lambda x: eval(x)["word"]
)
correct_words_match = correct_words_df["trial_type"].values.astype(str)
rows_events, rows_metadata = match_list(
    events_df["text"].values.astype(str),
    correct_words_match,
)
assert len(rows_events) / len(events_df) > 0.95, (
    error_msg_prefix
    + f"only {len(rows_events) / len(events_df)} of the words were found in the metadata"
)

events_idx, metadata_idx = (
    events_df.index[rows_events],
    correct_words_df.index[rows_metadata],
)
# Use .values so the matched rows are paired positionally: assigning a Series
# through .loc would re-align it on index labels and ignore the matching.
events_df.loc[events_idx, "text"] = correct_words_df.loc[metadata_idx, "word"].values
events_df.loc[events_idx, "clean_text"] = correct_words_df.loc[
    metadata_idx, "trial_type"
].values.astype(str)

# TODO: this hack doesnt work as in read, the j and avais have been merged
# It is thus needed to think about how to find again this information

# Small data augmentation because some columns dont exist in the read metadata
# metadata_listen = pd.read_csv(self.path / "sourcedata/task-listen_run-{self.run}_extra_info.tsv")
# # Add to metadata the missing columns from the listen metadata: n_closing, is_last_word, pos, content_word
# metadata = metadata.merge(metadata_listen[["word", "n_closing", "is_last_word", "pos", "content_word"]], on="word")

word_triggers = mne.find_events(raw, stim_channel="STI101", shortest_event=1)
# Copy the selection so that adding a column does not write into a view of events_df
words = events_df.loc[events_df.type == "Word"].copy()
words["wlength"] = words.text.apply(len)
if word_triggers[:, 2].max() > 2048:
    word_triggers[:, 2] = (
        word_triggers[:, 2] - 2048
    )  # HACK because of a bug in the word_triggers for 2 subjects that have particularly high word_triggers

# Matching the triggers wlength (with hyphens, dashes etc..) with the CORRECT metadata
i, j = match_list(word_triggers[:, 2], words.wlength)

assert len(i) / len(word_triggers) > 0.9, (
    error_msg_prefix
    + f"only {len(i)/len(word_triggers)} of the words were found in the word_triggers"
)
matched_word_indices = words.iloc[j].index

# Create new type of events: missed words that were not found in the triggers
events_df["unaligned_start"] = events_df["start"]
missed_words = words[~words.index.isin(matched_word_indices)].copy()
missed_words["type"] = "MissedWord"

# Replace the annotation onset of matched words by the trigger time (sample / sfreq)
events_df.loc[matched_word_indices, "start"] = (
    word_triggers[i, 0] / raw.info["sfreq"]
)

# Drop the word events that were not found in the triggers
false_indices = words[~words.index.isin(matched_word_indices)].index
events_df.loc[false_indices, "start"] = np.nan
events_df = events_df.dropna(subset=["start"])

# Add the missed words to the events_df
events_df = pd.concat([events_df, missed_words])

# Match the events with the metadata
metadata = pd.read_csv(self._get_seq_id_path())

# Match with the metadata df that contains syntactic info, in order to append them later
# Match it with the CLEAN text, as it is the one that is present in the extra_info
rows_events, rows_metadata = match_list(
    [str(word) for word in events_df["clean_text"].values],
    [str(word) for word in metadata["word"].values],
)

assert len(rows_events) / len(events_df) > 0.95, (
    error_msg_prefix
    + f"only {len(rows_events) / len(events_df)} of the words were found in the metadata"
)
events_idx, metadata_idx = (
    events_df.index[rows_events],
    metadata.index[rows_metadata],
)

# Adding the information about sequence_id and n_closing
events_df["word"] = events_df["text"]
# for col in ["sequence_id", "n_closing", "is_last_word", "pos"]:
for col in ["sequence_id"]:
    # .values: pair matched rows positionally instead of aligning on index labels
    events_df.loc[events_idx, col] = metadata.loc[metadata_idx, col].values

# Add sentence / constituent info
events_df = _enrich_metadata(events_df)

# add train/test/val splits
events_df = set_sentence_split(events_df)  # TODO

# add raw event
uri = f"method:_load_raw?timeline={self.timeline}"
meg = {"filepath": uri, "type": "Meg", "start": 0}
events_df = pd.concat([pd.DataFrame([meg]), events_df])

# sort by start
events_df = events_df.sort_values(by="start").reset_index(drop=True)

# Exploring the visual data

In [None]:
import mne
import logging
import mne_bids
import pandas as pd
import numpy as np
from pathlib import Path

# Only let MNE messages of level ERROR or above through, to reduce verbosity
logger = logging.getLogger('mne')
logger.setLevel(logging.ERROR)

# Alternative dataset root, kept for reference:
# path = Path("/home/co/data/neuralset/LPP_copy/pallierlisten2023/download")
path = Path("/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download")

def testing(subject, run_id):
    """Compare the span of the events TSV with the span of the STI101 triggers.

    Parameters
    ----------
    subject : str
        BIDS subject label of the recording (e.g. ``"2"``).
    run_id : str
        BIDS run label of the recording (e.g. ``"01"``).

    Returns
    -------
    total_time_meta
        Last minus first ``onset`` among the word rows of the events TSV.
    total_time_triggers
        Last minus first value of the first column of ``mne.find_events``
        (sample indices, not seconds).
    ratio : float
        Number of triggers divided by the number of word rows.
    """
    task = 'read'
    bids_path = mne_bids.BIDSPath(
        subject=subject,
        session="01",
        task=task,
        datatype="meg",
        root=path,
        run=run_id,
    )

    raw = mne_bids.read_raw_bids(bids_path)
    triggers = mne.find_events(raw, stim_channel="STI101", shortest_event=1)

    # Events TSV stored next to the recording:
    # <root>/sub-X/ses-Y/meg/sub-X_ses-Y_task-T_run-R_events.tsv
    event_file = (
        path
        / f"sub-{bids_path.subject}"
        / f"ses-{bids_path.session}"
        / "meg"
        / (
            f"sub-{bids_path.subject}_ses-{bids_path.session}"
            f"_task-{bids_path.task}_run-{bids_path.run}_events.tsv"
        )
    )
    assert event_file.exists(), f"Events file not found: {event_file}"

    meta = pd.read_csv(event_file, sep="\t")

    def _extract_word(description):
        # Evaluate each description once; non-dict descriptions carry no word.
        # NOTE(review): eval() executes the file content; only use on trusted files.
        parsed = eval(description)
        return parsed["word"] if isinstance(parsed, dict) else np.nan

    meta["word"] = meta["trial_type"].apply(_extract_word)

    # Treat single-space words as missing, then drop every row without a word
    meta.loc[meta['word'] == ' ', 'word'] = None
    meta = meta.dropna(subset=['word'])

    # Span covered by the word rows of the events file
    total_time_meta = meta.onset.iloc[-1] - meta.onset.iloc[0]

    # Span covered by the triggers
    total_time_triggers = triggers[-1, 0] - triggers[0, 0]

    return total_time_meta, total_time_triggers, (len(triggers) / len(meta))

In [None]:
# Compare events-file span and trigger span for subject 2, run 01
subject = "2"
run = '01'

# meta = testing(subject, run)
total_time_meta, total_time_triggers, perc = testing(subject, run)
# NOTE(review): dividing by 1000 assumes a 1000 Hz sampling rate to convert the
# trigger span from samples to seconds — confirm against raw.info["sfreq"]
shift = total_time_meta - (total_time_triggers / 1000)
shift, perc

In [None]:
# Raw spans returned by testing(): events-file span and trigger span
total_time_meta, total_time_triggers

In [None]:
# Same computation as testing(), run inline for subject 40 so that raw,
# triggers and meta stay available to the following cells.
subject = "40"
run = '01'

task = 'read'
bids_path = mne_bids.BIDSPath(
    subject=subject,
    session="01",
    task=task,
    datatype="meg",
    root=path,
    run=run,
)

raw = mne_bids.read_raw_bids(bids_path)
triggers = mne.find_events(raw, stim_channel="STI101", shortest_event=1)

# Events TSV stored next to the recording:
# <root>/sub-X/ses-Y/meg/sub-X_ses-Y_task-T_run-R_events.tsv
# (kept as a str, as in the original cell)
event_file = str(
    path
    / f"sub-{bids_path.subject}"
    / f"ses-{bids_path.session}"
    / "meg"
    / (
        f"sub-{bids_path.subject}_ses-{bids_path.session}"
        f"_task-{bids_path.task}_run-{bids_path.run}_events.tsv"
    )
)
assert Path(event_file).exists(), f"Events file not found: {event_file}"

meta = pd.read_csv(event_file, sep="\t")

# Evaluate each description once, then keep the word of dict descriptions only.
# NOTE(review): eval() executes the file content; only use on trusted files.
meta["word"] = (
    meta["trial_type"]
    .apply(lambda x: eval(x))
    .apply(lambda parsed: parsed["word"] if isinstance(parsed, dict) else np.nan)
)

# Remove the empty words:
meta.loc[meta['word'] == ' ', 'word'] = None

# Drop the rows containing NaN values in the word column; copy so that the
# column added below is written to an independent frame
meta = meta.dropna(subset=['word']).copy()

meta['start'] = meta.onset

# Span covered by the word rows of the events file
total_time_meta = meta.onset.iloc[-1] - meta.onset.iloc[0]

# Span covered by the triggers (in samples)
total_time_triggers = triggers[-1, 0] - triggers[0, 0]


In [None]:
# Plot the number of samples between consecutive triggers.
# pyplot is imported here because no earlier cell brings `plt` into scope.
import matplotlib.pyplot as plt

plt.plot(np.diff(triggers[:,0]))
plt.show(block=True)

In [None]:
# Show the STI101 trace in an interactive Qt window.
import matplotlib
matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt

# Extract the stim channel from a copy so that `raw` keeps all its channels.
stim_trace = raw.copy().pick_channels(['STI101']).get_data()[0]
plt.plot(stim_trace)

# Word length of each events-file row (overlay currently disabled).
meta['wlength'] = meta['word'].apply(len)
# plt.plot(meta.wlength, 'r')
plt.show(block=True)

In [None]:
from utils import match_list

import mne_bids
import mne
import pandas as pd
from pathlib import Path
# Root of the BIDS dataset read by get_annot_trigg()
path = Path("/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download")


# Default subject / run labels for interactive use
subject = "30"
run = '01'

def get_annot_trigg(subject, run):
    """Count annotations, STI101 triggers and reference words for one recording.

    Parameters
    ----------
    subject : str
        BIDS subject label.
    run : str
        BIDS run label (e.g. ``"01"``).

    Returns
    -------
    tuple
        ``(number of raw annotations, number of triggers, number of rows in the
        sub-1 / run-01 events TSV)``. The TSV path is fixed and does not depend
        on ``subject`` or ``run``.
    """
    recording = mne_bids.BIDSPath(
        root=path,
        subject=subject,
        session="01",
        task='read',
        run=run,
        datatype="meg",
    )
    raw = mne_bids.read_raw_bids(recording)

    # Every trigger is used as-is for this modality (no step function / offsets).
    word_triggers = mne.find_events(raw, stim_channel="STI101", shortest_event=1)
    if word_triggers[:, 2].max() > 2048:
        # Trigger values above 2048 are shifted back down by 2048.
        word_triggers[:, 2] -= 2048

    reference_tsv = '/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sub-1/ses-01/meg/sub-1_ses-01_task-read_run-01_events.tsv'
    n_reference_rows = pd.read_csv(reference_tsv, sep="\t").shape[0]

    return len(raw.annotations), len(word_triggers), n_reference_rows

In [None]:
import mne

# Silence MNE below ERROR, then print the annotation / trigger / word counts
# of the first run of subjects 1 to 40.
mne.set_log_level('ERROR')
for subject in range(1, 41):
    for run in range(1, 2):
        print(f"Subject {subject}, run {run}")
        counts = get_annot_trigg(str(subject), f"{run:02d}")
        print(counts)

# Testing for annotations

In [None]:
# Read the same recording directly with mne.io (fully preloaded) and show its
# annotations, to compare with the ones obtained through mne_bids
raw_io = mne.io.read_raw(bids_path, allow_maxshield=True, preload=True)
raw_io.annotations

In [None]:
# Annotations of the recording loaded with mne_bids.read_raw_bids
raw.annotations

In [None]:
# Summary of the recording and index of its first sample
print(raw, raw.first_samp)

In [None]:
# 7 min 51 s expressed in seconds — presumably a duration read off the
# recording; confirm what it is compared against
7*60 + 51

In [None]:
# All events detected on the STI101 channel
mne.find_events(raw, stim_channel="STI101", shortest_event=1)

In [None]:
# Last annotation of the mne_bids recording
raw.annotations[-1]

In [None]:
# Summary of the recording loaded with mne.io
raw_io

In [None]:
# Plot the raw triggers

import matplotlib
matplotlib.use('Qt5Agg')

import matplotlib.pyplot as plt

# Pick the stim channels on a copy: pick_types() modifies the instance in place,
# and `raw` must keep all its channels for the following cells.
raw.copy().pick_types(meg=False, eeg=False, stim=True).plot(start=0, duration=10)

In [None]:
# Events found with the default stim channel selection
mne.find_events(raw)

In [None]:
# Number of annotations in the recording
len(raw.annotations)

In [None]:
# Matched triggers per word row.
# NOTE(review): relies on `i` and `words` left in the namespace by another
# cell — confirm which one was run last
len(i) / len(words)

In [None]:
# Trigger values (third column) of the triggers whose position is not in `i`
word_triggers[~np.isin(np.arange(word_triggers.shape[0]), i)][:,2]

# Check Neuralset

In [None]:
# Build one event dict per annotation, skipping the acquisition-skip markers.
events = []
for annot in raw.annotations:
    description = annot.pop("description")
    if "BAD_ACQ_SKIP" in description:
        continue
    # NOTE(review): eval() executes the annotation text; only use on trusted recordings.
    event = eval(description)
    event["condition"] = "sentence"
    event["type"] = event.pop("kind").capitalize()
    event["start"] = annot["onset"]
    event["duration"] = annot["duration"]
    event["stop"] = annot["onset"] + annot["duration"]
    event["language"] = "french"
    events.append(event)

events_df = pd.DataFrame(events).rename(columns=dict(word="text"))

# Remove empty words that were included in the metadata files...
events_df.loc[events_df["text"] == " ", "text"] = None
# Drop the rows containing NaN values in the text column
events_df = events_df.dropna(subset=["text"])
events_df.reset_index(drop=True, inplace=True)

# Match the events with the metadata
metadata = pd.read_csv('/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sourcedata/task-read_run-01_extra_info.tsv')

# TODO: this hack doesnt work as in read, the j and avais have been merged
# It is thus needed to think about how to find again this information
# Small data augmentation because some columns dont exist in the read metadata
# metadata_listen = pd.read_csv(self.path / "sourcedata/task-listen_run-{self.run}_extra_info.tsv")
# # Add to metadata the missing columns from the listen metadata: n_closing, is_last_word, pos, content_word
# metadata = metadata.merge(metadata_listen[["word", "n_closing", "is_last_word", "pos", "content_word"]], on="word")

rows_events, rows_metadata = match_list(
    [str(word) for word in events_df["text"].values],
    [str(word) for word in metadata["word"].values],
)

# Exactly two values here: a third tuple element would make the unpacking fail.
events_idx, metadata_idx = (
    events_df.index[rows_events],
    metadata.index[rows_metadata],
)

# Adding the information about sequence_id and n_closing
events_df["word"] = events_df["text"]
# for col in ["sequence_id", "n_closing", "is_last_word", "pos"]:
for col in ["sequence_id"]:
    # .values: pair matched rows positionally instead of aligning on index labels
    events_df.loc[events_idx, col] = metadata.loc[metadata_idx, col].values

# get the correct words (pb with apostrophes)
eventsile = '/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sub-1/ses-01/meg/sub-1_ses-01_task-read_run-01_events.tsv'

correct_words_df = pd.read_csv(eventsile, delimiter="\t")
correct_words_df.trial_type = correct_words_df.trial_type.apply(
    lambda x: eval(x)["word"]
)
rows_events, rows_metadata = match_list(
    events_df["text"].values.astype(str),
    correct_words_df["trial_type"].values.astype(str),
)

events_idx, metadata_idx = (
    events_df.index[rows_events],
    correct_words_df.index[rows_metadata],
)
# .values: pair matched rows positionally instead of aligning on index labels
events_df.loc[events_idx, "text"] = correct_words_df.loc[metadata_idx, "word"].values


all_triggers = mne.find_events(raw, stim_channel="STI101", shortest_event=1)

# Triggers are simpler for this modality: no need to get the step function / offsets
word_triggers = all_triggers[all_triggers[:, 2] > 1]

# Copy the selection so that adding a column does not write into a view of events_df
words = events_df.loc[events_df.type == "Word"].copy()
words["wlength"] = words.text.apply(len)
if word_triggers[:, 2].max() > 2048:
    word_triggers[:, 2] = (
        word_triggers[:, 2] - 2048
    )  # HACK because of a bug in the word_triggers for 2 subjects that have particularly high word_triggers
i, j = match_list(word_triggers[:, 2], words.wlength)
print(f"Matched: {len(i) / len(word_triggers)}")

true_indices = words.iloc[j].index

# Replace the annotation onset of matched words by the trigger time (sample / sfreq)
events_df.loc[true_indices, "start"] = word_triggers[i, 0] / raw.info["sfreq"]


# sort by start
events_df = events_df.sort_values(by="start").reset_index(drop=True)

In [None]:
# Rebuild events_df from the annotations, skipping the acquisition-skip markers.
events = []
for annot in raw.annotations:
    description = annot.pop("description")
    if "BAD_ACQ_SKIP" in description:
        continue
    # NOTE(review): eval() executes the annotation text; only use on trusted recordings.
    event = eval(description)
    event["condition"] = "sentence"
    event["type"] = event.pop("kind").capitalize()
    event["start"] = annot["onset"]
    event["duration"] = annot["duration"]
    event["stop"] = annot["onset"] + annot["duration"]
    event["language"] = "french"
    events.append(event)

events_df = pd.DataFrame(events).rename(columns=dict(word="text"))

# Remove empty words that were included in the metadata files...
events_df.loc[events_df["text"] == " ", "text"] = None
# Drop the rows containing NaN values in the text column
events_df = events_df.dropna(subset=["text"])
events_df.reset_index(drop=True, inplace=True)

# Match the events with the metadata
metadata = pd.read_csv('/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sourcedata/task-read_run-01_extra_info.tsv')

# TODO: this hack doesnt work as in read, the j and avais have been merged
# It is thus needed to think about how to find again this information
# Small data augmentation because some columns dont exist in the read metadata
# metadata_listen = pd.read_csv(self.path / "sourcedata/task-listen_run-{self.run}_extra_info.tsv")
# # Add to metadata the missing columns from the listen metadata: n_closing, is_last_word, pos, content_word
# metadata = metadata.merge(metadata_listen[["word", "n_closing", "is_last_word", "pos", "content_word"]], on="word")

rows_events, rows_metadata = match_list(
    [str(word) for word in events_df["text"].values],
    [str(word) for word in metadata["word"].values],
)

events_idx, metadata_idx = (
    events_df.index[rows_events],
    metadata.index[rows_metadata],
)

# Adding the information about sequence_id and n_closing
events_df["word"] = events_df["text"]
# for col in ["sequence_id", "n_closing", "is_last_word", "pos"]:
for col in ["sequence_id"]:
    # .values: pair matched rows positionally instead of aligning on index labels
    events_df.loc[events_idx, col] = metadata.loc[metadata_idx, col].values

# get the correct words (pb with apostrophes)
eventsile = '/media/co/T7/workspace-LPP/data/MEG/LPP/PallierRead2023/download/sub-1/ses-01/meg/sub-1_ses-01_task-read_run-01_events.tsv'

correct_words_df = pd.read_csv(eventsile, delimiter="\t")
correct_words_df.trial_type = correct_words_df.trial_type.apply(
    lambda x: eval(x)["word"]
)
# Only the matched positions are recomputed here; events_idx / metadata_idx
# still refer to the match against `metadata` above.
rows_events, rows_metadata = match_list(
    events_df["text"].values.astype(str),
    correct_words_df["trial_type"].values.astype(str),
)


In [None]:
# Store the clean (ground-truth) word next to each matched event.
# NOTE(review): if the cells are run top to bottom, events_idx / metadata_idx
# here still come from the match against the extra_info metadata, not from the
# match against correct_words_df — confirm they are the intended indices
events_df.loc[events_idx, "clean_text"] = correct_words_df.loc[metadata_idx, "trial_type"].values.astype(str)

In [None]:
# Inspect the result
events_df

In [None]:
# Match the event texts with the ground-truth words and copy both the original
# word and its clean form onto the matched events.
correct_words_match = correct_words_df["trial_type"].values.astype(str)
rows_events, rows_metadata = match_list(
    events_df["text"].values.astype(str),
    correct_words_match,
)

events_idx, metadata_idx = (
    events_df.index[rows_events],
    correct_words_df.index[rows_metadata],
)
# .values: pair matched rows positionally instead of aligning on index labels
events_df.loc[events_idx, "text"] = correct_words_df.loc[metadata_idx, "word"].values
events_df.loc[events_idx, "clean_text"] = correct_words_df.loc[metadata_idx, "trial_type"].values.astype(str)

In [None]:
# Collect the word carried by each annotation. Acquisition-skip markers are
# skipped, as in the other annotation loops: their description is not a dict
# literal and cannot be evaluated.
words = []
for annot in raw.annotations:
    description = annot["description"]
    if "BAD_ACQ_SKIP" in description:
        continue
    # NOTE(review): eval() executes the annotation text; only use on trusted recordings.
    words.append(eval(description)['word'])

# Match the annotation words with the ground-truth words
word_meta = correct_words_df.trial_type
i,j = match_list(words, word_meta)

In [None]:
# Show the entries of word_meta whose position is not in `j`, i.e. the
# ground-truth words that were not matched
import numpy as np
word_meta[~np.isin(np.arange(len(word_meta)), j)]

In [None]:
# (rows, columns) of the events table
events_df.shape

In [None]:
# Full events table
events_df

In [None]:
# Matched rows per event
rows_events.shape[0] / events_df.shape[0]

In [None]:
# Shape of the trigger array of the last recording loaded into `triggers`
triggers.shape

In [None]:
# Show the first 50 rows of `words` whose index label is not in `j`.
# NOTE(review): this needs `words` to be a DataFrame; when the cells are run top
# to bottom `words` is a plain list at this point and `.loc` fails. `j` also
# holds matched positions, not index labels — confirm which cell this pairs with
words.loc[~words.index.isin(j)][:50]
