conda meg-analysis-env

In [7]:
import os

base_raw_dir = '../SMN4Lang_data/ds004078/derivatives/preprocessed_data/sub-01/MEG'
base_ann_dir = '../SMN4Lang_data/ds004078/derivatives/annotations'
base_sti_dir = '../SMN4Lang_data/ds004078/stimuli'

import os

def list_folders_recursive(path, indent=0):
    folders = [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
    for folder in folders:
        print('  ' * indent + f'- {folder}')
        folder_path = os.path.join(path, folder)
        # 再進一步列出下一層
        list_folders_recursive(folder_path, indent + 1)

# Example usage:
list_folders_recursive(base_ann_dir)



- embeddings
  - word2vec
    - word-level
      - 100d
      - 300d
    - char-level
      - 100d
      - 300d
  - gpt
    - word-level
  - bert
    - word-level
    - char-level
- frequency
  - word-level
  - char-level
- quiz
- scripts
- time_align
  - word-level
  - char-level
- syntactic_annotations
  - dependency_parsing
  - constituency_parsing
  - part_of_speech


In [19]:
import os
import scipy.io as sio
import numpy as np

# base path
base_ann_dir = '../SMN4Lang_data/ds004078/derivatives/annotations'

### === PART OF SPEECH ===
print('--- PART OF SPEECH ---')
pos_path = os.path.join(base_ann_dir, 'syntactic_annotations/part_of_speech/story_1_pos.txt')
with open(pos_path, 'r', encoding='utf-8') as f:
    for i in range(20):
        print(f.readline().strip())

### === FREQUENCY (word-level) ===
print('\n--- WORD FREQUENCY (word-level) ---')
freq_path = os.path.join(base_ann_dir, 'frequency/word-level/story_1_word_logfreq.mat')
freqwordmat = sio.loadmat(freq_path)
word_freq_values = freqwordmat['wf'].squeeze()
print(word_freq_values[:20])

### === TIME ALIGN (word-level) ===
print('\n--- TIME ALIGN (word-level) ---')
time_path = os.path.join(base_ann_dir, 'time_align/word-level/story_1_word_time.mat')
timewordmat = sio.loadmat(time_path)
word_start_times = timewordmat['start'].squeeze()
print(word_start_times[:20])


'''
### === DEPENDENCY PARSING ===
print('\n--- DEPENDENCY PARSING ---')
dep_path = os.path.join(base_ann_dir, 'syntactic_annotations/dependency_parsing/story_1_dependency.conllx')
with open(dep_path, 'r', encoding='utf-8') as f:
    for i in range(10):
        print(f.readline().strip())

### === CONSTITUENCY PARSING ===
print('\n--- CONSTITUENCY PARSING ---')
const_path = os.path.join(base_ann_dir, 'syntactic_annotations/constituency_parsing/story_1_constituency.txt')
with open(const_path, 'r', encoding='utf-8') as f:
    for i in range(10):
        print(f.readline().strip())

### === WORD2VEC embedding (word-level 300d) ===
print('\n--- WORD2VEC EMBEDDING (word-level, 300d) ---')
w2v_path = os.path.join(base_ann_dir, 'embeddings/word2vec/word-level/300d/story_1_word_word2vec.mat')
w2v_mat = sio.loadmat(w2v_path)
w2v_values = w2v_mat['embedding']  # 通常叫 embedding
print(w2v_values[:10])  # (n_words, embedding_dim)

### === BERT embedding (word-level) ===
print('\n--- BERT EMBEDDING (word-level) ---')
bert_path = os.path.join(base_ann_dir, 'embeddings/bert/word-level/story_1_word_bert_1-12_768.mat')
bert_mat = sio.loadmat(bert_path)
bert_values = bert_mat['embedding']
print(bert_values[:10])  # (n_words, embedding_dim)
'''

print(freqwordmat.keys())
print(timewordmat.keys())


--- PART OF SPEECH ---
0	我们	PN
1	经常	AD
2	会	VV
3	说	VV
4	教育	NN
5	关系	VV
6	千家万户	NN
7	，	PU
8	有关	VV
9	教育	NN
10	的	DEC
11	讨论	NN
12	总	AD
13	能	VV
14	引发	VV
15	社会	NN
16	关注	VV
17	。	PU

0	最近	NT

--- WORD FREQUENCY (word-level) ---
[15.41563968 12.55887159 15.04833405 15.15246419 14.94689299 13.53182715
 10.28530866 19.29483723 13.90786273 14.94689299 18.6445706  12.41426904
 13.44249279 14.67172597 12.83701724 14.70894281 13.87025534 18.49681653
 12.47815111 19.29483723]

--- TIME ALIGN (word-level) ---
[11.28 11.45 11.68 11.82 12.1  12.65 13.07 14.12 14.25 14.57 14.86 14.98
 15.61 15.76 15.84 16.23 16.56 17.04 17.6  17.91]
dict_keys(['__header__', '__version__', '__globals__', 'wf'])
dict_keys(['__header__', '__version__', '__globals__', 'start', 'end', 'word'])


In [3]:
#測試用需要的資料對齊不存檔
# === Get word_text_list from timewordmat['word']
words_raw = timewordmat['word']
words_raw = np.squeeze(words_raw)

# Decode if bytes
if isinstance(words_raw[0], bytes):
    word_text_list = [w.decode('utf-8') for w in words_raw]
else:
    word_text_list = list(words_raw)

# === Read POS as list of (word, POS)
pos_path = os.path.join(base_ann_dir, f'syntactic_annotations/part_of_speech/story_{story_id}_pos.txt')
pos_word_list = []
pos_tag_list = []
with open(pos_path, 'r', encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()
        if len(tokens) >= 3:
            pos_word_list.append(tokens[1])
            pos_tag_list.append(tokens[2])
        else:
            # 有些行可能是空行或缺欄位
            continue

# === 對齊 POS → 對齊到 word_text_list 裡有的詞
# 常見做法：pos_word_list → lower casing、strip 處理
# 最簡單做法：直接對齊順序！（你要確認你的 pos.txt 是否 "只存詞性，不存 word"，如果只存詞性就直接用 pos_list）

# --- 如果 pos.txt 是只有 POS tag per line，則 pos_tag_list = pos_list

# === Check length again after alignment
n_words = len(word_text_list)
if len(pos_tag_list) != n_words:
    print(f'WARNING: POS length mismatch: {len(pos_tag_list)} vs {n_words}')
    # 自動 truncate to shorter one → 避免報錯
    min_len = min(len(pos_tag_list), n_words)
    pos_tag_list = pos_tag_list[:min_len]
    word_text_list = word_text_list[:min_len]
    word_freq_values = word_freq_values[:min_len]
    word_start_times = word_start_times[:min_len]

# === Build DataFrame
df = pd.DataFrame({
    'word': word_text_list,
    'pos': pos_tag_list,
    'word_freq_log': word_freq_values,
    'word_onset_sec': word_start_times - 10.65
})

print(df.head(20))


    word  pos  word_freq_log  word_onset_sec
0   最近     NT      12.478151            0.59
1   这      DT      15.077642            0.88
2   段       M      12.593048            0.98
3   时间     NN      14.674694            1.09
4   ，      PU      19.294837            1.64
5   与       P      16.051127            1.67
6   儿童     NN      13.308856            1.87
7   有关     VV      13.907863            2.17
8   的     DEC      18.644571            2.49
9   几      CD      10.071372            2.63
10  款       M      12.132652            2.77
11  新      JJ      15.704966            2.94
12  产品     NN      14.661952            3.14
13  和      CC      17.094431            3.56
14  新      JJ      15.704966            3.72
15  服务     NN      15.447769            3.88
16  ，      PU      19.294837            4.37
17  引发     VV      12.837017            4.49
18  了      AS      17.074538            4.84
19  不小     JJ      10.841070            4.96


In [4]:
#此版會將需要的資料對齊存檔
import os
import scipy.io as sio
import numpy as np
import pandas as pd

# === Base path ===
base_ann_dir = '../SMN4Lang_data/ds004078/derivatives/annotations'
save_csv_dir = 'StimulusTables'  # 改成你想存的地方
os.makedirs(save_csv_dir, exist_ok=True)

# === Loop over stories ===
for story_id in range(1, 61):
    print(f'=== Processing story {story_id} ===')

    try:
        ### === Read POS ===
        pos_path = os.path.join(base_ann_dir, f'syntactic_annotations/part_of_speech/story_{story_id}_pos.txt')
        pos_word_list = []
        pos_tag_list = []
        with open(pos_path, 'r', encoding='utf-8') as f:
            for line in f:
                tokens = line.strip().split()
                if len(tokens) >= 3:
                    pos_word_list.append(tokens[1])  # word
                    pos_tag_list.append(tokens[2])   # POS
                else:
                    continue

        ### === Read word freq ===
        freq_path = os.path.join(base_ann_dir, f'frequency/word-level/story_{story_id}_word_logfreq.mat')
        freqwordmat = sio.loadmat(freq_path)
        word_freq_values = freqwordmat['wf'].squeeze()

        ### === Read time align ===
        time_path = os.path.join(base_ann_dir, f'time_align/word-level/story_{story_id}_word_time.mat')
        timewordmat = sio.loadmat(time_path)
        word_start_times = timewordmat['start'].squeeze()

        # Get word list
        words_raw = timewordmat['word']
        words_raw = np.squeeze(words_raw)

        # Decode if bytes
        if isinstance(words_raw[0], bytes):
            word_text_list = [w.decode('utf-8') for w in words_raw]
        else:
            word_text_list = list(words_raw)

        ### === Check alignment
        n_words = len(word_text_list)
        if len(pos_tag_list) != n_words:
            print(f'WARNING: Story {story_id}: POS length mismatch: {len(pos_tag_list)} vs {n_words}')
            min_len = min(len(pos_tag_list), n_words)
            pos_tag_list = pos_tag_list[:min_len]
            word_text_list = word_text_list[:min_len]
            word_freq_values = word_freq_values[:min_len]
            word_start_times = word_start_times[:min_len]

        ### === Build DataFrame
        df = pd.DataFrame({
            'word': word_text_list,
            'pos': pos_tag_list,
            'word_freq_log': word_freq_values,
            'word_onset_sec': word_start_times - 10.65
        })

        ### === Save to CSV
        save_path = os.path.join(save_csv_dir, f'story_{story_id}_stimulus_table.csv')
        df.to_csv(save_path, index=False)
        print(f'Saved to {save_path} — n_words: {len(df)}')

    except Exception as e:
        print(f'ERROR processing story {story_id}: {e}')


=== Processing story 1 ===
Saved to StimulusTables/story_1_stimulus_table.csv — n_words: 1044
=== Processing story 2 ===
Saved to StimulusTables/story_2_stimulus_table.csv — n_words: 939
=== Processing story 3 ===
Saved to StimulusTables/story_3_stimulus_table.csv — n_words: 902
=== Processing story 4 ===
Saved to StimulusTables/story_4_stimulus_table.csv — n_words: 908
=== Processing story 5 ===
Saved to StimulusTables/story_5_stimulus_table.csv — n_words: 871
=== Processing story 6 ===
Saved to StimulusTables/story_6_stimulus_table.csv — n_words: 885
=== Processing story 7 ===
Saved to StimulusTables/story_7_stimulus_table.csv — n_words: 840
=== Processing story 8 ===
Saved to StimulusTables/story_8_stimulus_table.csv — n_words: 781
=== Processing story 9 ===
Saved to StimulusTables/story_9_stimulus_table.csv — n_words: 735
=== Processing story 10 ===
Saved to StimulusTables/story_10_stimulus_table.csv — n_words: 710
=== Processing story 11 ===
Saved to StimulusTables/story_11_stimul

In [5]:
#讀取存好的檔案
import pandas as pd

story_id = 1
csv_path = f'StimulusTables/story_{story_id}_stimulus_table.csv'

df = pd.read_csv(csv_path)

# 看前幾筆確認
print(df.head(20))


     word  pos  word_freq_log  word_onset_sec
0   我们      PN      15.415640            0.63
1   经常      AD      12.558872            0.80
2   会       VV      15.048334            1.03
3   说       VV      15.152464            1.17
4   教育      NN      14.946893            1.45
5   关系      VV      13.531827            2.00
6   千家万户    NN      10.285309            2.42
7   ，       PU      19.294837            3.47
8   有关      VV      13.907863            3.60
9   教育      NN      14.946893            3.92
10  的      DEC      18.644571            4.21
11  讨论      NN      12.414269            4.33
12  总       AD      13.442493            4.96
13  能       VV      14.671726            5.11
14  引发      VV      12.837017            5.19
15  社会      NN      14.708943            5.58
16  关注      VV      13.870255            5.91
17  。       PU      18.496817            6.39
18  最近      NT      12.478151            6.95
19  ，       PU      19.294837            7.26
