In [1]:
import scipy.io as io
import h5py
import os
import json
from glob import glob
from tqdm import tqdm
import numpy as np
import pickle
import argparse


In [15]:

# Interactive selection of the ZuCo task to process.
# NOTE(review): input() blocks until something is typed, so this cell cannot run
# unattended; every later cell that reads `task_name` depends on the answer given here.
available_tasks = ['task1-SR', 'task2-NR', 'task3-TSR']
print("Please select a task name from the following options:")
for i, task in enumerate(available_tasks, 1):
    print(f"{i}. {task}")

task_choice = input("Enter the number corresponding to your choice: ").strip()
# str.isdecimal() is used instead of str.isdigit(): isdigit() also accepts characters
# such as superscript digits ('²') that int() cannot parse, which would raise
# ValueError instead of falling through to the default below.
if task_choice.isdecimal() and 1 <= int(task_choice) <= len(available_tasks):
    task_name = available_tasks[int(task_choice) - 1]
else:
    print("Invalid choice, defaulting to 'task1-SR'")
    task_name = 'task1-SR'

print(f"Selected task: {task_name}")


Please select a task name from the following options:
1. task1-SR
2. task2-NR
3. task3-TSR
Selected task: task3-TSR


In [16]:
# Which ZuCo file format to read: 'v1' files are loaded with scipy.io.loadmat,
# 'v2' files are HDF5 and need h5py.
version = 'v1'  # Can be 'v1' or 'v2'
print('##############################')
print(f'start processing ZuCo {task_name}...')

# Input .mat files and output pickle directory for the selected task.
input_mat_files_dir = f'./dataset/ZuCo/{task_name}/Matlab_files'
output_dir = f'./dataset/ZuCo/{task_name}/pickle'
# exist_ok=True creates the directory if needed without a separate existence check.
os.makedirs(output_dir, exist_ok=True)


##############################
start processing ZuCo task3-TSR...


In [17]:

# Every .mat file in the task's input directory, in sorted (deterministic) order.
mat_file_pattern = os.path.join(input_mat_files_dir, '*.mat')
mat_files = sorted(glob(mat_file_pattern))


In [18]:
# Convert each subject's .mat file into plain Python containers.
# Result: dataset_dict[subject_name] is a list with one entry per sentence, either a
# sentence dict or None when the sentence has no word-level data.
if len(mat_files) == 0:
    # Raise instead of calling quit(): inside a notebook quit() shuts the kernel down,
    # whereas an exception stops execution and leaves the message visible.
    raise FileNotFoundError(f'No mat files found for {task_name}')

if version != 'v1':
    # The loop below reads attributes (sent.word, sent.mean_t1, ...) from objects
    # returned by scipy.io.loadmat. Iterating an h5py.File yields key names instead,
    # so HDF5 ('v2') files cannot be processed by this cell.
    raise NotImplementedError(
        f"version {version!r} is not supported by this cell; "
        "only 'v1' files readable by scipy.io.loadmat are handled here"
    )

# Suffixes of the eight per-band attributes read from each sentence/word struct.
BAND_SUFFIXES = ('t1', 't2', 'a1', 'a2', 'b1', 'b2', 'g1', 'g2')


def _band_features(source, prefix):
    """Return {'<prefix>_<band>': source.<prefix>_<band>} for the eight band suffixes."""
    return {f'{prefix}_{band}': getattr(source, f'{prefix}_{band}') for band in BAND_SUFFIXES}


dataset_dict = {}
for mat_file in tqdm(mat_files):
    # Subject ID = text before the first '_' in the file name, minus the 'results' prefix.
    subject_name = os.path.basename(mat_file).split('_')[0].replace('results','').strip()
    dataset_dict[subject_name] = []

    matdata = io.loadmat(mat_file, squeeze_me=True, struct_as_record=False)['sentenceData']

    for sent in matdata:
        word_data = sent.word
        # A float here means there is no word struct for this sentence
        # (presumably a NaN placeholder left by squeeze_me=True - TODO confirm).
        if isinstance(word_data, float):
            print(f'missing sent: subj:{subject_name} content:{sent.content}, return None')
            dataset_dict[subject_name].append(None)
            continue

        # sentence level:
        sent_obj = {'content':sent.content}
        sent_obj['sentence_level_EEG'] = _band_features(sent, 'mean')

        # only task1-SR reads the answer_mean_* attributes
        if task_name == 'task1-SR':
            sent_obj['answer_EEG'] = _band_features(sent, 'answer_mean')

        # word level:
        sent_obj['word'] = []

        word_tokens_has_fixation = []
        word_tokens_with_mask = []
        word_tokens_all = []

        for word in word_data:
            word_obj = {'content':word.content}
            word_tokens_all.append(word.content)
            # TODO: add more word-level EEG variants (SFD, GPT); FFD, TRT and GD are extracted below
            word_obj['nFixations'] = word.nFixations

            # nFixations may not be a plain scalar: the recorded run shows a warning
            # pointing at the bare `word.nFixations > 0` test, presumably because the
            # value is an empty array for some words. Evaluate it explicitly so an
            # empty value counts as "no fixation" without relying on the ambiguous
            # truth value of an empty array.
            n_fix = np.asarray(word.nFixations)
            has_fixation = n_fix.size > 0 and bool((n_fix > 0).all())

            if has_fixation:
                word_obj['word_level_EEG'] = {
                    'FFD': _band_features(word, 'FFD'),
                    'TRT': _band_features(word, 'TRT'),
                    'GD': _band_features(word, 'GD'),
                }
                sent_obj['word'].append(word_obj)
                word_tokens_has_fixation.append(word.content)
                word_tokens_with_mask.append(word.content)
            else:
                # NOTE: a word without fixation is skipped (not added to sent_obj['word'])
                # and shows up as '[MASK]' in the masked token list. Falling back to the
                # sentence-level mean_* features would be an alternative.
                word_tokens_with_mask.append('[MASK]')

        sent_obj['word_tokens_has_fixation'] = word_tokens_has_fixation
        sent_obj['word_tokens_with_mask'] = word_tokens_with_mask
        sent_obj['word_tokens_all'] = word_tokens_all

        dataset_dict[subject_name].append(sent_obj)
    # print(dataset_dict.keys())
    # print(dataset_dict[subject_name][0].keys())
    # print(dataset_dict[subject_name][0]['content'])
    # print(dataset_dict[subject_name][0]['word'][0].keys())
    # print(dataset_dict[subject_name][0]['word'][0]['word_level_EEG']['FFD'])



  if word.nFixations > 0:
 33%|███▎      | 4/12 [00:29<00:57,  7.19s/it]

missing sent: subj:ZGW content:Peyton Williams Manning (born March 24, 1976 in New Orleans, Louisiana) is an American football quarterback for the Indianapolis Colts NFL franchise., return None
missing sent: subj:ZGW content:Jose Vicente Ferrer de Otero y Cintron (January 8, 1909 - January 26, 1992), was an actor and film director, born in Santurce, Puerto Rico., return None
missing sent: subj:ZGW content:He became Chief Secretary to the Treasury in 1994, a Cabinet position, but resigned in 1995, to defend himself against accusations that whilst Minister of Defence Procurement he violated ministerial rules by allowing an Arab businessman to pay for his stay in the Ritz Hotel Paris., return None
missing sent: subj:ZGW content:During World War II, he accompanied FDR as a military aide to the Casablanca meeting of 1943 and the subsequent Cairo and Tehran Conferences., return None
missing sent: subj:ZGW content:Sir Henry Edward Bolte (20 May 1908 - 4 January 1990), Australian politician, w

 67%|██████▋   | 8/12 [01:08<00:33,  8.44s/it]

missing sent: subj:ZKB content:His parents were both members of the Nazi party., return None
missing sent: subj:ZKB content:After he graduated basic training, he was sent to Camp Lejuene in North Carolina where he underwent advanced training before being sent to Korea., return None
missing sent: subj:ZKB content:He moved to San Juan where he started to work for the Texas Company as a file clerk., return None
missing sent: subj:ZKB content:He married Carolyn Bessette in 1996., return None
missing sent: subj:ZKB content:Ferrer had previously been married to Uta Hagen (1938-1948), by whom he had a daughter, and actress Phyllis Hill (1948-1953)., return None
missing sent: subj:ZKB content:At the time of his death, Ferrer was married to Stella Magee, whom he married in 1992., return None
missing sent: subj:ZKB content:He married his second wife, Elizabeth Harris in June 2003., return None
missing sent: subj:ZKB content:Dole has been married to Senator Elizabeth Dole, nee Hanford of North Ca

100%|██████████| 12/12 [01:46<00:00,  8.90s/it]

missing sent: subj:ZPH content:Jonathan Aitken (born August 30, 1942) is a former Conservative minister, and convicted perjurer., return None
missing sent: subj:ZPH content:In 1960, Dole was elected as a Republican to the United States House of Representatives for the 87th Congress and to three succeeding Congresses, spanning from January 3, 1961 to January 3, 1969., return None
missing sent: subj:ZPH content:John Alston Maxton, Baron Maxton (born May 5, 1936) was a Labour backbench Member of Parliament in the British House of Commons., return None
missing sent: subj:ZPH content:Reagan's presidency is regarded as a turning point for the United States Republican Party and the American conservative movement., return None
missing sent: subj:ZPH content:Richard Bruce Cheney (born January 30, 1941), widely known as Dick Cheney, is an American politician and businessman affiliated with the U.S. Republican Party., return None
missing sent: subj:ZPH content:As he once joked, he was the younges




In [21]:
# Serialize the per-subject dataset to <output_dir>/<task_name>-dataset.pickle.
output_name = f'{task_name}-dataset.pickle'
pickle_path = os.path.join(output_dir, output_name)
with open(pickle_path, 'wb') as handle:
    pickle.dump(dataset_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('write to:', pickle_path)


write to: ./dataset/ZuCo/task3-TSR/pickle\task3-TSR-dataset.pickle


In [22]:
# Read the pickle back and list the subject IDs it contains.
saved_pickle_path = os.path.join(output_dir, output_name)
with open(saved_pickle_path, 'rb') as handle:
    whole_dataset = pickle.load(handle)
print('subjects:', whole_dataset.keys())


subjects: dict_keys(['ZAB', 'ZDM', 'ZDN', 'ZGW', 'ZJM', 'ZJN', 'ZJS', 'ZKB', 'ZKH', 'ZKW', 'ZMG', 'ZPH'])


In [23]:
#ZUCO 2.0
import os
import numpy as np
import h5py
import data_loading_helpers_modified as dh
from glob import glob
from tqdm import tqdm
import pickle

In [24]:

# Convert the ZuCo 2.0 "NR" .mat files (HDF5, read with h5py) into a dict of
# {subject: [sentence dict or None, ...]} and pickle the result.
task = "NR"

rootdir = "./dataset/ZuCo/task2-NR-2.0/Matlab_files/"

print('##############################')
print('start processing ZuCo task2-NR-2.0...')

# Suffixes of the eight per-band features; their order matches positions 0..7 of the
# GD/FFD/TRT feature lists unpacked below.
band_suffixes = ('t1', 't2', 'a1', 'a2', 'b1', 'b2', 'g1', 'g2')
sentence_eeg_keys = [f'mean_{band}' for band in band_suffixes]

dataset_dict = {}

# sorted(): os.listdir() returns entries in arbitrary order; sorting keeps the
# subject order (and therefore the pickled dict order) reproducible.
for file in tqdm(sorted(os.listdir(rootdir))):
    if not file.endswith(task+".mat"):
        continue

    file_name = rootdir + file

    # Parse the subject ID from the base name only. Splitting the full path on "ts"
    # would pick the wrong piece whenever a directory name contains "ts".
    # presumably files are named like results<SUBJECT>_NR.mat - TODO confirm
    subject = file.split("ts")[1].split("_")[0]

    # exclude YMH due to incomplete data because of dyslexia
    if subject == 'YMH':
        continue

    assert subject not in dataset_dict
    dataset_dict[subject] = []

    # The context manager closes the HDF5 file once the subject is processed.
    # NOTE(review): assumes the dh helpers return in-memory values rather than live
    # h5py objects - TODO confirm.
    with h5py.File(file_name,'r') as f:
        sentence_data = f['sentenceData']

        # sent level eeg: one dataset of per-sentence entries for each mean_* key
        mean_objs = {key: sentence_data[key] for key in sentence_eeg_keys}

        rawData = sentence_data['rawData']  # only used for the number of sentences
        contentData = sentence_data['content']
        wordData = sentence_data['word']

        for idx in range(len(rawData)):
            # get sentence string (contentData[idx][0] is presumably an HDF5 object reference)
            obj_reference_content = contentData[idx][0]
            sent_string = dh.load_matlab_string(f[obj_reference_content])

            sent_obj = {'content':sent_string}

            # get sentence level EEG
            sent_obj['sentence_level_EEG'] = {
                key: np.squeeze(f[refs[idx][0]][()]) for key, refs in mean_objs.items()
            }
            sent_obj['word'] = []

            # get word level data
            word_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask = dh.extract_word_level_data(f, f[wordData[idx][0]])

            # Sentences without usable word-level data are recorded as None so the
            # list keeps one entry per sentence index.
            if word_data == {}:
                print(f'missing sent: subj:{subject} content:{sent_string}, append None')
                dataset_dict[subject].append(None)
                continue
            if len(word_tokens_all) == 0:
                print(f'no word level features: subj:{subject} content:{sent_string}, append None')
                dataset_dict[subject].append(None)
                continue

            for widx in range(len(word_data)):
                data_dict = word_data[widx]
                word_obj = {'content':data_dict['content'], 'nFixations': data_dict['nFix']}
                # Only words that carry a 'GD_EEG' entry are kept in sent_obj['word'].
                if 'GD_EEG' in data_dict:
                    gd = data_dict["GD_EEG"]
                    ffd = data_dict["FFD_EEG"]
                    trt = data_dict["TRT_EEG"]
                    assert len(gd) == len(trt) == len(ffd) == 8
                    word_obj['word_level_EEG'] = {
                        'GD': {f'GD_{band}': gd[i] for i, band in enumerate(band_suffixes)},
                        'FFD': {f'FFD_{band}': ffd[i] for i, band in enumerate(band_suffixes)},
                        'TRT': {f'TRT_{band}': trt[i] for i, band in enumerate(band_suffixes)},
                    }
                    sent_obj['word'].append(word_obj)

            sent_obj['word_tokens_has_fixation'] = word_tokens_has_fixation
            sent_obj['word_tokens_with_mask'] = word_tokens_with_mask
            sent_obj['word_tokens_all'] = word_tokens_all

            dataset_dict[subject].append(sent_obj)

# ---- output ----
task_name = 'task2-NR-2.0'

if not dataset_dict:
    # Raise instead of calling quit(): inside a notebook quit() shuts the kernel down.
    raise FileNotFoundError(f'No mat file found for {task_name}')

output_dir = f'./dataset/ZuCo/{task_name}/pickle'
os.makedirs(output_dir, exist_ok=True)

output_name = f'{task_name}-dataset.pickle'

with open(os.path.join(output_dir,output_name), 'wb') as handle:
    pickle.dump(dataset_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('write to:', os.path.join(output_dir,output_name))

# ---- sanity check ----
print('subjects:', dataset_dict.keys())
# Report 'YAC' when it is present, otherwise the first subject, so the check does not
# raise KeyError on a dataset that lacks that subject.
sanity_subject = 'YAC' if 'YAC' in dataset_dict else next(iter(dataset_dict))
print('num of sent:', len(dataset_dict[sanity_subject]))

##############################
start processing ZuCo task2-NR-2.0...


  0%|          | 0/18 [00:00<?, ?it/s]

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
– is not a real word.
- is not a real word.
– is not a real word.
missing sent: subj:YAC content:Reagan was reelected in a landslide in the 1984 presidential election, defeating Carter's Vice President Walter Mondale by winning 49 of 50 states and receiving nearly 60 percent of the popular vote., append None
missing sent: subj:YAC content:Reagan developed an early gift for storytelling and acting., append None
missing sent: subj:YAC content:He was a radio announcer as an affiliate of the Chicago Cubs baseball games, getting only the bare outlines of the game from a ticker and relying on his imagination and storytelling gifts to flesh out the game., append None
missing sent: subj:YAC content:Once in 1934, during the ninth inning of a Cubs-St. Louis Cardinals game, the wi

  6%|▌         | 1/18 [03:20<56:52, 200.76s/it]

& is not a real word.
- is not a real word.
no SFD!
missing sent: subj:YAG content:Rosemary, Betty, and brother, Nick, as well as her nephew, George Clooney (Nick's son), all became entertainers., append None
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YAG content:Groucho did a German accent., append None
- is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YAG content:The Perrys have four children., append None
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 11%|█         | 2/18 [29:47<4:30:52, 1015.80s/it]

no word level features: subj:YAK content:Henry Ford, with his son Edsel, founded the Ford Foundation in 1936 as a local philanthropic organization with a broad charter to promote human welfare., append None
no word level features: subj:YAK content:After this initial success, Ford left Edison Illuminating and, with other investors, formed the Detroit Automobile Company., append None
no word level features: subj:YAK content:During this period, he personally drove his Quadricycle to victory in a race against Alexander Winton, a well-known driver and the heavy favorite on October 10, 1901., append None
no word level features: subj:YAK content:Ford was forced out of the company by the investors, including Henry M. Leland in 1902, and the company was reorganized as Cadillac., append None
no word level features: subj:YAK content:In 1891, Ford became an engineer with the Edison Illuminating Company, and after his promotion to Chief Engineer in 1893, he had enough time and money to devote atten

 17%|█▋        | 3/18 [31:33<2:30:10, 600.70s/it] 

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YDG content:He was an African-American jazz trumpeter, bandleader, singer, and composer., append None
- is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YDG content:This proved to be an regrettable decision., append None
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 22%|██▏       | 4/18 [34:07<1:38:59, 424.24s/it]

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YDR content:He and his wife had seven children., append None
no SFD!
missing sent: subj:YDR content:Groucho did a German accent., append None
- is not a real word.
– is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 28%|██▊       | 5/18 [36:42<1:10:51, 327.02s/it]

no SFD!
missing sent: subj:YFR content:Henry Ford advocated long-time associate Harry Bennett to take the spot., append None
& is not a real word.
- is not a real word.
& is not a real word.
no SFD!
missing sent: subj:YFR content:Timothy Bush, Sr. (c. 1728 - c. 1815) - soldier., append None
- is not a real word.
- is not a real word.
- is not a real word.
no SFD!
missing sent: subj:YFR content:Symptoms during this time included paranoia, self-mutilation and hallucinations., append None
- is not a real word.
– is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YFR content:In March 1819 he married Martha Hodgkins of Philadelphia, Pennsylvania., append None
- is not a real word.
no SFD!
missing sent: subj:YFR content:Laurance married Mary French in 1934., append None
– is not a real word.
no SFD!
missing sent: subj:YFR content:At the end of the war he was a captain on General Douglas MacArthur's intelligence staff., append None
- is not a real word.
- is not a real word.

 33%|███▎      | 6/18 [39:26<54:22, 271.83s/it]  

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
– is not a real word.
- is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YFS content:He was a naval officer from 1952-54., append None
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 39%|███▉      | 7/18 [41:50<42:09, 229.98s/it]

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YHS content:He is named for his maternal grandfather., append None
- is not a real word.
– is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 44%|████▍     | 8/18 [44:16<33:51, 203.18s/it]

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YIS content:In 1958 Ferrer appeared in I Accuse!, append None
– is not a real word.
- is not a real word.
– is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 50%|█████     | 9/18 [47:07<28:58, 193.12s/it]

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YLS content:Clooney was Ferrer's third wife., append None
– is not a real word.
- is not a real word.
no word level features: subj:YLS content:Charles II married on May 3 or May 21, 1662, in Portsmouth, and her dowry brought Tangier and Bombay to British control., append None
– is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 56%|█████▌    | 10/18 [49:50<24:31, 183.93s/it]

& is not a real word.
- is not a real word.
no SFD!
missing sent: subj:YMD content:In 1966 she went to United Artists Records., append None
& is not a real word.
no SFD!
missing sent: subj:YMD content:Timothy Bush, Sr. (c. 1728 - c. 1815) - soldier., append None
- is not a real word.
- is not a real word.
- is not a real word.
no SFD!
missing sent: subj:YMD content:He became submissive to upstart Johann Friedrich Struensee, who rose steadily in power in the late 1760s., append None
- is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YMD content:Clooney was Ferrer's third wife., append None
– is not a real word.
- is not a real word.
– is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 61%|██████    | 11/18 [52:53<21:25, 183.60s/it]

& is not a real word.
- is not a real word.
& is not a real word.
no word level features: subj:YMS content:On January 28, 2005 it was revealed that he accepted money from the George W. Bush administration to promote their marriage initiative program, which he did not disclose to his readers., append None
- is not a real word.
- is not a real word.
no word level features: subj:YMS content:Timothy Bush, Sr. (c. 1728 - c. 1815) - soldier., append None
no word level features: subj:YMS content:He is assumed to be the son of Richard Bush and Mary Fairbanks both of Dedham, Massachusetts., append None
no word level features: subj:YMS content:By training he was a blacksmith but when the American Revolution broke out militia Captain Bush led a company of soldiers for the Continental Army., append None
no word level features: subj:YMS content:The family moved around 1810 to Springport, in Cayuga County in the Rochester, New York area., append None
no word level features: subj:YMS content:He died 

 67%|██████▋   | 12/18 [55:51<18:10, 181.78s/it]

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
missing sent: subj:YRH content:After many football insiders criticized Manning for being 0-3 in the playoffs, he won his first NFL playoff game against the Denver Broncos on January 4, 2004., append None
missing sent: subj:YRH content:Manning's Passer Rating in the Colts' playoff games against the Broncos and Kansas City Chiefs was a perfect 158.3., append None
missing sent: subj:YRH content:He coached at Winston-Salem State University from 1946 to 1993, compiling a 828-447 record., append None
missing sent: subj:YRH content:He shared the honor with Tennessee Titans quarterback Steve McNair., append None
missing sent: subj:YRH content:However, he posted the third lowest passer rating of his career - 35.5 - in the AFC title game against the New England Patriots, throwing four interceptions in a 24-14 loss., append None
missing sent: subj:YRH content:He put 

 72%|███████▏  | 13/18 [58:17<14:15, 171.05s/it]

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YRK content:Clooney was Ferrer's third wife., append None
– is not a real word.
no SFD!
missing sent: subj:YRK content:During the 1940s, Coppola worked under Arturo Toscanini with the NBC Symphony Orchestra., append None
no word level features: subj:YRK content:She was named after Margaret Marge Groening, mother of Matt Groening, creator of The Simpsons., append None
no word level features: subj:YRK content:Her disapproving mother, Jacqueline, lives on but is rarely seen., append None
no SFD!
missing sent: subj:YRK content:Groucho did a German accent., append None
- is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YRK content:This proved to be an regrettable decision., append None
- is not a real word.
- is not a real word.

 78%|███████▊  | 14/18 [1:01:20<11:38, 174.63s/it]

no SFD!
missing sent: subj:YRP content:In the years between the wars, Henry Ford supported Adolf Hitler's Nazi regime., append None
& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
– is not a real word.
no SFD!
missing sent: subj:YRP content:He is of three quarters Irish and one quarter French descent., append None
- is not a real word.
– is not a real word.
no word level features: subj:YRP content:By the 1964 election, Reagan was an outspoken supporter of conservative Republican Barry Goldwater., append None
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 83%|████████▎ | 15/18 [1:04:17<08:46, 175.41s/it]

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
– is not a real word.
- is not a real word.
– is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 89%|████████▉ | 16/18 [1:07:21<05:55, 177.76s/it]

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
– is not a real word.
- is not a real word.
– is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


 94%|█████████▍| 17/18 [1:10:40<03:04, 184.42s/it]

& is not a real word.
- is not a real word.
& is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
– is not a real word.
– is not a real word.
no word level features: subj:YTL content:He died, however, aboard ship and was given a sea burial., append None
no SFD!
missing sent: subj:YTL content:Groucho did a German accent., append None
- is not a real word.
– is not a real word.
no word level features: subj:YTL content:The Perrys have four children., append None
no word level features: subj:YTL content:He was the victim of sexual abuse several times in childhood., append None
no word level features: subj:YTL content:He played a snobby bully opposite Mickey Rooney., append None
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.
- is not a real word.


100%|██████████| 18/18 [1:13:56<00:00, 246.49s/it]


write to: ./dataset/ZuCo/task2-NR-2.0/pickle\task2-NR-2.0-dataset.pickle
subjects: dict_keys(['YAC', 'YAG', 'YAK', 'YDG', 'YDR', 'YFR', 'YFS', 'YHS', 'YIS', 'YLS', 'YMD', 'YMS', 'YRH', 'YRK', 'YRP', 'YSD', 'YSL', 'YTL'])
num of sent: 349


# Sentiment labels

In [25]:
import os
from glob import glob
import json

# Build {sentence text: integer sentiment label} from the task1-SR label file
# (';'-separated) and write it out as JSON.
print('##############################')
print('start generating ZuCo task1-SR sentiment labels...')


sentiment_labels_task1_csv_path = './dataset/ZuCo/task_materials/sentiment_labels_task1.csv'

sentiment_labels = {}
with open(sentiment_labels_task1_csv_path, 'r') as f:
    for line in f:
        # Skip the header row, '#' comment rows, and blank lines. A blank line
        # (e.g. a trailing newline at the end of the file) would otherwise crash
        # the int() conversion below.
        if not line.strip() or line.startswith('sentence_id') or line.startswith('#'):
            continue

        parsed_line = line.split(';')
        # handle edge case: the sentence is wrapped in double quotes (presumably
        # because it contains ';' itself), so take the text between the opening
        # quote and the closing '";'
        if '\";' in line:
            sent_text = line.split('\";')[0].split('\"')[1]
        else:
            sent_text = parsed_line[1]
        # the label is the last ';'-separated field
        label = int(parsed_line[-1].strip())
        sentiment_labels[sent_text] = label

output_dir = './dataset/ZuCo/task1-SR/sentiment_labels'
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, 'sentiment_labels.json'), 'w') as out:
    json.dump(sentiment_labels,out,indent = 4)
    print('write to ./dataset/ZuCo/task1-SR/sentiment_labels/sentiment_labels.json')



##############################
start generating ZuCo task1-SR sentiment labels...
write to ./dataset/ZuCo/task1-SR/sentiment_labels/sentiment_labels.json


In [28]:
#!pip install torch
#!pip install transformers
!pip install fuzzy_match

Collecting fuzzy_match
  Using cached fuzzy_match-0.0.1-py3-none-any.whl (5.4 kB)
Installing collected packages: fuzzy_match
Successfully installed fuzzy_match-0.0.1


In [29]:

import os
import numpy as np
import torch
import pickle   
from torch.utils.data import Dataset, DataLoader
import json
import matplotlib.pyplot as plt
from glob import glob
from transformers import BartTokenizer
from tqdm import tqdm
from fuzzy_match import match
from fuzzy_match import algorithims


def get_SST_dataset(SST_dir_path, ZuCo_used_sentences, ZUCO_SENTIMENT_LABELS):
    '''
        Build a ternary sentiment dataset from the Stanford Sentiment Treebank
        files and drop every sentence that closely matches a ZuCo sentence.

        Args:
            SST_dir_path: directory containing 'datasetSentences.txt',
                'sentiment_labels.txt' and 'dictionary.txt'.
            ZuCo_used_sentences: NOT read. The list of ZuCo sentences is always
                taken from ZUCO_SENTIMENT_LABELS; the parameter is kept only so
                existing callers keep working.
            ZUCO_SENTIMENT_LABELS: iterable of ZuCo sentence strings (e.g. a
                dict keyed by sentence). It is copied, not mutated.

        Side effects:
            - prints dataset statistics and the unmatched ZuCo sentences
            - writes the matched (ZuCo, SST) pairs to 'temp.txt' in the
              current working directory
            - writes {sentence: 0|1|2} to
              './dataset/stanfordsentiment/ternary_dataset.json'
              (the directory is created if missing)

        Returns:
            None

        Raises:
            FileNotFoundError: if any of the three SST input files is missing.
    '''
    ternary_output_path = './dataset/stanfordsentiment/ternary_dataset.json'
    # A ZuCo/SST pair with trigram similarity above this is treated as the same sentence.
    match_threshold = 0.7

    def get_ternary_label(score):
        '''
            return 0 if score <= 0.2, 1 if 0.4 < score <= 0.6, 2 if score > 0.8;
            None for scores in the gaps (those sentences are dropped)
        '''
        if score <= 0.2:
            return 0
        if 0.4 < score <= 0.6:
            return 1
        if score > 0.8:
            return 2
        return None

    def get_sentiment_label_dict(SST_dictionary_file_path):
        '''
            return {phrase_id:sentiment_score(0-1)}
        '''
        ret_dict = {}
        with open(SST_dictionary_file_path) as f:
            for line in f:
                # header row
                if line.startswith('phrase'):
                    continue
                fields = line.split('|')
                phrase_id = int(fields[0])
                assert phrase_id not in ret_dict
                ret_dict[phrase_id] = float(fields[1].strip())
        return ret_dict

    def get_phrasestr_phrase_dict(SST_dictionary_file_path):
        '''
            return {phrase_str: phrase_id}
        '''
        ret_dict = {}
        with open(SST_dictionary_file_path) as f:
            for line in f:
                fields = line.split('|')
                phrase_str = fields[0]
                assert phrase_str not in ret_dict
                ret_dict[phrase_str] = int(fields[1].strip())
        return ret_dict

    def get_sentence_label_dict(SST_sentences_file_path, SST_labels_file_path, SST_dictionary_file_path):
        '''
            return ({sentence_str: score(0-1)}, {sentence_str: ternary label 0/1/2})
        '''
        phraseID_2_label = get_sentiment_label_dict(SST_labels_file_path)
        phraseStr_2_phraseID = get_phrasestr_phrase_dict(SST_dictionary_file_path)

        sentence_2_label_all = {}
        sentence_2_label_ternary = {}
        with open(SST_sentences_file_path) as f:
            for line in f:
                # header row
                if line.startswith('sentence_index'):
                    continue
                parsed_line = line.split('\t')
                assert len(parsed_line) == 2
                # convert -LRB- to (, -RRB- to ), and repair the mis-encoded é
                sentence = parsed_line[1].strip()
                sentence = sentence.replace('-LRB-','(').replace('-RRB-',')').replace('Ã©','é')
                # sentences without an exact phrase entry are silently skipped
                if sentence not in phraseStr_2_phraseID:
                    continue
                score = phraseID_2_label[phraseStr_2_phraseID[sentence]]

                # first occurrence wins in both dicts
                if sentence not in sentence_2_label_all:
                    sentence_2_label_all[sentence] = score
                if sentence not in sentence_2_label_ternary:
                    ternary_label = get_ternary_label(score)
                    if ternary_label is not None:
                        sentence_2_label_ternary[sentence] = ternary_label

        return sentence_2_label_all, sentence_2_label_ternary

    SST_sentences_file_path = os.path.join(SST_dir_path,'datasetSentences.txt')
    SST_labels_file_path = os.path.join(SST_dir_path,'sentiment_labels.txt')
    SST_dictionary_file_path = os.path.join(SST_dir_path,'dictionary.txt')

    # Report every missing input, then fail before any parsing starts instead
    # of crashing later inside open().
    missing_files = []
    for required_path in (SST_sentences_file_path, SST_labels_file_path, SST_dictionary_file_path):
        if not os.path.isfile(required_path):
            print(f'NOT FOUND file: {required_path}')
            missing_files.append(required_path)
    if missing_files:
        raise FileNotFoundError('missing SST input file(s): ' + ', '.join(missing_files))

    sentence_2_label_all, sentence_2_label_ternary = get_sentence_label_dict(SST_sentences_file_path, SST_labels_file_path, SST_dictionary_file_path)
    print('original ternary dataset size:', len(sentence_2_label_ternary))

    # Working copy: each ZuCo sentence is removed once it has been matched, so
    # it can filter at most one SST sentence.
    unmatched_zuco_sentences = list(ZUCO_SENTIMENT_LABELS)

    filtered_ternary_dataset = {}
    filtered_pairs = []
    for sst_sentence, ternary_label in sentence_2_label_ternary.items():
        for zuco_sentence in unmatched_zuco_sentences:
            if algorithims.trigram(zuco_sentence, sst_sentence) > match_threshold:
                filtered_pairs.append((zuco_sentence, sst_sentence))
                # removing during iteration is safe only because we break right away
                unmatched_zuco_sentences.remove(zuco_sentence)
                break
        else:
            # no ZuCo sentence matched: keep this SST sentence
            filtered_ternary_dataset[sst_sentence] = ternary_label

    print('filtered instance number:', len(filtered_pairs))
    print('filtered ternary dataset size:', len(filtered_ternary_dataset))
    print('unmatched remaining sentences:', unmatched_zuco_sentences)
    print('unmatched remaining sentences length:', len(unmatched_zuco_sentences))

    # Dump of the matched pairs for manual inspection.
    with open('temp.txt','w') as temp:
        for zuco_sentence, sst_sentence in filtered_pairs:
            temp.write('#######\n')
            temp.write('\t'+zuco_sentence+'\n')
            temp.write('\t'+sst_sentence+'\n')
            temp.write('\n')

    # The output directory is independent of SST_dir_path, so make sure it exists.
    os.makedirs(os.path.dirname(ternary_output_path), exist_ok=True)
    with open(ternary_output_path, 'w') as out:
        json.dump(filtered_ternary_dataset,out, indent = 4)
    print('write json to /dataset/stanfordsentiment/ternary_dataset.json')

if __name__ == '__main__':
    # Driver: load the ZuCo task1-SR sentiment labels and build the filtered
    # ternary Stanford Sentiment Treebank dataset.
    print('##############################')
    print('start generating stanfordSentimentTreebank ternary sentiment dataset...')
    SST_dir_path = './dataset/stanfordsentiment/stanfordSentimentTreebank'
    ZuCo_task1_csv_path = './dataset/ZuCo/task_materials/sentiment_labels_task1.csv'
    # Use a context manager so the file handle is closed after loading.
    with open('./dataset/ZuCo/task1-SR/sentiment_labels/sentiment_labels.json') as labels_file:
        ZUCO_SENTIMENT_LABELS = json.load(labels_file)

    # NOTE(review): the CSV path is passed as the second argument, but
    # get_SST_dataset as defined in this notebook takes its ZuCo sentences
    # from ZUCO_SENTIMENT_LABELS and does not read that argument.
    get_SST_dataset(SST_dir_path, ZuCo_task1_csv_path, ZUCO_SENTIMENT_LABELS)

##############################
start generating stanfordSentimentTreebank ternary sentiment dataset...
original ternary dataset size: 5578
filtered instance number: 393
filtered ternary dataset size: 5185
unmatched remaining sentences: ['Angel presents events partly from the perspective of Aurelie and Christelle, and infuses the film with the sensibility of a particularly nightmarish fairytale.', 'Feels less like a cousin to Blade Runner than like a bottom-feeder sequel in the Escape From New York series.', 'I was feeling this movie until it veered off too far into the Exxon zone, and left me behind at the station looking for a return ticket to realism.', 'The sort of picture in which, whenever one of the characters has some serious soul searching to do, they go to a picture-perfect beach during sunset.', "If there's a way to effectively teach kids about the dangers of drugs, I think it's in projects like the (unfortunately R-rated) Paid.", "I didn't laugh at the ongoing efforts of Cub