In [None]:
import os
import json
import random
import pandas as pd
from random import shuffle
from sklearn.model_selection import train_test_split

In [None]:
# Root folder for the BLiMP inputs and for every CSV this notebook reads or writes.
# The default is the original location (presumably a Google Drive mounted in Colab);
# set the FINE_TUNE_DATA_PATH environment variable to run the notebook elsewhere
# without editing this cell.
data_path = os.environ.get(
    "FINE_TUNE_DATA_PATH",
    "/content/drive/MyDrive/VU Thesis/Code/fine_tune/data",
)

In [None]:
def read_all_data(data_directory_path):
    """
    Reads all filtered BLiMP data from the specified directory.

    Files are visited in sorted filename order so the returned records come back in
    the same order on every machine (``os.listdir`` gives no ordering guarantee),
    which keeps any later seeded sampling reproducible. Sub-directories are ignored.

    The part of the filename before the first "." selects the parser: the
    "preposition" file is read as a single JSON array, every other file as
    JSON Lines (one JSON document per line). Blank lines in JSON Lines files
    are skipped instead of aborting the load.

    Args:
        data_directory_path (str): Path to the directory containing the BLiMP data files.
    Returns:
        pd.DataFrame: A DataFrame containing all the data from the files.
        list: A list of dictionaries containing all the data.
    Raises:
        json.JSONDecodeError: If a file (or a non-blank line) is not valid JSON.
    """
    all_data = []
    for filename in sorted(os.listdir(data_directory_path)):
        file_path = os.path.join(data_directory_path, filename)
        if not os.path.isfile(file_path):
            continue

        error_label = filename.split(".")[0]
        print(f'Read {error_label} File...')
        with open(file_path, 'r', encoding='utf-8') as file:
            if error_label == "preposition":
                # The preposition file is one JSON array rather than JSON Lines.
                all_data.extend(json.load(file))
            else:
                for line in file:
                    # Tolerate empty lines (e.g. a trailing newline at end of file).
                    if line.strip():
                        all_data.append(json.loads(line))

        print("Done.")

    return pd.DataFrame(all_data), all_data

In [None]:
def data_errors_and_g_classes(data_list, rnd_per, seed=None):
    """
    Selects the items for the error classes and for the grammatical class.

    Items whose 'linguistics_term' is one of the three target categories
    ("Preposition", "subject_verb_agreement", "determiner_noun_agreement") form the
    error list. The grammatical list has the same length as the error list and is
    built from two samples drawn without replacement: a ``rnd_per`` share taken from
    the target-category items and the remainder taken from all other items. The
    combined grammatical list is shuffled.

    The number of target-category items is printed.

    Args:
        data_list (list): A list of dictionaries, each with a 'linguistics_term' key.
        rnd_per (float): Fraction (0.0-1.0) of the grammatical list that is drawn
            from the target-category items; the rest comes from the other items.
        seed (int, optional): Seed for a private random generator, making the
            selection reproducible. When None (default) the module-level ``random``
            state is used, exactly as before this parameter existed.
    Returns:
        list: The items selected for the grammatical class.
        list: The target-category (error) items, in input order.
    Raises:
        ValueError: If ``rnd_per`` is outside [0, 1] or there are too few
            non-target items to fill the remainder of the grammatical list.
    """
    target_terms = ("Preposition", "subject_verb_agreement", "determiner_noun_agreement")
    if not 0 <= rnd_per <= 1:
        raise ValueError(f"rnd_per must be between 0 and 1, got {rnd_per!r}")

    error_list = []
    pair_list = []
    for item in data_list:
        if item['linguistics_term'] in target_terms:
            error_list.append(item)
        else:
            pair_list.append(item)
    print(len(error_list))

    # Split the size of the grammatical list between the two sources.
    from_targets = int(len(error_list) * rnd_per)
    from_others = len(error_list) - from_targets
    if from_others > len(pair_list):
        raise ValueError(
            f"Need {from_others} items from non-target categories "
            f"but only {len(pair_list)} are available"
        )

    # A private generator keeps a seeded call independent of the global RNG state;
    # without a seed, fall back to the module-level functions.
    rng = random.Random(seed) if seed is not None else random
    g_list = rng.sample(error_list, from_targets) + rng.sample(pair_list, from_others)
    rng.shuffle(g_list)

    return g_list, error_list

In [None]:
def prepare_data(error_list, g_list):
    """
    Builds the flat list of labelled examples used for training.

    Each item of ``error_list`` contributes its 'sentence_bad' text, labelled with
    the item's 'linguistics_term'. Each item of ``g_list`` contributes its
    'sentence_good' text, labelled "G". Error examples come first, followed by the
    grammatical ones; the order inside each group follows the input lists.

    Args:
        error_list (list): Items providing the ungrammatical examples.
        g_list (list): Items providing the grammatical examples.
    Returns:
        list: Dictionaries with the keys 'text', 'labels' and 'UID'.
    """
    bad_examples = [
        {'text': entry['sentence_bad'], 'labels': entry['linguistics_term'], 'UID': entry['UID']}
        for entry in error_list
    ]
    good_examples = [
        {'text': entry['sentence_good'], 'labels': "G", 'UID': entry['UID']}
        for entry in g_list
    ]
    return bad_examples + good_examples

In [None]:
def count_data_tokens(data):
    """
    Prints the total number of whitespace-separated tokens in the ``text`` column.

    Args:
        data (pd.DataFrame): A DataFrame with a ``text`` column of strings.
    """
    sentences = data.text.tolist()
    print(sum(len(sentence.split()) for sentence in sentences))

##### If the data files already exist, run only the next two cells

In [None]:
def _read_saved_csv(csv_name):
    """Announce and load one previously exported CSV file from ``data_path``."""
    print(f"loading {csv_name}")
    return pd.read_csv(os.path.join(data_path, csv_name))


print("loading data...")

# Full labelled dataset plus the two item tables it was built from.
prepared_data_df = _read_saved_csv("all_prepared_data.csv")
grammatical_df = _read_saved_csv("g_class_data.csv")
errors_df = _read_saved_csv("errors_class_data.csv")
# Uncomment to get the list-of-dicts form of the two item tables:
# grammatical_list = grammatical_df.to_dict('records')
# errors_list = errors_df.to_dict('records')

# Training split and the held-out rows before they were divided into dev and test.
train_df = _read_saved_csv("train_data.csv")
dev_test_df = _read_saved_csv("dev_test_no_split_data.csv")

# Final dev and test splits.
dev_df = _read_saved_csv("dev_data.csv")
test_df = _read_saved_csv("test_data.csv")
print("loading data compelete.")

loading data...
loading all_prepared_data.csv
loading g_class_data.csv
loading errors_class_data.csv
loading train_data.csv
loading dev_test_no_split_data.csv
loading dev_data.csv
loading test_data.csv
loading data compelete.


In [None]:
# Report the whitespace-token totals of the full dataset and of each split.
for split_label, split_frame in (
    ("All data", prepared_data_df),
    ("Train", train_df),
    ("Dev", dev_df),
    ("Test", test_df),
):
    print(f"{split_label} tokens:")
    count_data_tokens(split_frame)


All data tokens:
325046
Train tokens:
194474
Dev tokens:
65097
Test tokens:
65475


##### If the data files do not exist yet, run the following cells

In [None]:
# The filtered BLiMP files are read from the "blimp_filtered" sub-folder of data_path.
blimp_filtered_path = os.path.join(data_path, "blimp_filtered")
# read_all_data returns the records twice: as a DataFrame and as a list of dicts.
df, data_list= read_all_data(blimp_filtered_path)

Read anaphor_agreement File...
Done.
Read island_effects File...
Done.
Read binding File...
Done.
Read argument_structure File...
Done.
Read ellipsis File...
Done.
Read control_raising File...
Done.
Read npi_licensing File...
Done.
Read filler_gap File...
Done.
Read irregular_forms File...
Done.
Read quantifiers File...
Done.
Read subject_verb_agreement File...
Done.
Read determiner_noun_agreement File...
Done.
Read preposition File...
Done.


In [None]:
# Seed Python's global RNG first: data_errors_and_g_classes samples and shuffles with
# the `random` module, so without a fixed seed the grammatical class (and every CSV
# derived from it) would change on each run. 42 matches the random_state used by the
# train/dev/test splits.
random.seed(42)
# 0.7 -> 70% of the grammatical items are drawn from the three error categories,
# the remaining 30% from the other BLiMP categories.
grammatical_list, errors_list = data_errors_and_g_classes(data_list, 0.7)
print(f"sentences in:\nerror list: {len(errors_list)}\ngrammatical list: {len(grammatical_list)}")

17470
sentences in:
error list: 17470
grammatical list: 17470


In [None]:
# Turn the selected items into labelled examples: ungrammatical sentences keep their
# category as label, grammatical sentences are labelled "G" (see prepare_data).
prepared_data = prepare_data(errors_list, grammatical_list)
print(f"prepared data senteces: {len(prepared_data)}")
# Tabular view of the examples, used by the labelling, saving and splitting cells.
prepared_data_df = pd.DataFrame(prepared_data)

prepared data senteces: 34940


In [None]:
# Short class names used as the final labels.
label_mapping = {
    'G': 'G',
    'Preposition': 'PREP',
    'determiner_noun_agreement': 'DET',
    'subject_verb_agreement': 'SVA'
}

# Translate with a fallback to the current value so the cell is safe to re-run:
# a plain .map(label_mapping) turns every value that is not a key of the mapping
# (including the already shortened 'PREP'/'DET'/'SVA') into NaN.
prepared_data_df['labels'] = prepared_data_df['labels'].map(
    lambda label: label_mapping.get(label, label)
)

# Fail loudly on anything that is not one of the four expected classes.
unexpected_labels = set(prepared_data_df['labels']) - set(label_mapping.values())
if unexpected_labels:
    raise ValueError(f"Unexpected labels after mapping: {sorted(unexpected_labels, key=str)}")

prepared_data_df.head()

Unnamed: 0,text,labels,UID
0,A niece of most senators haven't descended mos...,SVA,distractor_agreement_relational_noun
1,The sketch of those trucks haven't hurt Alan.,SVA,distractor_agreement_relational_noun
2,A newspaper article about the Borgias have dis...,SVA,distractor_agreement_relational_noun
3,The niece of most guests have cleaned every co...,SVA,distractor_agreement_relational_noun
4,A sketch of lights don't appear.,SVA,distractor_agreement_relational_noun


In [None]:
# Class distribution of the full prepared dataset (number of rows per label).
prepared_data_df.groupby(prepared_data_df['labels'].tolist(),as_index=False).size()

Unnamed: 0,index,size
0,DET,7542
1,G,17470
2,PREP,4393
3,SVA,5535


In [None]:
# Save the error-class items, then show how many items each error category has.
errors_df = pd.DataFrame(errors_list)
errors_df.to_csv(os.path.join(data_path, "errors_class_data.csv"), index=False)
errors_df.groupby(errors_df['linguistics_term'].tolist(),as_index=False).size()

Unnamed: 0,index,size
0,Preposition,4393
1,determiner_noun_agreement,7542
2,subject_verb_agreement,5535


In [None]:
# Save the grammatical-class items, then show which BLiMP categories they come from.
grammatical_df = pd.DataFrame(grammatical_list)
grammatical_df.to_csv(os.path.join(data_path, "g_class_data.csv"), index=False)
grammatical_df.groupby(grammatical_df['linguistics_term'].tolist(),as_index=False).size()

Unnamed: 0,index,size
0,Preposition,3001
1,anaphor_agreement,231
2,argument_structure,786
3,binding,798
4,control_raising,532
5,determiner_noun_agreement,5297
6,ellipsis,203
7,filler_gap_dependency,763
8,irregular_forms,228
9,island_effects,284


In [None]:
# Persist the full labelled dataset (text, labels, UID) without the DataFrame index.
prepared_data_df.to_csv(os.path.join(data_path, "all_prepared_data.csv"), index=False)

In [None]:
# Stratified 60/40 split by label: 60% of the rows become the training set, the
# remaining 40% are held out; random_state fixes the shuffle so the split is repeatable.
train_df, dev_test_df = train_test_split(prepared_data_df, train_size=0.6, random_state=42, stratify=prepared_data_df['labels'])

In [None]:
# Halve the held-out rows, again stratified by label. Note the target order: the first
# returned part (the train_size=0.5 share) is the test set, the second is the dev set.
test_df, dev_df = train_test_split(dev_test_df, train_size=0.5, random_state=42, stratify=dev_test_df['labels'])

In [None]:
# Class distribution of the training split (number of rows per label).
train_df.groupby(train_df['labels'].tolist(),as_index=False).size()

Unnamed: 0,index,size
0,DET,4525
1,G,10482
2,PREP,2636
3,SVA,3321


In [None]:
# Class distribution of the held-out rows before they are divided into dev and test.
dev_test_df.groupby(dev_test_df ['labels'].tolist(),as_index=False).size()

Unnamed: 0,index,size
0,DET,3017
1,G,6988
2,PREP,1757
3,SVA,2214


In [None]:
# Class distribution of the test split (number of rows per label).
test_df.groupby(test_df['labels'].tolist(),as_index=False).size()

Unnamed: 0,index,size
0,DET,1508
1,G,3494
2,PREP,879
3,SVA,1107


In [None]:
# Class distribution of the dev split (number of rows per label).
dev_df.groupby(dev_df['labels'].tolist(),as_index=False).size()

Unnamed: 0,index,size
0,DET,1509
1,G,3494
2,PREP,878
3,SVA,1107


In [None]:
# Persist every split next to the other data files, without the DataFrame index.
for csv_name, split_frame in (
    ("dev_data.csv", dev_df),
    ("test_data.csv", test_df),
    ("train_data.csv", train_df),
    ("dev_test_no_split_data.csv", dev_test_df),
):
    split_frame.to_csv(os.path.join(data_path, csv_name), index=False)

In [None]:
# Report the whitespace-token totals of the full dataset and of each split.
for split_label, split_frame in (
    ("All data", prepared_data_df),
    ("Train", train_df),
    ("Dev", dev_df),
    ("Test", test_df),
):
    print(f"{split_label} tokens:")
    count_data_tokens(split_frame)


All data tokens:
325046
Train tokens:
194474
Dev tokens:
65097
Test tokens:
65475
