In [1]:
import os
import json

In [2]:
def read_non_filter_data(data_path):
    """
    Reads non-filter data from a JSON file.
    Args:
		data_path (str): Path to the JSON file.
    Returns:
    	dict: Parsed JSON data.
	"""
    with open(data_path, 'r') as input_file:
        data = json.load(input_file)
    return data

In [3]:
def read_train_data(data_directory_path):
  """
  Read all training data from a directory.
    Args:
			data_directory_path (str): Path to the directory containing training data files.
    Returns:
    	list: List of sentences from all files in the directory.
	"""
  all_data = []
  total_sentence = 0
  for filename in os.listdir(data_directory_path):
    file_path = os.path.join(data_directory_path, filename)
    if os.path.isfile(file_path):
      print(f'Reading file: {filename}')
      with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
          total_sentence+=1
          all_data.append(line.strip())
  print(f"total sentece in directory: {total_sentence}")
  return all_data

In [15]:
def filter_data(data, freq_words_list, uid):
    """
    Filters data based on the presence of words in a frequency list.
    Args:
			data (list): List of dictionaries containing sentences.
			freq_words_list (set): Set of frequent words to filter by.
			uid (str): Unique identifier for the filtered data.
    Returns:
			list: Filtered list of dictionaries containing sentences.
    """
    filtered_data = []
    for prep in data:
        flag = True
        sent_good = prep['sentence_good']
        sent_bad = prep["sentence_bad"]
        prep['UID'] = uid
        for word_good, word_bad in zip(sent_good.split(),sent_bad.split()):
            if not word_good in freq_words_list or not word_bad in freq_words_list:
                flag = False
                break
        if flag:
            filtered_data.append(prep)

    return filtered_data

In [5]:
def frequency_of_train_data(train_data):
    """
    Calculate the frequency of words in the training data.
    Args:
		train_data (list): List of sentences.
    Returns:
    	dict: Dictionary with words as keys and their frequencies as values.
	"""
    words_count = dict()
    for sentence in train_data:
        for word in sentence.split():
            if word in words_count:
                words_count[word] +=1
            else:
                words_count[word] =1
    return words_count

In [6]:
def freq_filter(word_and_freq_dict, word_freq):
    """
    Filters words based on their frequency.
    Args:
		word_and_freq_dict (dict): Dictionary with words as keys and their frequencies as values.
		word_freq (int): Minimum frequency threshold.
    Returns:
		list: List of words that meet the frequency threshold.
    """
    all_words = 0
    freq_words = 0
    freq_data_list = []
    for word, freq in word_and_freq_dict.items():
        all_words +=1
        if freq > word_freq:
            freq_words+=1
            freq_data_list.append(word)
    print(all_words, freq_words)
    return freq_data_list

In [7]:
def merge_filer_perps(prep_r, prep_m, prep_u):
    """
    Merges three preposition lists.
    Args:
		prep_r (list): List of dictionaries containing replacing_preposition.
		prep_m (list): List of dictionaries containing missing_preposition.
		prep_u (list): List of dictionaries containing unnessary_preposition.
    Returns:
		list: Merged list of all preposition types.
    """
    merged_list = prep_r + prep_m + prep_u
    return merged_list

In [8]:
def save_filter_prep(list_preps, save_path):
    """
    Saves the filtered list of dictionaries to a JSON file.
    Args:
		list_preps (list): List of dictionaries containing sentences.
		save_path (str): Path to the output JSON file.
	"""
    with open(save_path, 'w') as out_file:
        json.dump(list_preps, out_file)

In [9]:
# Reading babyLM challenge train data
train_directory_path = '/content/drive/MyDrive/VU Thesis/Code/BLiMP_style_code/data/babylm_10M'
sequences = read_train_data(train_directory_path)
print('='*50)

Reading file: wikipedia.train
Reading file: gutenberg.train
Reading file: open_subtitles.train
Reading file: simple_wikipedia.train
Reading file: qed.train
Reading file: aochildes.train
Reading file: bnc_spoken.train
Reading file: children_stories.train
Reading file: switchboard.train
Reading file: cbt.train
total sentece in directory: 1058740


In [10]:
# Calculating frequency of words in the training data
words_count = frequency_of_train_data(sequences)

In [11]:
# Filtering training data words based on their frequency
final_list = freq_filter(words_count, 2)


437728 125991


In [16]:
# Reading and filtering unnecessary preposition data
prep_data_u = read_non_filter_data('/content/drive/MyDrive/VU Thesis/Code/BLiMP_style_code/data/blimp_prep_u.json')
filtered_u = filter_data(prep_data_u, final_list, "Preposition_Unnecessary")
len(filtered_u), len(prep_data_u)

(866, 1075)

In [17]:
# Reading and filtering replacing preposition data
prep_data_r = read_non_filter_data('/content/drive/MyDrive/VU Thesis/Code/BLiMP_style_code/data/blimp_prep_r.json')
filtered_r = filter_data(prep_data_r, final_list, "Preposition_Replacing")
len(filtered_r), len(prep_data_r)

(2374, 3055)

In [18]:
# Reading and filtering missing preposition data
prep_data_m = read_non_filter_data('/content/drive/MyDrive/VU Thesis/Code/BLiMP_style_code/data/blimp_prep_m.json')
filtered_m = filter_data(prep_data_m, final_list, "Preposition_Missing")
len(filtered_m), len(prep_data_m)

(1154, 1414)

In [21]:
# Merging filtered preposition data
final_prep = merge_filer_perps(filtered_r, filtered_m, filtered_u)
len(final_prep)

4394

In [22]:
# Saving the merged filtered preposition data to a JSON file
save_filter_prep(final_prep, 'preposition.json')

In [23]:
# check words with '-'
count = 0
for k, v in words_count.items():
    if v>=2 and '-' in k:
        count+=1
        print(k,": ", v)
print(f"{'='*50}\nThere is {count} words with '-'")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
-Let's :  30
-Which :  14
-They :  46
in-- :  8
-Frank, :  3
-Who :  42
phone-- :  2
-Watch :  5
-This :  57
-Fuck. :  2
-Look, :  10
-They're :  18
-She's :  28
-ln :  2
-Hello? :  6
-Listen, :  7
-Hey! :  10
-What'd :  4
-Nothing. :  10
-Thanks, :  8
-For :  21
-Thanks. :  23
mother-in-law's :  4
-Yes. :  113
-Objection! :  3
-Objection :  2
sister-in-Iaw :  4
-Yes! :  13
-A :  59
father-son :  4
sister-in-Iaw. :  7
-Yes? :  14
-Sir! :  2
-Sir, :  3
-Action! :  4
-Thats :  4
sister-in-Iaw? :  2
Sister-in-Iaw! :  2
sister-in-Iaw, :  2
sister-in-Iaw! :  4
-Yeah! :  9
Um-- :  5
a-- :  34
uh-- :  36
thing-- :  11
of-- :  24
You- :  3
Uh-- :  6
I-- :  43
I- :  51
don't-- :  18
is-- :  24
or-- :  9
gonna-- :  10
does-- :  2
mean-- :  9
Uh- :  5
left-- :  2
need-- :  2
life-- :  2
truth-- :  2
um-- :  14
Uh-huh. :  1125
herself-- :  2
figures-- :  2
gun-- :  2
Uh-oh. :  30
if-- :  6
this-- :  14
see-- :  6
be- :  7
that- :  10