#### Dictionary coverage

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from scipy import stats
import statistics
from matplotlib.patches import FancyArrow
import json

plt.rcParams['figure.dpi'] = 900

# 将X/Y轴的刻度线方向设置向内
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'

# 设置字体
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.family'] = 'Microsoft Yahei'

# 设置公式字体
config = {
    "mathtext.fontset": 'stix',
}
rcParams.update(config)
colors = ['#3a3a8c', '#7daae2', '#fae084', '#f47044', '#cf3d3c', '#a5514e']


# 搜索词组的句子
# def search_phrase(filename, phrase):
#     local_data = []
#     text = []
#     with open('./data/' + filename, 'r', encoding='utf-8') as file:
#         local_data = json.load(file)
#     for ele in local_data:
#         if phrase in ele['phrase']:
#             text.append(ele['text'].strip())
#     return text
    

def appear_times_for_single(data, phrase):
    times = 0
    for i,ele in enumerate(data):
        if phrase in list(ele['phrase'].keys()):
            times += ele['phrase'][phrase]
    return times    


def count_bins(number):
    n_string = '{:.15f}'.format(number)
    decimal_part = str(n_string).split('.')[1]  # 将小数部分转换为字符串并分割
    count = 0
    for digit in decimal_part:
        if digit == '0':
            count += 1
        else:
            break  # 遇到第一个非零数字就停止计数
    return count + 1


# Michel 2007 计算频区间的中心值
def get_center_value(freqs):
    maxi = np.log10(max(freqs))
    minu = np.log10(min(freqs))
    center = (maxi + minu) / 2  # logarithmic mean 对数均值
    return 10**center

##### 选择数据集版本

In [4]:
filename = 'en_data_final_1.json'

##### 加载Json到内存

In [5]:
local_data = []
with open('./data/'+filename, 'r', encoding='utf-8') as file:
    local_data = json.load(file)
print(len(local_data))
print(local_data[0])

# 统计词组类型
phrases = []

for ele in local_data:
    text = ele['text']
    phrase_list = ele['phrase']
    for p in phrase_list:
        phrases.append(p)

from collections import Counter

latin_phrases = []
apper_times = {}
# 使用Counter统计元素个数
element_counts = Counter(phrases)
# 按照出现次数降序排序
sorted_counts = sorted(element_counts.items(), key=lambda x: x[1], reverse=True)
# 输出排序结果
for element, count in sorted_counts:
    print(f"{element}: {count}")
    latin_phrases.append(element)
    apper_times[element] = count

# print(latin_phrases)

# 保存数据集中出现的拉丁词组
# with open('./data/data readability/latin phrases.txt', 'w', encoding='utf-8') as file:
#     for pha in latin_phrases:
#         file.write(pha + '\n')
# file.close()

In [None]:
# 查找语句
def search_phrase(phrase, data):
    results = []
    for ele in data:
        # if len(ele['phrase'].keys()) > 1:
        #     print(ele['phrase'])
        if phrase in ele['phrase'].keys():
            results.append(ele)
    return results
        
        
pha = 'alter ego'
len(search_phrase(pha, local_data))

4932

##### 获取只有一个拉丁词组的语句

In [5]:
selected_phrases = []
with open('./data/data readability/latin phrases.txt', 'r', encoding='utf-8') as file:
    contents = file.readlines()
    print(len(contents))
    for ele in contents:
        selected_phrases.append(ele.strip())
# print(selected_phrases)

# 将语句的数量限制在100条以内，一个词组对应的语句不超过100条
phrase_text_dict = {}
for i, pha in enumerate(selected_phrases):
    # if i+1 > 1:
    #     break
    # print(i+1, pha)
    ele_list = []
    for ele in search_phrase(pha, local_data):
        # print(ele)
        # 对于{'de facto': 12, 'de jure': 1}，跳过包含多个词组的语句
        # {'ex vivo': 2} 一句话中包含多个相同词组的情况，跳过
        if len(ele['phrase'].keys())==1 and  ele['phrase'][pha]==1 and len(ele['text'])==1:
            ele_list.append(ele)
            # print(type(ele), ele)
    # print(len(ele_list))
    # 获取词组和对应的语句
    text_list = []
    for j, pha_dict in enumerate(ele_list):
        # if j > 2:
        #     break
        # print(j+1, pha_dict)
        try:
            # 将string文本 转换成 dict
            text_string = str(pha_dict['text'])
            text_dict = json.loads(text_string.replace("'", "\""))

            text = text_dict[pha].strip()
            if pha not in text:
                continue
            text_list.append(text)
            # print(pha, text)
        # 按照上面的 string-->dict 有一部分的语句可能会转换失败
        # 对于出现错误的语句，不用处理，直接跳过
        except Exception as e:
            continue
    if len(text_list) != 0:
        phrase_text_dict[pha] = text_list
        print(i+1, pha, len(text_list))

# print(len(phrase_text_dict['status quo']))

760
1 status quo 21191
2 de facto 15196
3 per capita 12715
4 vice versa 8245
5 per se 6709
6 alma mater 7437
7 pros and cons 7038
8 per annum 6466
9 ad hoc 5664
10 bona fide 4375
11 in fine 4656
12 in retrospect 3438
13 alter ego 2549
14 modus operandi 2691
15 pro bono 2629
16 in vitro 1783
17 a cappella 1899
18 in absentia 2051
19 quid pro quo 1717
20 et cetera 1088
21 prima facie 1417
22 in situ 1826
23 pro forma 1168
24 inter alia 1268
25 habeas corpus 1225
26 post mortem 1439
27 mea culpa 1121
28 in vivo 575
29 magnum opus 930
30 per diem 782
31 ad hominem 685
32 ad nauseam 833
33 persona non grata 706
34 pro tem 745
35 in utero 696
36 in camera 577
37 a priori 530
38 spina bifida 556
39 de jure 219
40 ad infinitum 544
41 pro rata 502
42 sine qua non 498
43 suo motu 489
44 in pace 452
45 curriculum vitae 512
46 pro tempore 410
47 ex parte 328
48 amicus curiae 349
49 de novo 282
50 terra firma 387
51 aurora borealis 312
52 sui generis 341
53 non sequitur 300
54 ipso facto 300
55 ad 

In [10]:
import random


# 将排序好的列表分成 number 等份， 再获取中间的元素
def get_middle_values(sorted_list, number):
    chunk_size = len(sorted_list) // number
    # 从每份中选取中间的元素
    middle_elements = []
    for i in range(number):
        start_index = i * chunk_size
        end_index = start_index + chunk_size
        middle_index = (start_index + end_index) // 2
        middle_elements.append(sorted_list[middle_index])
    return middle_elements


# 词组和对应的语句列表
using_dict = {}
# 获取语料
number = 0
for i, (pha, texts) in enumerate(phrase_text_dict.items()):
    # if (i+1) > 1:
    #     break
    # print(pha)
    # 如果语句的数量超过 50条，从中随机选取 50条作为实验对象
    selected_texts = []
    if len(texts) >= 50:
        # 按照文本字符数量排序，均匀选取
        sorted_texts = sorted(texts, key=lambda x: len(x))
        # selected_texts = random.sample(texts, 10)
        selected_texts = get_middle_values(sorted_texts, 50)
    else:
        selected_texts = texts
    # print(pha, len(selected_texts))
    number += len(selected_texts)
    using_dict[pha] = selected_texts
    
print('词组数量:', len(phrase_text_dict))
print('语句数量:', number)

# save dict, using LLMs generate
with open('./data/llm/LE-dataset v1.2.json', 'w') as file:
    json.dump(using_dict, file, indent=4)
file.close()        

词组数量: 527
语句数量: 8003


In [14]:
import json

def compare_json_files(file1_path, file2_path):
    # 读取第一个 JSON 文件并加载为字典
    with open(file1_path, 'r') as file1:
        json_data1 = json.load(file1)

    # 读取第二个 JSON 文件并加载为字典
    with open(file2_path, 'r') as file2:
        json_data2 = json.load(file2)

    # 比较两个字典是否相同
    return json_data1 == json_data2

# 两个 JSON 文件的路径
file1_path = './data/llm/LE-dataset v1.0.json'
file2_path = './data/llm/LE-dataset v1.1.json'

# 比较两个 JSON 文件是否相同
result = compare_json_files(file1_path, file2_path)

if result:
    print("两个 JSON 文件相同")
else:
    print("两个 JSON 文件不同")


两个 JSON 文件相同
