In [1]:
from datasets import load_dataset

# Load the train / validation / test splits of the Korean (ko-KR) MASSIVE config.
az_massive_train, az_massive_valid, az_massive_test = (
    load_dataset("AmazonScience/massive", "ko-KR", split=split_name)
    for split_name in ('train', 'validation', 'test')
)

In [2]:
# dataset count: report the number of examples in each raw split
for split_label, split_data in (
    ("Train Dataset : ", az_massive_train),
    ("Valid Dataset : ", az_massive_valid),
    ("Test Dataset  : ", az_massive_test),
):
    print(split_label, len(split_data))

Train Dataset :  11514
Valid Dataset :  2033
Test Dataset  :  2974


In [3]:
def filter_empty_slots(dataset):
    """Drop examples without slot annotations or with untranslated slots.

    An example is kept only when ``example['slot_method']['slot']`` is not an
    empty list and ``'unchanged_translation'`` does not appear in
    ``example['slot_method']['method']``.

    Args:
        dataset: Object exposing ``filter(predicate)``; the predicate receives
            one example at a time.

    Returns:
        Whatever ``dataset.filter`` returns for the predicate above.
    """
    def _has_usable_slots(example):
        slot_info = example['slot_method']
        # Guard clause: nothing to tag when no slot was annotated.
        if slot_info['slot'] == []:
            return False
        return 'unchanged_translation' not in slot_info['method']

    return dataset.filter(_has_usable_slots)


# Apply the same slot filter to every split, keeping the raw splits untouched.
filtered_train, filtered_valid, filtered_test = (
    filter_empty_slots(raw_split)
    for raw_split in (az_massive_train, az_massive_valid, az_massive_test)
)

# Report how many examples survived the filter in each split.
print(f"Original train size: {len(az_massive_train)}, Filtered train size: {len(filtered_train)}")
print(f"Original valid size: {len(az_massive_valid)}, Filtered valid size: {len(filtered_valid)}")
print(f"Original test size: {len(az_massive_test)}, Filtered test size: {len(filtered_test)}")

Original train size: 11514, Filtered train size: 7757
Original valid size: 2033, Filtered valid size: 1387
Original test size: 2974, Filtered test size: 1980


In [4]:
# Display the filtered training split's summary (feature names and row count).
filtered_train

Dataset({
    features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
    num_rows: 7757
})

In [5]:
intents_dict = {
    'datetime_query': '날짜_및_시간_조회',
    'iot_hue_lightchange': 'IoT_조명_변경',
    'transport_ticket': '교통_티켓',
    'takeaway_query': '테이크아웃_문의',
    'qa_stock': '주식_문의',
    'general_greet': '일반_인사',
    'recommendation_events': '이벤트_추천',
    'music_dislikeness': '음악_비호감',
    'iot_wemo_off': 'IoT_Wemo_끄기',
    'cooking_recipe': '요리_레시피',
    'qa_currency': '환율_문의',
    'transport_traffic': '교통_혼잡도',
    'general_quirky': '일반_독특함',
    'weather_query': '날씨_문의',
    'audio_volume_up': '오디오_볼륨_올리기',
    'email_addcontact': '이메일_연락처_추가',
    'takeaway_order': '테이크아웃_주문',
    'email_querycontact': '이메일_연락처_조회',
    'iot_hue_lightup': 'IoT_조명_밝게_하기',
    'recommendation_locations': '장소_추천',
    'play_audiobook': '오디오북_재생',
    'lists_createoradd': '목록_생성_또는_추가',
    'news_query': '뉴스_문의',
    'alarm_query': '알람_조회',
    'iot_wemo_on': 'IoT_Wemo_켜기',
    'general_joke': '일반_농담',
    'qa_definition': '정의_문의',
    'social_query': '소셜_문의',
    'music_settings': '음악_설정',
    'audio_volume_other': '오디오_볼륨_기타_설정',
    'calendar_remove': '캘린더_삭제',
    'iot_hue_lightdim': 'IoT_조명_어둡게_하기',
    'calendar_query': '캘린더_조회',
    'email_sendemail': '이메일_보내기',
    'iot_cleaning': 'IoT_청소',
    'audio_volume_down': '오디오_볼륨_낮추기',
    'play_radio': '라디오_재생',
    'cooking_query': '요리_문의',
    'datetime_convert': '날짜_및_시간_변환',
    'qa_maths': '수학_문의',
    'iot_hue_lightoff': 'IoT_조명_끄기',
    'iot_hue_lighton': 'IoT_조명_켜기',
    'transport_query': '교통_조회',
    'music_likeness': '음악_호감',
    'email_query': '이메일_문의',
    'play_music': '음악_재생',
    'audio_volume_mute': '오디오_볼륨_음소거',
    'social_post': '소셜_게시',
    'alarm_set': '알람_설정',
    'qa_factoid': '사실_문의',
    'calendar_set': '캘린더_설정',
    'play_game': '게임_재생',
    'alarm_remove': '알람_삭제',
    'lists_remove': '목록_제거',
    'transport_taxi': '택시',
    'recommendation_movies': '영화_추천',
    'iot_coffee': 'IoT_커피',
    'music_query': '음악_문의',
    'play_podcasts': '팟캐스트_재생',
    'lists_query': '목록_조회'
}

_INTENTS_KO = [value for key, value in intents_dict.items()]

# ## Save the intent dict as text (spaces in labels replaced by "_", one "key: value" per line) — currently disabled

# def intent_dict_save(file_path, intents_dict):
#     intents_dict_modified = {k: v.replace(" ", "_") for k, v in intents_dict.items()}
#     dict_str = "\n".join([f"{key}: {value}" for key, value in intents_dict_modified.items()])
#     with open(file_path, "w", encoding="utf-8") as file:
#         file.write(dict_str)
#     return intents_dict_modified
        
# file_path = '/home/oks/oks/people/mj/[2024]OKS/DATA_eda/intent_dic'
# intent_dict_update = intent_dict_save(file_path, intents_dict)


In [6]:
# English slot name -> Korean label (words joined with underscores).
# The labels are used to build the BIO tags ("B_<label>" / "I_<label>"), so
# every slot type needs its own distinct label.
slot_dict = {
    'sport_type': '운동_유형',
    'food_type': '음식_유형',
    'place_name': '장소_이름',
    'device_type': '장치_유형',
    'music_album': '음악_앨범',
    'currency_name': '화폐_이름',
    'definition_word': '정의_단어',
    'time': '시간',
    'transport_type': '교통_유형',
    'person': '사람',
    'business_name': '비즈니스_이름',
    'general_frequency': '일반_빈도',
    'player_setting': '플레이어_설정',
    'radio_name': '라디오_이름',
    'personal_info': '개인_정보',
    'ingredient': '재료',
    'event_name': '이벤트_이름',
    'playlist_name': '재생목록_이름',
    'song_name': '노래_이름',
    'movie_type': '영화_유형',
    'movie_name': '영화_이름',
    'coffee_type': '커피_유형',
    'drink_type': '음료_유형',
    'transport_descriptor': '교통_설명자',
    'audiobook_name': '오디오북_이름',
    'house_place': '집_장소',
    'transport_agency': '교통_기관',
    'date': '날짜',
    'music_genre': '음악_장르',
    'business_type': '비즈니스_유형',
    'game_type': '게임_유형',
    'game_name': '게임_이름',
    'podcast_descriptor': '팟캐스트_설명',
    'cooking_type': '요리_유형',
    'email_folder': '이메일_폴더',
    'meal_type': '식사_유형',
    'podcast_name': '팟캐스트_이름',
    'email_address': '이메일_주소',
    'app_name': '앱_이름',
    'order_type': '주문_유형',
    'transport_name': '교통_이름',
    'color_type': '색상_유형',
    'weather_descriptor': '날씨_설명',
    'change_amount': '변경_금액',
    # Previously '시간대', identical to the 'timeofday' label below, which made
    # the two slot types indistinguishable in the generated tags.
    'time_zone': '표준_시간대',
    'joke_type': '농담_유형',
    'news_topic': '뉴스_주제',
    'media_type': '미디어_유형',
    'timeofday': '시간대',
    'alarm_type': '알람_유형',
    'list_name': '목록_이름',
    'music_descriptor': '음악_설명',
    'artist_name': '아티스트_이름',
    'audiobook_author': '오디오북_저자',
    'relation': '관계'
}

In [7]:
import re

# Regex with one capture group for the (shortest) text between "[" and "]".
# NOTE(review): no code visible in this notebook reads this module-level name
# (parse_text builds its own regex) — confirm before removing.
pattern = r'\[(.*?)\]'

# _INTENTS = ['datetime_query', 'iot_hue_lightchange', 'transport_ticket', 'takeaway_query', 'qa_stock',
#             'general_greet', 'recommendation_events', 'music_dislikeness', 'iot_wemo_off', 'cooking_recipe',
#             'qa_currency', 'transport_traffic', 'general_quirky', 'weather_query', 'audio_volume_up',
#             'email_addcontact', 'takeaway_order', 'email_querycontact', 'iot_hue_lightup',
#             'recommendation_locations', 'play_audiobook', 'lists_createoradd', 'news_query',
#             'alarm_query', 'iot_wemo_on', 'general_joke', 'qa_definition', 'social_query',
#             'music_settings', 'audio_volume_other', 'calendar_remove', 'iot_hue_lightdim',
#             'calendar_query', 'email_sendemail', 'iot_cleaning', 'audio_volume_down',
#             'play_radio', 'cooking_query', 'datetime_convert', 'qa_maths', 'iot_hue_lightoff',
#             'iot_hue_lighton', 'transport_query', 'music_likeness', 'email_query', 'play_music',
#             'audio_volume_mute', 'social_post', 'alarm_set', 'qa_factoid', 'calendar_set',
#             'play_game', 'alarm_remove', 'lists_remove', 'transport_taxi', 'recommendation_movies',
#             'iot_coffee', 'music_query', 'play_podcasts', 'lists_query']


def parse_text(text):
    """Split an annotated utterance into bracketed slot groups and plain words.

    Each ``[...]`` group (shortest match) is kept as a single token; everything
    else is split on whitespace.

    Example:
        '[date : 금요일] [time : 오전 아홉 시] 에 깨워줘'
        -> ['[date : 금요일]', '[time : 오전 아홉 시]', '에', '깨워줘']

    Args:
        text: Annotated utterance string.

    Returns:
        List of token strings in their order of appearance.
    """
    return re.findall(r'\[.*?\]|\S+', text)

def matching_intent(idx_intent, _INTENTS_KO):
    """Look up the intent label stored at position ``idx_intent``.

    Args:
        idx_intent: Integer index of the intent.
        _INTENTS_KO: Sequence of intent labels indexed by intent id.

    Returns:
        The element of ``_INTENTS_KO`` at ``idx_intent``.
    """
    return _INTENTS_KO[idx_intent]

def slot_extract(annot_utt, slot_dict):
    """Turn an annotated utterance into per-token BIO slot tags.

    The utterance is tokenized with ``parse_text``. A token of the form
    ``[slot_name : value words]`` contributes one entry per whitespace-separated
    word of the value: the first word is tagged ``B_<label>`` and the following
    ones ``I_<label>``, where ``<label>`` is ``slot_dict[slot_name]``. Every
    other token is tagged ``O``.

    Args:
        annot_utt: Annotated utterance, e.g.
            ``'[date : 금요일] [time : 오전 아홉 시] 에 깨워줘'``.
        slot_dict: Mapping from slot name to the label used in the tags.

    Returns:
        A tuple ``(result, output_slot)`` where ``result`` is a list of
        ``{"text": word, "entity": tag}`` dicts in token order and
        ``output_slot`` is the tags joined by single spaces.

    Raises:
        KeyError: If a bracketed slot name is missing from ``slot_dict``.
    """
    result = []

    for word in parse_text(annot_utt):
        # Only a complete "[name : value]" group is a slot annotation. A bare
        # token that merely contains ":" (e.g. "3:30") is ordinary text.
        is_slot = word.startswith("[") and word.endswith("]") and ":" in word
        if not is_slot:
            result.append({"text": word, "entity": "O"})
            continue

        # Split on the FIRST colon only so values such as "5:30" stay intact.
        entity_name, _, entity_value = word[1:-1].partition(":")
        entity_name_ko = slot_dict[entity_name.strip()]

        # Tag by position, not by value: a value like "뉴욕 뉴욕" must yield
        # B_ then I_, even though both words are equal.
        for position, token in enumerate(entity_value.split()):
            prefix = "B_" if position == 0 else "I_"
            result.append({"text": token, "entity": prefix + entity_name_ko})

    output_slot = " ".join(item["entity"] for item in result)
    return result, output_slot

In [12]:
# # intent : English > Korean (make dict)

# for idx in filtered_test:
#     intent_class = matching_intent(idx['intent'], _INTENTS_KO)
#     if intent_class == '이메일_문의':
#         print(idx)
        
# # ## NOTE: 'id': '7353' looks mislabeled — the utterance asks about the weather, yet its intent index is 44 (이메일_문의 / email_query)
# # ## 

{'id': '7353', 'locale': 'ko-KR', 'partition': 'test', 'scenario': 7, 'intent': 44, 'utt': '다음 주 강원도 날씨 알려줘', 'annot_utt': '[time : 다음 주] [place_name : 강원도] 날씨 알려줘', 'worker_id': '38', 'slot_method': {'slot': ['time', 'place_name'], 'method': ['translation', 'localization']}, 'judgments': {'worker_id': ['16', '3', '30'], 'intent_score': [1, 1, 0], 'slots_score': [1, 1, 1], 'grammar_score': [4, 4, 4], 'spelling_score': [2, 2, 2], 'language_identification': ['target', 'target', 'target']}}
{'id': '15809', 'locale': 'ko-KR', 'partition': 'test', 'scenario': 7, 'intent': 44, 'utt': '시우가 아직 이메일 안 보냈나', 'annot_utt': '[person : 시우가] 아직 이메일 안 보냈나', 'worker_id': '14', 'slot_method': {'slot': ['person'], 'method': ['localization']}, 'judgments': {'worker_id': ['5', '16', '30'], 'intent_score': [1, 1, 1], 'slots_score': [1, 1, 1], 'grammar_score': [4, 4, 4], 'spelling_score': [2, 2, 2], 'language_identification': ['target', 'target', 'target']}}
{'id': '15862', 'locale': 'ko-KR', 'partition': 'te

In [9]:
def making_data_structure(idx):
    """Build the export record for a single dataset example.

    Args:
        idx: One example (mapping) providing the keys ``'id'``, ``'intent'``,
            ``'utt'`` and ``'annot_utt'``.

    Returns:
        A one-entry dict ``{example id: record}`` where the record holds the
        raw utterance (``"text"``), the intent label (``"intent"``), the
        space-joined slot tags (``"slot_out"``) and the per-token slot list
        (``"slot_mapping"``).
    """
    intent_label = matching_intent(idx['intent'], _INTENTS_KO)

    # slot tags and the per-token mapping come from the annotated utterance
    slot_mapping, slot_tags = slot_extract(idx['annot_utt'], slot_dict)

    example_id = idx['id']
    record = {
        "text": idx['utt'],
        "intent": intent_label,
        "slot_out": slot_tags,
        "slot_mapping": slot_mapping,
    }
    return {example_id: record}

In [10]:
import json 
import re

# Convert every filtered split into an {example id: record} mapping.
train_json, valid_json, test_json = {}, {}, {}

for split_rows, split_records in (
    (filtered_train, train_json),
    (filtered_valid, valid_json),
    (filtered_test, test_json),
):
    for idx in split_rows:
        result = making_data_structure(idx)
        split_records.update(result)

# Save to JSON file (one file per split, written in the current working directory)
for out_path, split_records in (
    ('train_amazon_massive.json', train_json),
    ('valid_amazon_massive.json', valid_json),
    ('test_amazon_massive.json', test_json),
):
    with open(out_path, 'w', encoding='utf-8') as json_file:
        # ensure_ascii=False keeps the Korean text readable in the output file.
        json.dump(split_records, json_file, ensure_ascii=False, indent=4)

print("JSON file created successfully.")
    

In [11]:
# print(len(train_json))
# print(len(valid_json))
# print(len(test_json))
