In [1]:
import argparse
import copy
import json
import os
import random
import re
import sys

import datasets
import numpy as np
import torch
import transformers
from datasets import Dataset
from torch.nn.utils.rnn import pad_sequence


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _parse_votes(votes):
    """Convert a vote count written with the '萬' (x10,000) suffix to an int.

    Values that are not strings, or strings without '萬', are returned unchanged.
    """
    if isinstance(votes, str) and '萬' in votes:
        # Go through float() so fractional counts such as "1.2萬" parse;
        # round() guards against float error before truncating to int.
        return int(round(float(votes.replace('萬', '')) * 10000))
    return votes


def read_json_files(path, max_file_num:int = None, shuffle = True):
    """Load records from every ``*.json`` file found under ``path``.

    The directory tree is walked recursively; files named ``star_record.json``
    are skipped. Each file is expected to hold a JSON list of records, and each
    record must have a ``'votes'`` key (a missing key raises ``KeyError``).
    Vote counts written as strings with a '萬' suffix are converted to integers.

    Args:
        path: Root directory to search.
        max_file_num: Maximum number of files to read. ``None`` reads all
            files; ``0`` or a negative value reads none.
        shuffle: If true, the file order is shuffled with the global ``random``
            module before the ``max_file_num`` limit is applied.

    Returns:
        A single flat list with the records of all files that were read,
        in file order.
    """
    # Collect the candidate file paths.
    files = []
    for root, _dirs, filenames in os.walk(path):
        for file in filenames:
            if file.endswith('.json') and file != "star_record.json":
                files.append(os.path.join(root, file))
    # os.walk yields entries in filesystem order; sort so the result is
    # reproducible (and, when shuffling, reproducible for a seeded RNG).
    files.sort()
    if shuffle:
        random.shuffle(files)
    # Apply the limit before reading so that max_file_num=0 reads nothing.
    if max_file_num is not None:
        files = files[:max(max_file_num, 0)]

    # Read the files.
    content_list = []
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            content = json.load(f)
        for record in content:
            record['votes'] = _parse_votes(record['votes'])
        content_list += content
    return content_list


In [3]:
# Emotion labels used in the data set, mapped to the wording used in the prompt.
_MOOD_LABELS = {
    'like': '喜歡',
    'happiness': '開心',
    'sadness': '難過',
    'disgust': '厭惡',
    'anger': '生氣',
    'surprise': '驚訝',
    'fear': '害怕',
    'none': '中立',
}


def get_prompt(title:str, description:str, star_num:str, mood:str) -> str:
    '''Format the instruction as a prompt for LLM.

    Args:
        title: Video title inserted into the prompt.
        description: Video description inserted into the prompt.
        star_num: Star rating. The first run of digits is used as the rating,
            so both "4.5" and "star 4" are read as 4. Ratings 4-5 request a
            positive comment, 1-2 a negative one, anything else a neutral one.
        mood: Emotion label (e.g. 'like', 'anger'). Known labels are
            translated for the prompt; unknown labels are inserted unchanged.

    Returns:
        The prompt string, prefixed with a random float from random.random().
    '''

    seed = random.random()

    # Take the leading integer of the rating. Splitting on '.' alone leaves
    # values such as 'star 4' intact, so they would never match '4' or '5'.
    rating_match = re.search(r'\d+', str(star_num))
    rating = rating_match.group(0) if rating_match else ''
    if rating in ('4', '5'):
        comment_type = '好評'
    elif rating in ('1', '2'):
        comment_type = '差評'
    else:
        comment_type = '中立評論'

    mood = _MOOD_LABELS.get(mood, mood)

    # Adjacent literals are concatenated with no separator between the parts.
    return (
        f"{seed} 你是人工智慧助理，以下是用戶和人工智能助理之間的對話。你要對用戶的問題提供有用、詳細的回答。"
        f"USER: 請幫這部影片生出對應需求的{comment_type}。影片標題:[{title}]。影片敘述:[{description}]。需求情感:[{mood}]。"
        "ASSISTANT:"
    )


In [8]:
if __name__ == '__main__':
    # Directory handed to read_json_files as the root of the training data.
    PATH = "./train_data/"
    # Read at most 300 files in shuffled order and preview the first five records.
    content = read_json_files(PATH, max_file_num=300, shuffle=True)
    print(content[:5])
    # Build a Dataset from the list of records and print its summary.
    dataset = Dataset.from_list(content)
    print(dataset)


[{'video_id': 'g5fBP8mhzw4', 'video_title': 'Unboxing📦 #15 吃一口回日本🇯🇵四間超道地日料小吃！鰻魚飯、章魚燒、文字燒...「有媽媽的味道🥹」｜阿部瑪利亞 Maria Abe', 'video_description': '這集開箱了四間「超道地的日本料理店」😋\n歡迎大家跟著我去吃吃看喔~🫶\n\n♔ 2𝐧𝐝 𝐒𝐢𝐧𝐠𝐥𝐞 ♔\nʟɪsᴛᴇɴ ɴᴏᴡ▸ https://fanlink.to/abunai...', 'cid': 'Ugy67A1Vb1VNroxxORF4AaABAg', 'comment_text': '這4種日本美食\n每一個看起來都很好吃的感覺🤤\n台灣的日本食物為了迎合台灣人的口味\n而會去調整口味\n不過由阿部桑介紹的這些美食\n有機會來去吃吃看😋', 'votes': 18, 'time': 1691436586.972862, 'star_num': 'star 4', 'mood': 'like'}, {'video_id': 'g5fBP8mhzw4', 'video_title': 'Unboxing📦 #15 吃一口回日本🇯🇵四間超道地日料小吃！鰻魚飯、章魚燒、文字燒...「有媽媽的味道🥹」｜阿部瑪利亞 Maria Abe', 'video_description': '這集開箱了四間「超道地的日本料理店」😋\n歡迎大家跟著我去吃吃看喔~🫶\n\n♔ 2𝐧𝐝 𝐒𝐢𝐧𝐠𝐥𝐞 ♔\nʟɪsᴛᴇɴ ɴᴏᴡ▸ https://fanlink.to/abunai...', 'cid': 'UgyKoa2Dl5AkPGNPIN94AaABAg', 'comment_text': '日本料理真的很好吃～～～\n尤其是迴轉壽司更是一級棒❤❤❤', 'votes': 16, 'time': 1691436586.975106, 'star_num': 'star 5', 'mood': 'like'}, {'video_id': 'g5fBP8mhzw4', 'video_title': 'Unboxing📦 #15 吃一口回日本🇯🇵四間超道地日料小吃！鰻魚飯、章魚燒、文字燒...「有媽媽的味道🥹」｜阿部瑪利亞 Maria Abe', 'video_description': '這集開箱了