In [1]:
import copy
import json
import os
import sys
import numpy as np
import random
import pandas as pd

import datasets
from datasets import Dataset
import torch
import transformers
import argparse


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def _normalize_votes(records) -> None:
    '''
    Convert vote strings carrying the '萬' (ten-thousand) suffix into integers, in place.

    Only string values containing '萬' are touched; every other 'votes' value
    (integers, plain numeric strings, ...) is left exactly as it was loaded.
    '''
    for record in records:
        votes = record['votes']
        if isinstance(votes, str) and '萬' in votes:
            number = votes.replace('萬', '')
            try:
                record['votes'] = int(number) * 10000
            except ValueError:
                # Fractional counts such as '1.2萬' cannot go through int();
                # round() absorbs binary floating-point error and returns an int.
                record['votes'] = round(float(number) * 10000)


def _load_json_records(file_path):
    '''Read one UTF-8 JSON file and return its content with the vote counts normalised.'''
    with open(file_path, 'r', encoding='utf-8') as f:
        records = json.load(f)
    _normalize_votes(records)
    return records


def read_json_files(path) -> pd.DataFrame:
    '''
    Load JSON files into a dataframe with the columns 'root', 'file_name' and 'content'.

    Args:
        path: either a single '.json' file or a directory that is walked recursively.
            Strings and os.PathLike objects are both accepted.

    Returns:
        A dataframe with one row per loaded file:
          - 'root': the directory containing the file; when `path` itself is a
            '.json' file this column holds the given path unchanged.
          - 'file_name': the file's base name.
          - 'content': the parsed JSON content (a list of records), with
            '萬'-suffixed 'votes' strings converted to integers.

    Raises:
        KeyError: if a record has no 'votes' key.
        ValueError: if a '萬'-suffixed vote string is not numeric.

    Files named 'star_record.json' are skipped while walking a directory.
    '''
    path = os.fspath(path)

    # Fast return if the path is a file
    if path.endswith('.json'):
        return pd.DataFrame({
            'root': [path],
            'file_name': [os.path.basename(path)],
            'content': [_load_json_records(path)],
        })

    roots = []
    file_names = []
    contents = []
    for root, _dirnames, filenames in os.walk(path):
        for file in filenames:
            if file.endswith('.json') and file != "star_record.json":
                content = _load_json_records(os.path.join(root, file))
                roots.append(root)
                file_names.append(file)
                contents.append(content)
    return pd.DataFrame({'root': roots, 'file_name': file_names, 'content': contents})


In [3]:
def data_clean(data: pd.DataFrame, seed:int) :
    '''
    Randomly select data from each file and return a new dataframe.

    NOTE: this is currently a placeholder -- the body is only `pass`, so
    neither argument is used and every call returns None.

    Args:
        data: dataframe to sample from (presumably the output of
            read_json_files -- TODO confirm once implemented).
        seed: intended random seed for the selection (unused so far).
    '''
    pass

In [4]:
def get_prompt(title:str, description:str, star_num:str, mood:str) -> str:
    '''
    Build the single-line LLM prompt asking for a comment on a video.

    Args:
        title: video title, inserted verbatim.
        description: video description, inserted verbatim.
        star_num: star rating as a string; only the part before the first '.'
            is inspected ('4'/'5' -> positive, '1'/'2' -> negative, anything
            else -> neutral).
        mood: English emotion label; known labels are translated to Chinese,
            unknown values are inserted unchanged.

    Returns:
        The prompt text, ending with 'ASSISTANT:'.
    '''
    # Only the integer part of the rating decides the comment polarity.
    whole_stars = star_num.split('.')[0]
    if whole_stars in ('4', '5'):
        comment_type = '正面評論'
    elif whole_stars in ('1', '2'):
        comment_type = '負面評論'
    else:
        comment_type = '中立評論'

    mood_labels = {
        'like': '喜歡',
        'happiness': '開心',
        'sadness': '難過',
        'disgust': '厭惡',
        'anger': '生氣',
        'surprise': '驚訝',
        'fear': '害怕',
        'none': '中立',
    }
    # Labels missing from the table fall through untouched.
    mood = mood_labels.get(mood, mood)

    # The three pieces are joined without any separator or newline.
    return (
        "你是人工智慧助理，以下是用戶和人工智能助理之間的對話。你要對用戶的問題提供有用、詳細的回答。"
        f"USER: 請幫這部影片生出對應需求的{comment_type}。影片標題:[{title}]。影片敘述:[{description}]。需求情感:[{mood}]。"
        "ASSISTANT:"
    )


In [7]:
# Script entry point: load every JSON file under PATH and print the resulting dataframe.
if __name__ == '__main__':
    # Directory that read_json_files walks recursively.
    PATH = "./train_data/"
    # One row per loaded JSON file, with the columns 'root', 'file_name' and 'content'.
    content = read_json_files(PATH)
    print(content)


                                      root         file_name  \
0    ./train_data/UC2tRcusVoXSGqUcSu1GO4Ng  0feSmsxPxJU.json   
1    ./train_data/UC2tRcusVoXSGqUcSu1GO4Ng  0qUN__KR2Yo.json   
2    ./train_data/UC2tRcusVoXSGqUcSu1GO4Ng  58hgqzKp9Ww.json   
3    ./train_data/UC2tRcusVoXSGqUcSu1GO4Ng  6jqnfDMOB2A.json   
4    ./train_data/UC2tRcusVoXSGqUcSu1GO4Ng  6zmD-jGdumQ.json   
..                                     ...               ...   
288  ./train_data/UCD2KoUc0f4Bv2Bz0mbOah8g  VqUyMkUsn4U.json   
289  ./train_data/UCD2KoUc0f4Bv2Bz0mbOah8g  wOUaVIK6TG8.json   
290  ./train_data/UCD2KoUc0f4Bv2Bz0mbOah8g  xBb3sBPVABE.json   
291  ./train_data/UCD2KoUc0f4Bv2Bz0mbOah8g  XZf-cVBs5lg.json   
292  ./train_data/UCD2KoUc0f4Bv2Bz0mbOah8g  yi0HBexLP1k.json   

                                               content  
0    [{'video_id': '0feSmsxPxJU', 'video_title': '#...  
1    [{'video_id': '0qUN__KR2Yo', 'video_title': '哇...  
2    [{'video_id': '58hgqzKp9Ww', 'video_title': '挑...  
3  