In [1]:
import copy
import json
import os
import sys
import numpy as np
import random
import pandas as pd

import datasets
from datasets import Dataset
import torch
import transformers
import argparse


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_json_files(path) -> pd.DataFrame:
    '''
    Returns a dataframe of root directory, file names, and content(list) of json files
    '''
    roots = []
    file_names = []
    contents = []
    # Fast return if the path is a file
    if path.endswith('.json'):
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
        for record in content:
            if type(record['votes']) == str:
                if '萬' in record['votes']:
                    record['votes'] = int(record['votes'].replace('萬', '')) * 10000
        return pd.DataFrame({'root': [path], 'file_name': [path.split('/')[-1]], 'content': [content]})
    
    # Get the root, file names, and content
    for root, dirnames, filenames in os.walk(path):
        for file in filenames:
            if file.endswith('.json') and file != "star_record.json":
                with open(os.path.join(root,file), 'r', encoding='utf-8') as f:
                    content = json.load(f)
                for record in content:
                    if type(record['votes']) == str:
                        if '萬' in record['votes']:
                            record['votes'] = int(record['votes'].replace('萬', '')) * 10000
                roots.append(root)
                file_names.append(file)
                contents.append(content)
    return pd.DataFrame({'root': roots, 'file_name': file_names, 'content': contents})


In [3]:
def data_clean(file_info: pd.DataFrame, num_video_per_channel = None, seed = None) -> pd.DataFrame :
    '''
    Randomly select data from each file and return a new dataframe of training data
    '''
    random.seed(seed)
    moods = ['like','happiness','sadness','anger','fear','surprise','disgust']
    
    dataset_df = pd.DataFrame(columns = file_info.at[0,'content'][0].keys())
    channels = file_info['root'].unique()
    for channel in channels:
        videos = file_info.loc[file_info['root'] == channel, ['file_name','content']]
        if num_video_per_channel is not None:
            videos = videos.sample(n = num_video_per_channel, random_state = seed).reset_index(drop=True)
        for vid in videos.index:
            content = pd.DataFrame(videos.loc[vid,'content'])
            for mood in moods:
                if  content.loc[content['mood'] == mood].size < 1: 
                    continue
                pick_data = content.loc[content['mood'] == mood].sample(n = 1, random_state = None).reset_index(drop=True)
                dataset_df = pd.concat([dataset_df,pick_data], ignore_index=True)
    return dataset_df

In [4]:
def prepare_dataset(path, num_video_per_channel = None, seed = None):
    '''
    Returns a dataset of json files
    '''
    file_info = read_json_files(path)
    datalist = data_clean(file_info, num_video_per_channel, seed=seed)
    return Dataset.from_pandas(datalist)

In [5]:
def get_prompt(title:str, description:str, star_num:str, mood:str) -> str:
    '''Format the instruction as a prompt for LLM.'''

    comment_type = '正面評論' if star_num.split('.')[0] in ['4', '5'] else '負面評論' if star_num.split('.')[0] in ['1', '2'] else '中立評論'
    moods = ['like','happiness','sadness','anger','fear','surprise','disgust']
    ch_moods = ['喜歡','開心','難過','生氣','害怕','驚訝','厭惡']
    if mood in moods:
        mood = ch_moods[moods.index(mood)]
    
    return f"你是人工智慧助理，以下是用戶和人工智能助理之間的對話。你要對用戶的問題提供有用、詳細的回答。\
USER: 請幫這部影片生出對應需求的{comment_type}。影片標題:[{title}]。影片敘述:[{description}]。需求情感:[{mood}]。\
ASSISTANT:"


In [15]:
if __name__ == '__main__':
    PATH = "./data/train_data/"
    info = read_json_files(PATH)
    data = data_clean(info, num_video_per_channel=4)
    print(Dataset.from_pandas(data))


Dataset({
    features: ['video_id', 'video_title', 'video_description', 'cid', 'comment_text', 'votes', 'time', 'star_num', 'mood'],
    num_rows: 151
})


  dataset_df = pd.concat([dataset_df,pick_data], ignore_index=True)
