In [7]:
import numpy as np

def log_cosh_loss(y_true, y_pred):
    """Return the mean (rescaled) log-cosh loss between targets and predictions.

    The residual ``y_pred - y_true`` is multiplied by a scaling factor ``s``,
    passed through ``log(cosh(.))``, averaged over all elements, and the mean
    is divided by ``s`` again::

        mean(log(cosh(s * (y_pred - y_true)))) / s

    ``s`` is picked from the largest absolute value in ``y_true`` only:

    * ``> 1_000_000`` -> ``1e-7``
    * ``> 10_000``    -> ``1e-5``
    * ``> 100``       -> ``1e-3``
    * otherwise       -> ``1.0``

    Note that for ``s != 1`` the value is a rescaled variant of the loss and is
    not numerically equal to the plain ``mean(log(cosh(y_pred - y_true)))``.

    Args:
        y_true: Array-like of target values; converted to ``float64``.
        y_pred: Array-like of predicted values; converted to ``float64`` and
            combined with ``y_true`` using NumPy broadcasting.

    Returns:
        The loss as a ``numpy.float64`` scalar.

    Raises:
        ValueError: If ``y_true`` is empty (there is no maximum to derive the
            scaling factor from).
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)

    if y_true.size == 0:
        raise ValueError("log_cosh_loss requires a non-empty y_true")

    # Dynamic scaling factor, derived from y_true only, to keep the argument
    # of cosh small for large-magnitude data.
    data_range = np.max(np.abs(y_true))
    scaling_factor = 1.0
    for threshold, factor in ((1000000, 1e-7), (10000, 1e-5), (100, 1e-3)):
        if data_range > threshold:
            scaling_factor = factor
            break

    def _log_cosh(x):
        """Element-wise log(cosh(x)) that never feeds a large value to cosh."""
        abs_x = np.abs(x)
        use_asymptote = abs_x > 20
        # np.where evaluates both branches for every element, so cosh would
        # still overflow (and warn) on large inputs even though that result is
        # discarded. Substitute a harmless 0 wherever the asymptote is used.
        bounded = np.where(use_asymptote, 0.0, x)
        # For |x| > 20, log(cosh(x)) equals |x| - log(2) to float64 precision.
        return np.where(use_asymptote, abs_x - np.log(2), np.log(np.cosh(bounded)))

    # Evaluate the loss on the scaled residuals.
    loss = _log_cosh(scaling_factor * (y_pred - y_true))

    # Divide by the scaling factor to bring the mean back to the data's magnitude.
    return np.mean(loss) / scaling_factor

# Long series: the values 1..6 repeated seven times, plus one final element
# that is 7 in the targets and 700 in the predictions.
A = [1, 2, 3, 4, 5, 6] * 7 + [7]
B = A[:-1] + [700]
print(log_cosh_loss(A, B))

# Short series: 1..7, with the last prediction replaced by 70.
A = list(range(1, 8))
B = A[:-1] + [70]
print(log_cosh_loss(A, B))


16.100159367893955
8.900978974205723


In [1]:
# Manually extract and process the nested 'output' dictionary from each record
import json
import pandas as pd

# Path of the JSONL file to summarize (parsed as one JSON document per line).
file_path = './data/finetuning_training_data.jsonl'

# Parse every non-blank line into a dictionary. The context manager closes the
# file handle, and blank lines (e.g. a trailing newline) are skipped instead of
# crashing json.loads.
with open(file_path, 'r', encoding='utf-8') as jsonl_file:
    data = [json.loads(line) for line in jsonl_file if line.strip()]

# Extract the 'output' part of each record and build a DataFrame from it.
output_data = [record['output'] for record in data]
df = pd.DataFrame(output_data)

# Columns summarized by string length vs. columns summarized numerically.
string_fields = ["事故日期", "事發經過", "事故車出廠日期", "傷勢", "職業", "折舊方法", "被告肇責"]
int_fields = ["塗裝", "工資", "烤漆", "鈑金", "耐用年數", "修車費用", "醫療費用", "賠償金額總額", "保險給付金額", "居家看護天數", "居家看護費用", "每日居家看護金額"]


def _text_length(value):
    """Return len(str(value)), counting missing values as length 0."""
    return len(str(value)) if pd.notnull(value) else 0


# Average string length per string field. Series.map is applied column by
# column because DataFrame.applymap is deprecated in recent pandas releases.
string_averages = df[string_fields].apply(lambda column: column.map(_text_length)).mean()

# min / max / mean per numeric field; values that cannot be parsed as numbers
# are coerced to NaN and therefore ignored by describe().
int_distributions = df[int_fields].apply(pd.to_numeric, errors='coerce').describe().loc[['min', 'max', 'mean']]

string_averages, int_distributions


  string_averages = df[string_fields].applymap(lambda x: len(str(x)) if pd.notnull(x) else 0).mean()


(事故日期        0.986672
 事發經過       17.992332
 事故車出廠日期     0.452437
 傷勢          1.014972
 職業          0.063173
 折舊方法        0.284097
 被告肇責        0.057331
 dtype: float64,
             塗裝            工資           烤漆       鈑金      耐用年數          修車費用  \
 min    4860.00    122.000000   100.000000    800.0  3.000000    233.000000   
 max   11000.00  59262.000000  8682.000000  26000.0  5.000000  76532.000000   
 mean   8333.75   5320.107143  4320.666667  10665.5  4.623306  12110.166667   
 
              醫療費用         賠償金額總額  保險給付金額      居家看護天數    居家看護費用     每日居家看護金額  
 min    100.000000     683.000000  2515.0    6.000000   26000.0   714.000000  
 max   5063.000000  106931.000000  2515.0  150.000000  120000.0  2200.000000  
 mean  1107.611111   26176.857143  2515.0   47.964286   71000.0  1316.333333  )