Licensed under the Apache License, Version 2.0

Based on https://github.com/google-research/google-research/blob/master/revisiting_neural_scaling_laws/revisiting_neural_scaling_laws.ipynb

In [None]:
# Run configuration. CUTOFF is forwarded to the data loaders; XMAX is only
# used to name the results directory in this file.
CUTOFF = -1
XMAX = -1

# A value of -1 is rendered as 'org' in the results directory name.
_xmax_tag = 'org' if XMAX == -1 else XMAX
_cutoff_tag = 'org' if CUTOFF == -1 else CUTOFF
SAVE_PATH = f"./results/x{_xmax_tag}_cutoff{_cutoff_tag}"

# Imports

In [None]:
import numpy as np
import json
import shutil
import matplotlib.pyplot as plt
import pandas as pd
from methods import m1, m2, m3, m4
from data.get_test_data import get_bench_data, get_DD_data, get_nano_data
import os

def get_error(slaw, x, y):
  """Score a fitted scaling-law estimator on held-out points.

  The score is the root-mean-square of the log-ratio between the predicted
  and the observed values (RMSLE).

  Args:
    slaw: fitted estimator exposing `predict_loss(x_i)`.
    x: 1d array containing data sizes.
    y: 1d array containing errors/losses.

  Returns:
    A pair `(rmsle, spread)`: `rmsle` is the square root of the mean squared
    log error; `spread` is how much that root grows when the standard error
    of the mean squared log error is added under the square root.
  """
  predictions = np.array([slaw.predict_loss(size) for size in x])
  sq_log_diff = (np.log(predictions) - np.log(y)) ** 2
  mean_sq = np.mean(sq_log_diff)
  rmsle = np.sqrt(mean_sq)
  # standard error of the mean squared log error
  stderr_of_mean = np.std(sq_log_diff) / (len(predictions) ** 0.5)
  return rmsle, np.sqrt(mean_sq + stderr_of_mean) - rmsle

def create_dir(dir_name):
  """Create an empty directory at `dir_name`, replacing any existing one.

  Args:
    dir_name: path of the directory to (re)create.

  Raises:
    OSError: if an existing tree cannot be removed, or the directory cannot
      be created (e.g. the parent directory does not exist).
  """
  # if dir exists, remove it and its contents
  try:
    shutil.rmtree(dir_name)
  except FileNotFoundError:
    # Nothing to clear. Any other failure is a real error and is allowed to
    # propagate instead of being hidden by a bare `except`.
    pass
  os.mkdir(dir_name)

# Seed NumPy's global RNG so repeated runs draw the same random numbers.
np.random.seed(2021)

# scaling_laws maps dataset key -> mode name -> fitted estimator.
# errors starts as an empty dict and is re-bound by the aggregation cells.
scaling_laws = {}
errors = {}

# Short aliases for the four estimator constructors.
M1, M2, M3, M4 = m1.Estimator, m2.Estimator, m3.Estimator, m4.Estimator

# make directory to save the fitted parameters
os.makedirs(SAVE_PATH, exist_ok=True)

## Load bench data

In [None]:
# Load the four benchmark collections fitted by the sections below (image
# classification, neural machine translation, language modelling, Big Bench)
# from ./data, forwarding the configured CUTOFF. Each collection is later
# iterated as (key, xc, yc, xt, yt) tuples.
IC_data, NMT_data, LM_data, BB_data = get_bench_data('./data', cutoff=CUTOFF)

## Image Classification

In [None]:
# Fit M1-M4 on every image-classification dataset, report the fitted
# parameters and extrapolation error, and save one plot per (dataset, mode).
IC_mode_params = {}
IC_log = []
os.makedirs(os.path.join(SAVE_PATH, 'IC'), exist_ok=True)

# For each mode: the estimator attributes to read after fitting, and the
# report line used to print them.
mode_report = {
    'M1': (('beta', 'c'),
           'beta, c =\t\t %.2f, %0.2f'),
    'M2': (('beta', 'c', 'err_inf'),
           'beta, c, err_inf =\t\t %.2f, %0.2f, %0.2f'),
    'M3': (('beta', 'c', 'gamma'),
           'beta, c, gamma =\t\t %.2f, %0.2f, %0.2f'),
    'M4': (('beta', 'c', 'alpha', 'err_inf'),
           'beta, c, alpha, err_inf =\t %.2f, %0.2f, %0.2f, %0.2f'),
}

# How each mode's estimator is constructed from the context points.
mode_builders = {
    'M1': M1,
    'M2': M2,
    'M3': M3,
    'M4': lambda values: M4(values, err_inf=None, err_0=1,
                            update_err_0=True, up_bound=1.0),
}

for i, (key, xc, yc, xt, yt) in enumerate(IC_data):
    print(f'Fitting {key} ({i+1}/{len(IC_data)})')
    xc, yc, xt, yt = np.array(xc), np.array(yc), np.array(xt), np.array(yt)
    fit_values = dict(zip(xc, yc))
    mode_params = {name: {} for name in mode_report}
    str_key = '_'.join(key).replace('/', '_')
    x = np.concatenate([xc, xt])

    # train all estimators
    scaling_laws[key] = {}
    rmsle_list = []
    for mode, (param_names, report_fmt) in mode_report.items():
        print(mode)
        law = mode_builders[mode](fit_values)
        scaling_laws[key][mode] = law

        # fit
        law.estimate_scaling_params(verbose=0, max_iterations=10_000)

        # report
        values = tuple(getattr(law, name) for name in param_names)
        print(report_fmt % values)
        mode_params[mode].update(zip(param_names, values))

        # record error
        rmsle_mean, rmsle_std = get_error(law, xt, yt)
        rmsle_mean, rmsle_std = rmsle_mean.item(), rmsle_std.item()
        rmsle_list.append(rmsle_mean)
        print('Extrapolation Loss =\t\t %.4f +- %.5f' % (rmsle_mean, rmsle_std))
        print()

        # save plot
        yp = np.array([law.predict_loss(xi) for xi in x])
        plt.figure()
        plt.plot(xc, yc, color='black', label='context', marker='o')
        plt.plot(xt, yt, color=[0.0, 0.925, 0.0], label="target", marker='o')
        plt.plot(x, yp, color='blue', label='pred')
        plt.vlines(max(xc), 0, 1.1, linewidth=0.5, color="k", label="cutoff", linestyle='--')

        plt.xscale("log")
        plt.ylim(0, 1.1)
        plt.xlabel("Training Data Size")
        plt.ylabel("Test Error Rate")
        # show mode params in title with 3f precision
        param_text = ', '.join(f'{k}={v:.3f}' for k, v in mode_params[mode].items())
        plt.title(f"{key} / {mode} / {rmsle_mean:.4f}\n{param_text}")
        # the curves carry labels, so actually render them
        plt.legend()
        plt.savefig(os.path.join(SAVE_PATH, 'IC', f'{str_key}_{mode}.png'))
        plt.show()
        plt.close()
    IC_mode_params[str(key)] = mode_params
    IC_log.append((*key, *rmsle_list))
    # NOTE: no `break` here - every dataset must be fitted so that the
    # aggregated AVG row covers the whole benchmark.

In [None]:
# Positions 3..6 of every log row hold the M1-M4 extrapolation errors.
errors = [row[3:7] for row in IC_log]
mean_per_mode = np.mean(errors, axis=0)
std_per_mode = np.std(errors, axis=0)
avg_cells = [f'{avg:.4f}+-{std:.4f}' for avg, std in zip(mean_per_mode, std_per_mode)]
IC_log.append(('IC', '', 'AVG', *avg_cells))

# change element type from float32 to float before writing JSON
for per_mode in IC_mode_params.values():
    for fitted in per_mode.values():
        for name in fitted:
            fitted[name] = float(fitted[name])

# Save the scaling law parameters as json
with open(f'{SAVE_PATH}/params_IC.json', 'w') as fh:
    json.dump(IC_mode_params, fh, indent=4)

# Save the errors as csv
error_columns = ['domain', 'task', 'model', 'M1', 'M2', 'M3', 'M4']
IC_df = pd.DataFrame(IC_log, columns=error_columns)
IC_df.to_csv(os.path.join(SAVE_PATH, 'errors_IC.csv'), index=False)

## Neural Machine Translation

In [None]:
# Fit M1-M4 on every neural-machine-translation dataset, report the fitted
# parameters and extrapolation error, and save one plot per (dataset, mode).
NMT_mode_params = {}
NMT_log = []
os.makedirs(os.path.join(SAVE_PATH, 'NMT'), exist_ok=True)

# For each mode: the estimator attributes to read after fitting, and the
# report line used to print them.
mode_report = {
    'M1': (('beta', 'c'),
           'beta, c =\t\t %.2f, %0.2f'),
    'M2': (('beta', 'c', 'err_inf'),
           'beta, c, err_inf =\t\t %.2f, %0.2f, %0.2f'),
    'M3': (('beta', 'c', 'gamma'),
           'beta, c, gamma =\t\t %.2f, %0.2f, %0.2f'),
    'M4': (('beta', 'c', 'alpha', 'err_inf'),
           'beta, c, alpha, err_inf =\t %.2f, %0.2f, %0.2f, %0.2f'),
}

# How each mode's estimator is constructed from the context points.
mode_builders = {
    'M1': M1,
    'M2': M2,
    'M3': M3,
    # no upper bound since this is log-perplexity
    'M4': lambda values: M4(values, err_0=1.0, update_err_0=True,
                            up_bound=None),
}

for i, (key, xc, yc, xt, yt) in enumerate(NMT_data):
    print(f'Fitting {key} ({i+1}/{len(NMT_data)})')
    xc, yc, xt, yt = np.array(xc), np.array(yc), np.array(xt), np.array(yt)
    fit_values = dict(zip(xc, yc))
    mode_params = {name: {} for name in mode_report}
    str_key = '_'.join(key).replace('/', '_')
    x = np.concatenate([xc, xt])
    # The metric is unbounded, so grow the y-axis beyond the default 1.1
    # whenever the observed values would otherwise be clipped.
    y_top = max(1.1, 1.05 * max(yc.max(), yt.max()))

    # train all estimators
    scaling_laws[key] = {}
    rmsle_list = []
    for mode, (param_names, report_fmt) in mode_report.items():
        print(mode)
        law = mode_builders[mode](fit_values)
        scaling_laws[key][mode] = law

        # fit
        law.estimate_scaling_params(verbose=0, max_iterations=10_000)

        # report
        values = tuple(getattr(law, name) for name in param_names)
        print(report_fmt % values)
        mode_params[mode].update(zip(param_names, values))

        # record error
        rmsle_mean, rmsle_std = get_error(law, xt, yt)
        rmsle_mean, rmsle_std = rmsle_mean.item(), rmsle_std.item()
        rmsle_list.append(rmsle_mean)
        print('Extrapolation Loss =\t\t %.4f +- %.5f' % (rmsle_mean, rmsle_std))
        print()

        # save plot
        yp = np.array([law.predict_loss(xi) for xi in x])
        plt.figure()
        plt.plot(xc, yc, color='black', label='context', marker='o')
        plt.plot(xt, yt, color=[0.0, 0.925, 0.0], label="target", marker='o')
        plt.plot(x, yp, color='blue', label='pred')
        plt.vlines(max(xc), 0, y_top, linewidth=0.5, color="k", label="cutoff", linestyle='--')

        plt.xscale("log")
        plt.ylim(0, y_top)
        plt.xlabel("Training Data Size")
        # this domain reports a loss, not an error rate
        plt.ylabel("Test Loss (log-perplexity)")
        # show mode params in title with 3f precision
        param_text = ', '.join(f'{k}={v:.3f}' for k, v in mode_params[mode].items())
        plt.title(f"{key} / {mode} / {rmsle_mean:.4f}\n{param_text}")
        # the curves carry labels, so actually render them
        plt.legend()
        plt.savefig(os.path.join(SAVE_PATH, 'NMT', f'{str_key}_{mode}.png'))
        plt.show()
        plt.close()
    NMT_mode_params[str(key)] = mode_params
    NMT_log.append((*key, *rmsle_list))
    # NOTE: no `break` here - every dataset must be fitted so that the
    # aggregated AVG row covers the whole benchmark.

In [None]:
# Positions 3..6 of every log row hold the M1-M4 extrapolation errors.
errors = [row[3:7] for row in NMT_log]
mean_per_mode = np.mean(errors, axis=0)
std_per_mode = np.std(errors, axis=0)
avg_cells = [f'{avg:.4f}+-{std:.4f}' for avg, std in zip(mean_per_mode, std_per_mode)]
NMT_log.append(('NMT', '', 'AVG', *avg_cells))

# change element type from float32 to float before writing JSON
for per_mode in NMT_mode_params.values():
    for fitted in per_mode.values():
        for name in fitted:
            fitted[name] = float(fitted[name])

# Save the scaling law parameters as json
with open(f'{SAVE_PATH}/params_NMT.json', 'w') as fh:
    json.dump(NMT_mode_params, fh, indent=4)

# Save the errors as csv
error_columns = ['domain', 'task', 'model', 'M1', 'M2', 'M3', 'M4']
NMT_df = pd.DataFrame(NMT_log, columns=error_columns)
NMT_df.to_csv(os.path.join(SAVE_PATH, 'errors_NMT.csv'), index=False)

## Language Model

In [None]:
# Fit M1-M4 on every language-modelling dataset, report the fitted
# parameters and extrapolation error, and save one plot per (dataset, mode).
LM_mode_params = {}
LM_log = []
os.makedirs(os.path.join(SAVE_PATH, 'LM'), exist_ok=True)

# For each mode: the estimator attributes to read after fitting, and the
# report line used to print them.
mode_report = {
    'M1': (('beta', 'c'),
           'beta, c =\t\t %.2f, %0.2f'),
    'M2': (('beta', 'c', 'err_inf'),
           'beta, c, err_inf =\t\t %.2f, %0.2f, %0.2f'),
    'M3': (('beta', 'c', 'gamma'),
           'beta, c, gamma =\t\t %.2f, %0.2f, %0.2f'),
    'M4': (('beta', 'c', 'alpha', 'err_inf'),
           'beta, c, alpha, err_inf =\t %.2f, %0.2f, %0.2f, %0.2f'),
}

# How each mode's estimator is constructed from the context points.
mode_builders = {
    'M1': M1,
    'M2': M2,
    'M3': M3,
    # no upper bound since this is cross-entropy loss
    'M4': lambda values: M4(values, err_0=1.0, update_err_0=True,
                            up_bound=None),
}

for i, (key, xc, yc, xt, yt) in enumerate(LM_data):
    print(f'Fitting {key} ({i+1}/{len(LM_data)})')
    xc, yc, xt, yt = np.array(xc), np.array(yc), np.array(xt), np.array(yt)
    fit_values = dict(zip(xc, yc))
    mode_params = {name: {} for name in mode_report}
    str_key = '_'.join(key).replace('/', '_')
    x = np.concatenate([xc, xt])
    # The metric is unbounded, so grow the y-axis beyond the default 1.1
    # whenever the observed values would otherwise be clipped.
    y_top = max(1.1, 1.05 * max(yc.max(), yt.max()))

    # train all estimators
    scaling_laws[key] = {}
    rmsle_list = []
    for mode, (param_names, report_fmt) in mode_report.items():
        print(mode)
        law = mode_builders[mode](fit_values)
        scaling_laws[key][mode] = law

        # fit
        law.estimate_scaling_params(verbose=0, max_iterations=10_000)

        # report
        values = tuple(getattr(law, name) for name in param_names)
        print(report_fmt % values)
        mode_params[mode].update(zip(param_names, values))

        # record error
        rmsle_mean, rmsle_std = get_error(law, xt, yt)
        rmsle_mean, rmsle_std = rmsle_mean.item(), rmsle_std.item()
        rmsle_list.append(rmsle_mean)
        print('Extrapolation Loss =\t\t %.4f +- %.5f' % (rmsle_mean, rmsle_std))
        print()

        # save plot
        yp = np.array([law.predict_loss(xi) for xi in x])
        plt.figure()
        plt.plot(xc, yc, color='black', label='context', marker='o')
        plt.plot(xt, yt, color=[0.0, 0.925, 0.0], label="target", marker='o')
        plt.plot(x, yp, color='blue', label='pred')
        plt.vlines(max(xc), 0, y_top, linewidth=0.5, color="k", label="cutoff", linestyle='--')

        plt.xscale("log")
        plt.ylim(0, y_top)
        plt.xlabel("Training Data Size")
        # this domain reports a loss, not an error rate
        plt.ylabel("Test Loss (cross-entropy)")
        # show mode params in title with 3f precision
        param_text = ', '.join(f'{k}={v:.3f}' for k, v in mode_params[mode].items())
        plt.title(f"{key} / {mode} / {rmsle_mean:.4f}\n{param_text}")
        # the curves carry labels, so actually render them
        plt.legend()
        plt.savefig(os.path.join(SAVE_PATH, 'LM', f'{str_key}_{mode}.png'))
        plt.show()
        plt.close()
    LM_mode_params[str(key)] = mode_params
    LM_log.append((*key, *rmsle_list))
    # NOTE: no `break` here - every dataset must be fitted so that the
    # aggregated AVG row covers the whole benchmark.

In [None]:
# Positions 3..6 of every log row hold the M1-M4 extrapolation errors.
errors = [row[3:7] for row in LM_log]
mean_per_mode = np.mean(errors, axis=0)
std_per_mode = np.std(errors, axis=0)
avg_cells = [f'{avg:.4f}+-{std:.4f}' for avg, std in zip(mean_per_mode, std_per_mode)]
LM_log.append(('LM', '', 'AVG', *avg_cells))

# change element type from float32 to float before writing JSON
for per_mode in LM_mode_params.values():
    for fitted in per_mode.values():
        for name in fitted:
            fitted[name] = float(fitted[name])

# Save the scaling law parameters as json
with open(f'{SAVE_PATH}/params_LM.json', 'w') as fh:
    json.dump(LM_mode_params, fh, indent=4)

# Save the errors as csv
error_columns = ['domain', 'task', 'model', 'M1', 'M2', 'M3', 'M4']
LM_df = pd.DataFrame(LM_log, columns=error_columns)
LM_df.to_csv(os.path.join(SAVE_PATH, 'errors_LM.csv'), index=False)

## Big Bench

In [None]:
# Fit M1-M4 on every Big Bench dataset, report the fitted parameters and
# extrapolation error, and save one plot per (dataset, mode).
BB_mode_params = {}
BB_log = []
os.makedirs(os.path.join(SAVE_PATH, 'BB'), exist_ok=True)

# For each mode: the estimator attributes to read after fitting, and the
# report line used to print them.
mode_report = {
    'M1': (('beta', 'c'),
           'beta, c =\t\t %.2f, %0.2f'),
    'M2': (('beta', 'c', 'err_inf'),
           'beta, c, err_inf =\t\t %.2f, %0.2f, %0.2f'),
    'M3': (('beta', 'c', 'gamma'),
           'beta, c, gamma =\t\t %.2f, %0.2f, %0.2f'),
    'M4': (('beta', 'c', 'alpha', 'err_inf'),
           'beta, c, alpha, err_inf =\t %.2f, %0.2f, %0.2f, %0.2f'),
}

# How each mode's estimator is constructed from the context points.
mode_builders = {
    'M1': M1,
    'M2': M2,
    'M3': M3,
    'M4': lambda values: M4(values, err_inf=None, err_0=1.001,
                            update_err_0=True, up_bound=1.0),
}

for i, (key, xc, yc, xt, yt) in enumerate(BB_data):
    print(f'Fitting {key} ({i+1}/{len(BB_data)})')
    xc, yc, xt, yt = np.array(xc), np.array(yc), np.array(xt), np.array(yt)
    fit_values = dict(zip(xc, yc))
    mode_params = {name: {} for name in mode_report}
    str_key = '_'.join(key).replace('/', '_')
    x = np.concatenate([xc, xt])

    # train all estimators
    scaling_laws[key] = {}
    rmsle_list = []
    for mode, (param_names, report_fmt) in mode_report.items():
        print(mode)
        law = mode_builders[mode](fit_values)
        scaling_laws[key][mode] = law

        # fit
        law.estimate_scaling_params(verbose=0, max_iterations=10_000)

        # report
        values = tuple(getattr(law, name) for name in param_names)
        print(report_fmt % values)
        mode_params[mode].update(zip(param_names, values))

        # record error
        rmsle_mean, rmsle_std = get_error(law, xt, yt)
        rmsle_mean, rmsle_std = rmsle_mean.item(), rmsle_std.item()
        rmsle_list.append(rmsle_mean)
        print('Extrapolation Loss =\t\t %.4f +- %.5f' % (rmsle_mean, rmsle_std))
        print()

        # save plot
        yp = np.array([law.predict_loss(xi) for xi in x])
        plt.figure()
        plt.plot(xc, yc, color='black', label='context', marker='o')
        plt.plot(xt, yt, color=[0.0, 0.925, 0.0], label="target", marker='o')
        plt.plot(x, yp, color='blue', label='pred')
        plt.vlines(max(xc), 0, 1.1, linewidth=0.5, color="k", label="cutoff", linestyle='--')

        plt.xscale("log")
        plt.ylim(0, 1.1)
        plt.xlabel("Training Data Size")
        plt.ylabel("Test Error Rate")
        # show mode params in title with 3f precision
        param_text = ', '.join(f'{k}={v:.3f}' for k, v in mode_params[mode].items())
        plt.title(f"{key} / {mode} / {rmsle_mean:.4f}\n{param_text}")
        # the curves carry labels, so actually render them
        plt.legend()
        plt.savefig(os.path.join(SAVE_PATH, 'BB', f'{str_key}_{mode}.png'))
        plt.show()
        plt.close()
    BB_mode_params[str(key)] = mode_params
    BB_log.append((*key, *rmsle_list))
    # NOTE: no `break` here - every dataset must be fitted so that the
    # aggregated AVG row covers the whole benchmark.

In [None]:
# Positions 3..6 of every log row hold the M1-M4 extrapolation errors.
errors = [row[3:7] for row in BB_log]
mean_per_mode = np.mean(errors, axis=0)
std_per_mode = np.std(errors, axis=0)
avg_cells = [f'{avg:.4f}+-{std:.4f}' for avg, std in zip(mean_per_mode, std_per_mode)]
BB_log.append(('BB', '', 'AVG', *avg_cells))

# change element type from float32 to float before writing JSON
for per_mode in BB_mode_params.values():
    for fitted in per_mode.values():
        for name in fitted:
            fitted[name] = float(fitted[name])

# Save the scaling law parameters as json
with open(f'{SAVE_PATH}/params_BB.json', 'w') as fh:
    json.dump(BB_mode_params, fh, indent=4)

# Save the errors as csv
error_columns = ['domain', 'task', 'model', 'M1', 'M2', 'M3', 'M4']
BB_df = pd.DataFrame(BB_log, columns=error_columns)
BB_df.to_csv(os.path.join(SAVE_PATH, 'errors_BB.csv'), index=False)

## Double Descent

In [None]:
# Load the double-descent datasets from ./data, forwarding the configured
# CUTOFF. DD_data is iterated as (key, xc, yc, xt, yt) tuples; DD_labels is
# indexed by task name and yields the (x_label, y_label) pair used for plots.
DD_data, DD_labels = get_DD_data('./data', cutoff=CUTOFF)

In [None]:
# Fit M1-M4 on every double-descent dataset (on targets normalised by their
# maximum), report the fitted parameters and extrapolation error, and save
# one plot per (dataset, mode).
DD_mode_params = {}
DD_log = []
os.makedirs(os.path.join(SAVE_PATH, 'DD'), exist_ok=True)

# For each mode: the estimator attributes to read after fitting, and the
# report line used to print them.
mode_report = {
    'M1': (('beta', 'c'),
           'beta, c =\t\t %.2f, %0.2f'),
    'M2': (('beta', 'c', 'err_inf'),
           'beta, c, err_inf =\t\t %.2f, %0.2f, %0.2f'),
    'M3': (('beta', 'c', 'gamma'),
           'beta, c, gamma =\t\t %.2f, %0.2f, %0.2f'),
    'M4': (('beta', 'c', 'alpha', 'err_inf'),
           'beta, c, alpha, err_inf =\t %.2f, %0.2f, %0.2f, %0.2f'),
}

# How each mode's estimator is constructed from the context points.
# M3 is built with its defaults (an earlier variant passed use_epsilon=True).
mode_builders = {
    'M1': M1,
    'M2': M2,
    'M3': M3,
    'M4': lambda values: M4(values, err_inf=None, err_0=1.001,
                            update_err_0=True, up_bound=1.0),
}

for i, (key, xc, yc, xt, yt) in enumerate(DD_data):
    task = key[1]
    print(f'Fitting {task} ({i+1}/{len(DD_data)})')
    xc, yc, xt, yt = np.array(xc), np.array(yc), np.array(xt), np.array(yt)

    # Normalise the targets by the largest observed value; the estimators
    # are fitted, and therefore predict, on this normalised scale.
    y_max = max(np.max(yc), np.max(yt))
    yc_norm = yc / y_max
    yt_norm = yt / y_max

    fit_values = dict(zip(xc, yc_norm))
    mode_params = {name: {} for name in mode_report}
    x = np.concatenate([xc, xt])

    # Plot extents on the original (un-normalised) scale. Kept under their
    # own names so the normalisation factor y_max is never re-bound.
    x_lo = min(xc.min().item(), xt.min().item())
    x_hi = max(xc.max().item(), xt.max().item())
    y_lo = min(yc.min().item(), yt.min().item())
    y_hi = max(yc.max().item(), yt.max().item())

    # train all estimators
    scaling_laws[key] = {}
    rmsle_list = []
    for mode, (param_names, report_fmt) in mode_report.items():
        print(mode)
        law = mode_builders[mode](fit_values)
        scaling_laws[key][mode] = law

        # fit
        law.estimate_scaling_params(verbose=0, max_iterations=10_000)

        # report
        values = tuple(getattr(law, name) for name in param_names)
        print(report_fmt % values)
        mode_params[mode].update(zip(param_names, values))

        # record error: predictions are on the normalised scale, so they must
        # be compared with the normalised targets (comparing with raw yt
        # shifts every log error by log(y_max)).
        rmsle_mean, rmsle_std = get_error(law, xt, yt_norm)
        rmsle_mean, rmsle_std = rmsle_mean.item(), rmsle_std.item()
        rmsle_list.append(rmsle_mean)
        print('Extrapolation Loss =\t\t %.4f +- %.5f' % (rmsle_mean, rmsle_std))
        print()

        # save plot (predictions mapped back to the original scale)
        yp = np.array([law.predict_loss(xi) for xi in x])
        yp_unnorm = yp * y_max

        plt.figure()
        plt.plot(xc, yc, color='black', label='context', marker='o')
        plt.plot(xt, yt, color=[0.0, 0.925, 0.0], label="target", marker='o')
        plt.plot(x, yp_unnorm, color='blue', label='pred')
        plt.vlines(max(xc), y_lo*.9, y_hi*1.05, linewidth=0.5, color="k", label="cutoff", linestyle='--')
        plt.xlim(x_lo*.865, x_hi*1.05)
        plt.ylim(y_lo*.9, y_hi*1.05)
        x_label, y_label = DD_labels[task]
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        # show mode params in title with 3f precision
        param_text = ', '.join(f'{k}={v:.3f}' for k, v in mode_params[mode].items())
        plt.title(f"{key} / {mode} / {rmsle_mean:.4f}\n{param_text}")
        # the curves carry labels, so actually render them
        plt.legend()
        plt.savefig(os.path.join(SAVE_PATH, 'DD', f'{task}_{mode}.png'))
        plt.show()
        plt.close()
    DD_mode_params[str(key)] = mode_params
    DD_log.append((*key, *rmsle_list))
    # NOTE: no `break` here - every dataset must be fitted so that the
    # aggregated AVG row covers the whole benchmark.

In [None]:
# Positions 3..6 of every log row hold the M1-M4 extrapolation errors.
errors = [row[3:7] for row in DD_log]
mean_per_mode = np.mean(errors, axis=0)
std_per_mode = np.std(errors, axis=0)
avg_cells = [f'{avg:.4f}+-{std:.4f}' for avg, std in zip(mean_per_mode, std_per_mode)]
DD_log.append(('DD', '', 'AVG', *avg_cells))

# change element type from float32 to float before writing JSON
for per_mode in DD_mode_params.values():
    for fitted in per_mode.values():
        for name in fitted:
            fitted[name] = float(fitted[name])

# Save the scaling law parameters as json
with open(f'{SAVE_PATH}/params_DD.json', 'w') as fh:
    json.dump(DD_mode_params, fh, indent=4)

# Save the errors as csv
error_columns = ['domain', 'task', 'model', 'M1', 'M2', 'M3', 'M4']
DD_df = pd.DataFrame(DD_log, columns=error_columns)
DD_df.to_csv(os.path.join(SAVE_PATH, 'errors_DD.csv'), index=False)

## Nano

In [None]:
# Load the nano datasets from ./data, forwarding the configured CUTOFF.
# nano_data is iterated below as (key, xc, yc, xt, yt) tuples.
nano_data = get_nano_data('./data', cutoff=CUTOFF)

In [None]:
# Fit M1-M4 on every nano dataset (on targets normalised by their maximum),
# report the fitted parameters and extrapolation error, and save one plot
# per (dataset, mode).
nano_mode_params = {}
nano_log = []
os.makedirs(os.path.join(SAVE_PATH, 'Nano'), exist_ok=True)

# For each mode: the estimator attributes to read after fitting, and the
# report line used to print them.
mode_report = {
    'M1': (('beta', 'c'),
           'beta, c =\t\t %.2f, %0.2f'),
    'M2': (('beta', 'c', 'err_inf'),
           'beta, c, err_inf =\t\t %.2f, %0.2f, %0.2f'),
    'M3': (('beta', 'c', 'gamma'),
           'beta, c, gamma =\t\t %.2f, %0.2f, %0.2f'),
    'M4': (('beta', 'c', 'alpha', 'err_inf'),
           'beta, c, alpha, err_inf =\t %.2f, %0.2f, %0.2f, %0.2f'),
}

# How each mode's estimator is constructed from the context points.
# M3 is built with its defaults (an earlier variant passed use_epsilon=True).
mode_builders = {
    'M1': M1,
    'M2': M2,
    'M3': M3,
    'M4': lambda values: M4(values, err_inf=None, err_0=1.001,
                            update_err_0=True, up_bound=1.0),
}

for i, (key, xc, yc, xt, yt) in enumerate(nano_data):
    task = key[1]
    print(f'Fitting {task} ({i+1}/{len(nano_data)})')
    xc, yc, xt, yt = np.array(xc), np.array(yc), np.array(xt), np.array(yt)

    # Normalise the targets by the largest observed value; the estimators
    # are fitted, and therefore predict, on this normalised scale.
    y_max = max(np.max(yc), np.max(yt))
    yc_norm = yc / y_max
    yt_norm = yt / y_max

    fit_values = dict(zip(xc, yc_norm))
    mode_params = {name: {} for name in mode_report}
    x = np.concatenate([xc, xt])

    # Plot extents on the original (un-normalised) scale. Kept under their
    # own names so the normalisation factor y_max is never re-bound.
    x_lo = min(xc.min().item(), xt.min().item())
    x_hi = max(xc.max().item(), xt.max().item())
    y_lo = min(yc.min().item(), yt.min().item())
    y_hi = max(yc.max().item(), yt.max().item())

    # train all estimators
    scaling_laws[key] = {}
    rmsle_list = []
    for mode, (param_names, report_fmt) in mode_report.items():
        print(mode)
        law = mode_builders[mode](fit_values)
        scaling_laws[key][mode] = law

        # fit
        law.estimate_scaling_params(verbose=0, max_iterations=10_000)

        # report
        values = tuple(getattr(law, name) for name in param_names)
        print(report_fmt % values)
        mode_params[mode].update(zip(param_names, values))

        # record error: predictions are on the normalised scale, so they must
        # be compared with the normalised targets (comparing with raw yt
        # shifts every log error by log(y_max)).
        rmsle_mean, rmsle_std = get_error(law, xt, yt_norm)
        rmsle_mean, rmsle_std = rmsle_mean.item(), rmsle_std.item()
        rmsle_list.append(rmsle_mean)
        print('Extrapolation Loss =\t\t %.4f +- %.5f' % (rmsle_mean, rmsle_std))
        print()

        # save plot (predictions mapped back to the original scale)
        yp = np.array([law.predict_loss(xi) for xi in x])
        yp_unnorm = yp * y_max

        plt.figure()
        plt.plot(xc, yc, color='black', label='context', marker='o')
        plt.plot(xt, yt, color=[0.0, 0.925, 0.0], label="target", marker='o')
        plt.plot(x, yp_unnorm, color='blue', label='pred')
        plt.vlines(max(xc), y_lo*.9, y_hi*1.05, linewidth=0.5, color="k", label="cutoff", linestyle='--')
        plt.xscale("log")
        plt.xlim(x_lo*.865, x_hi*1.05)
        plt.ylim(y_lo*.9, y_hi*1.05)
        plt.xlabel('n_embed')
        plt.ylabel('val_loss')
        # show mode params in title with 3f precision
        param_text = ', '.join(f'{k}={v:.3f}' for k, v in mode_params[mode].items())
        plt.title(f"{key} / {mode} / {rmsle_mean:.4f}\n{param_text}")
        # the curves carry labels, so actually render them
        plt.legend()
        plt.savefig(os.path.join(SAVE_PATH, 'Nano', f'{task}_{mode}.png'))
        plt.show()
        plt.close()
    nano_mode_params[str(key)] = mode_params
    nano_log.append((*key, *rmsle_list))
    # NOTE: no `break` here - every dataset must be fitted so that the
    # aggregated AVG row covers the whole benchmark.

In [None]:
# Positions 3..6 of every log row hold the M1-M4 extrapolation errors.
errors = [row[3:7] for row in nano_log]
mean_per_mode = np.mean(errors, axis=0)
std_per_mode = np.std(errors, axis=0)
avg_cells = [f'{avg:.4f}+-{std:.4f}' for avg, std in zip(mean_per_mode, std_per_mode)]
nano_log.append(('nano', '', 'AVG', *avg_cells))

# change element type from float32 to float before writing JSON
for per_mode in nano_mode_params.values():
    for fitted in per_mode.values():
        for name in fitted:
            fitted[name] = float(fitted[name])

# Save the scaling law parameters as json
with open(f'{SAVE_PATH}/params_nano.json', 'w') as fh:
    json.dump(nano_mode_params, fh, indent=4)

# Save the errors as csv
error_columns = ['domain', 'task', 'model', 'M1', 'M2', 'M3', 'M4']
nano_df = pd.DataFrame(nano_log, columns=error_columns)
nano_df.to_csv(os.path.join(SAVE_PATH, 'errors_nano.csv'), index=False)

In [None]:
# Combine the per-domain error tables into one CSV: all per-dataset rows
# first, followed by each domain's final (AVG) row.
dataframes = [IC_df, BB_df, LM_df, NMT_df, DD_df, nano_df]

# Split every table into its body (all but the last row) and its last row.
main_parts = []
last_rows = []
for df in dataframes:
    main_parts.append(df.iloc[:-1])
    last_rows.append(df.iloc[-1:])

# Stack the bodies first, then append the last rows at the bottom.
merged_df = pd.concat(main_parts, ignore_index=True)
merged_df = pd.concat([merged_df, *last_rows], ignore_index=True)

# The output file name depends on whether a cutoff was configured.
if CUTOFF == -1:
    merged_name = "errors.csv"
else:
    merged_name = f"error_cutoff{CUTOFF}.csv"
merged_df.to_csv(os.path.join(SAVE_PATH, merged_name), index=False)
