<a href="https://colab.research.google.com/github/Besiroglu/AI-dict/blob/master/CEG_chinchilla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from scipy.optimize import basinhopping
import pandas as pd

# Kaplan-style loss, parameterized by dataset size D and a fixed compute budget
def L_kaplan_with_compute(D, compute, alpha_N=0.076, alpha_D=0.103, N_c=6.4e13, D_c=1.8e13):
    """Return the Kaplan-form loss L(N, D) with N implied by the compute budget.

    The parameter count is derived from the budget as ``N = compute / (6 * D)``
    and the loss is ``((N_c / N)**(alpha_N / alpha_D) + D_c / D)**alpha_D``.

    Args:
        D: Dataset size (scalar or NumPy array).
        compute: Compute budget shared between N and D.
        alpha_N, alpha_D: Exponents of the model-size and data-size terms.
        N_c, D_c: Scale constants of the model-size and data-size terms.

    Returns:
        The loss value, with the same shape as ``D``.
    """
    n_params = compute/(6*D)
    model_part = (N_c / n_params)**(alpha_N / alpha_D)
    data_part = D_c / D
    return (model_part + data_part)**alpha_D

# Chinchilla-form loss, parameterized by dataset size D and a fixed compute budget
def L_chinchilla_with_compute(D, compute, E=1.69, A=406.4, B=410.7, alpha=0.34, beta=0.28):
    """Return the Chinchilla-form loss ``E + A / N**alpha + B / D**beta``.

    The parameter count is derived from the budget as ``N = compute / (6 * D)``,
    so for a fixed ``compute`` the loss is a function of ``D`` alone.

    Args:
        D: Dataset size (scalar or NumPy array).
        compute: Compute budget shared between N and D.
        E: Constant (irreducible) loss term.
        A: Coefficient of the model-size term.
        B: Coefficient of the data-size term.
        alpha: Exponent applied to N in the model-size term.
        beta: Exponent applied to D in the data-size term.

    Returns:
        The loss value, with the same shape as ``D``.
    """
    N = compute / (6 * D)
    return E + A / N**alpha + B / D**beta

# Model-size contribution to the Chinchilla-form loss
def L_model_term(D, compute, E=1.69, A=406.4, B=410.7, alpha=0.34):
    """Return only the model-size term ``A / N**alpha`` of the Chinchilla-form loss.

    The parameter count is derived from the budget as ``N = compute / (6 * D)``.

    Args:
        D: Dataset size (scalar or NumPy array).
        compute: Compute budget shared between N and D.
        E: Unused; kept so the signature parallels ``L_chinchilla_with_compute``.
        A: Coefficient of the model-size term.
        B: Unused; kept so the signature parallels ``L_chinchilla_with_compute``.
        alpha: Exponent applied to N.

    Returns:
        The model-size loss term, with the same shape as ``D``.
    """
    N = compute / (6 * D)
    return A / N**alpha

# Data-size contribution to the Chinchilla-form loss
def L_data_term(D, compute, E=1.69, A=406.4, B=410.7, beta=0.28):
    """Return only the data-size term ``B / D**beta`` of the Chinchilla-form loss.

    Args:
        D: Dataset size (scalar or NumPy array).
        compute: Unused (the data term does not depend on N); kept so the
            signature parallels ``L_model_term``.
        E: Unused; kept so the signature parallels ``L_chinchilla_with_compute``.
        A: Unused; kept so the signature parallels ``L_chinchilla_with_compute``.
        B: Coefficient of the data-size term.
        beta: Exponent applied to D.

    Returns:
        The data-size loss term, with the same shape as ``D``.
    """
    return B / D**beta

# Basin-hopping search over D for the Chinchilla-form loss at a fixed compute budget
def optimize_L_chinchilla(compute, initial_guess_D):
    """Minimize ``L_chinchilla_with_compute`` over D with basin-hopping.

    Reads the module-level names ``tolerance``, ``max_iterations`` and
    ``step_size``; they must be defined before this function is called.

    Args:
        compute: Compute budget forwarded to the loss as its second argument.
        initial_guess_D: Starting value for D. The search is bounded to
            ``[initial_guess_D / 10000, initial_guess_D * 10000]``.

    Returns:
        A tuple ``(x, fun)`` taken from the basin-hopping result.
    """
    # NOTE(review): stepsize is passed unscaled here, whereas the sweep script
    # scales it by the initial guess for D -- confirm which is intended.
    lower_D = initial_guess_D/10000
    upper_D = initial_guess_D*10000
    local_minimizer_settings = {
        'method': 'L-BFGS-B',
        'bounds': [(lower_D, upper_D)],
        'args': (compute,),  # forwarded to the objective after D
        'options': {'ftol': tolerance},
    }
    outcome = basinhopping(
        L_chinchilla_with_compute,
        initial_guess_D,
        minimizer_kwargs=local_minimizer_settings,
        niter=max_iterations,
        stepsize=step_size,
    )
    return outcome.x, outcome.fun

In [None]:
import math
import random

# Sweep over compute budgets. For each budget:
#   1. find the D that minimizes the Kaplan-form loss and the D that minimizes
#      the Chinchilla-form loss at that budget;
#   2. score the Kaplan-optimal D under the Chinchilla-form loss;
#   3. search for the (smaller) budget at which the Chinchilla-optimal
#      allocation reaches that same loss;
#   4. record CEG = compute / compute_needed.

# Column layout of the results table; one row is produced per compute budget.
result_columns = [
    'Compute', 'CEG', 'Optimal D (Chinchilla)', 'Optimal D (Kaplan)',
    'Chinchilla Loss', 'Compute_needed Chinchilla to match Kaplan',
    'Loss model term (On Kaplan-scaling)', 'Loss data term (On Kaplan-scaling)'
]
result_rows = []
results_df = pd.DataFrame(columns=result_columns)

# Geometric grid of budgets from 1e20 to 1e30 (roughly one point per doubling).
compute_values = np.geomspace(1e20, 1e30, num=int(np.log2(1e30/1e20))+1)

compute = compute_values[0]
initial_guess_D = 10*compute**0.5

step_size = 0.0005      # basin-hopping step, as a fraction of the initial guess for D
max_iterations = 10000  # cap for both basin-hopping and the compute search

tolerance = 1e-4  # Tolerance for the difference in loss


def _minimize_loss_over_D(loss_fn, budget, guess_D, lower_exponent, ftol):
    """Basin-hopping minimization of ``loss_fn(D, budget)`` over D.

    D is bounded to ``[budget**lower_exponent, budget**0.85]`` and the hop
    size is ``guess_D * step_size``. Returns the basin-hopping result object.
    """
    return basinhopping(
        loss_fn,
        guess_D,
        minimizer_kwargs={
            'method': 'L-BFGS-B',
            'bounds': [(budget**lower_exponent, budget**0.85)],
            'args': (budget,),  # forwarded to the objective after D
            'options': {'ftol': ftol},
        },
        niter=max_iterations,
        stepsize=guess_D*step_size,
    )


for compute in compute_values:
    print("compute budget", compute)

    # Optimal D under each scaling law at this budget.
    result_basinhopping_kaplan = _minimize_loss_over_D(
        L_kaplan_with_compute, compute, initial_guess_D, 0.2, tolerance/5)
    result_basinhopping_chinchilla = _minimize_loss_over_D(
        L_chinchilla_with_compute, compute, initial_guess_D, 0.25, tolerance/5)

    # Both allocations are scored with the Chinchilla-form loss. Values are
    # unwrapped to plain floats so the table holds scalars, not 1-element arrays.
    optimal_D_kaplan = float(result_basinhopping_kaplan.x[0])
    loss_kaplan = float(L_chinchilla_with_compute(optimal_D_kaplan, compute))
    loss_chinchilla_1 = float(
        L_chinchilla_with_compute(result_basinhopping_chinchilla.x[0], compute))

    print("optimal data Kaplan", result_basinhopping_kaplan.x)

    print("loss Kaplan-scale", loss_kaplan)
    print("loss Chinchilla-scale", loss_chinchilla_1)

    # Search for the budget at which the Chinchilla-optimal loss equals
    # loss_kaplan, starting from the full budget.
    compute_needed = float(compute)

    for iteration in range(max_iterations):
        initial_guess_D = compute_needed**0.5

        result_basinhopping_chinchilla = _minimize_loss_over_D(
            L_chinchilla_with_compute, compute_needed, initial_guess_D, 0.2, tolerance)
        optimal_D = float(result_basinhopping_chinchilla.x[0])

        # Evaluate at the candidate budget (not the original one): the question
        # is what loss the reduced budget achieves when allocated optimally.
        loss_chinchilla = float(L_chinchilla_with_compute(optimal_D, compute_needed))

        # Compare the chinchilla loss to the kaplan loss
        absolute_difference = abs(loss_chinchilla - loss_kaplan)
        if absolute_difference < tolerance*10:
            # If the loss is close enough, stop searching
            break

        # Multiplicative step: grows with the remaining gap, plus a small random
        # jitter. Computed before branching so both directions use a fresh value.
        delta = (1 + 10*(absolute_difference + random.uniform(0, 0.005)) + random.uniform(0, 0.01))
        if loss_chinchilla < loss_kaplan:
            # If the loss is too low, decrease the compute budget
            compute_needed /= delta
        else:
            # If the loss is too high, increase the compute budget
            compute_needed *= delta
    else:
        print(f"Warning: compute search did not converge within {max_iterations} iterations")

    # Print the final compute budget and the corresponding D
    print(f"Compute_needed: {compute_needed}")
    print(f"Optimal D: {optimal_D}")
    print(f"Chinchilla loss: {loss_chinchilla_1}")
    CEG = float(compute)/compute_needed
    # Warm-start the next budget's optimizations from this solution.
    initial_guess_D = optimal_D

    print("Loss Kaplan", loss_kaplan)
    print("Checking by plugging D_opt back into chinchilla to check it matches Kaplan loss", loss_chinchilla)
    print("CEG", CEG)

    # Decompose the loss at the Kaplan-optimal allocation into its two terms.
    L_data = float(L_data_term(optimal_D_kaplan, compute))
    L_model = float(L_model_term(optimal_D_kaplan, compute))

    # 'Optimal D (Chinchilla)' and 'Chinchilla Loss' refer to the matched
    # (reduced) budget compute_needed, not to the original budget.
    result_rows.append({
        'Compute': float(compute),
        'CEG': CEG,
        'Optimal D (Chinchilla)': optimal_D,
        'Optimal D (Kaplan)': optimal_D_kaplan,
        'Chinchilla Loss': loss_chinchilla,
        'Compute_needed Chinchilla to match Kaplan': compute_needed,
        'Loss model term (On Kaplan-scaling)': L_model,
        'Loss data term (On Kaplan-scaling)': L_data,
    })

    # Rebuild the table after every budget so partial results survive an
    # interrupted run (and to avoid concatenating onto an empty DataFrame).
    results_df = pd.DataFrame(result_rows, columns=result_columns)

compute budget 1e+20
optimal data Kaplan [1.01202056e+10]
loss Kaplan-scale [2.63747885]
loss Chinchilla-scale [2.59984975]


In [None]:
# Keep an independent copy of the sweep results; the following cells reassign results_df.
results_df_store = results_df.copy()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Assumes results_df is already defined (built by the compute sweep).

# Convert 'CEG' and 'Compute' columns to numeric values; errors='coerce' turns
# entries that cannot be converted into NaN instead of raising
results_df['CEG'] = pd.to_numeric(results_df['CEG'], errors='coerce')
results_df['Compute'] = pd.to_numeric(results_df['Compute'], errors='coerce')

# Drop rows with NaN values that resulted from conversion failure
results_df = results_df.dropna(subset=['CEG', 'Compute'])

# Natural log of 'Compute', and the CEG column, kept as named series
log_Compute = np.log(results_df['Compute'])
ceg = results_df['CEG']


# Plot CEG against compute on log-log axes
plt.figure(figsize=(10, 5))
plt.plot(results_df['Compute'], results_df['CEG'], label='Data Line', linestyle='-')  # Use linestyle='-' for a solid line
plt.xscale('log')  # Applying logarithmic scale to x-axis
plt.yscale('log')  # Applying logarithmic scale to y-axis

plt.xlabel('Compute')
plt.ylabel('Compute equivalent gain')
plt.title('Compute Budget Efficiency Gain (CEG) vs Compute Budget')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Side-by-side log-log panels: the model-size and data-size loss terms
# (evaluated on Kaplan-scaling) as functions of compute.
plt.figure(figsize=(14, 7))

panel_columns = (
    'Loss model term (On Kaplan-scaling)',  # left panel
    'Loss data term (On Kaplan-scaling)',   # right panel
)
for panel_index, column in enumerate(panel_columns, start=1):
    plt.subplot(1, 2, panel_index)
    # The column name doubles as legend label, y-axis label and title prefix.
    plt.plot(results_df['Compute'], results_df[column], 'o-', label=column)
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('Compute (FLOP)')
    plt.ylabel(column)
    plt.title(f'{column} vs Compute (FLOP)')
    plt.grid(True)
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Display the results table as the cell's output.
results_df