In [1]:
# import packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import get_model_memory
import get_resources_info
from Scraper import scrape

'''
Use get_resources_info.py to detect whether a GPU is present and how much memory is available.
Use scrape.py to update the leaderboard and return the sorted model list.
Use the available memory to find the best model that fits.
Deploy the selected model.
'''

# get_resources_info.py
# Probe the local machine and record the memory budget used later to filter models.
print(f'{"="*110}\nMeasuring Local Resources...\n')
local_resources = get_resources_info.get_resources_info()

# Slots 5 and 6 of the result hold total / available system memory. They are
# always recorded so the CPU figures exist even when a GPU is present.
mem_total, mem_available = local_resources[5], local_resources[6]

# Slot 7 holds the GPU list, or None when no GPU was detected. Truthiness also
# treats an empty list as "no GPU" instead of failing on [0] below.
gpu_info = local_resources[7]
have_gpu = bool(gpu_info)
if have_gpu:
    # each entry: [gpu_id, gpu_name, gpu_memory_total, gpu_memory_free, gpu_load]
    # With several GPUs, use the one with the most free memory so that
    # gpu_memory_free is always bound for the later cells.
    # TODO: address multiple GPU cases properly (e.g. sharding across devices)
    best_gpu = max(gpu_info, key=lambda gpu: gpu[3])
    gpu_name, gpu_memory_total, gpu_memory_free = best_gpu[1], best_gpu[2], best_gpu[3]
print(f'\nMeasuring Local Resources Finish!\n{"="*110}\n\n')
    
# scrape.py
# Only the progress banners are printed here; the scrape call itself is
# commented out.
print("=" * 110)
print('Scraping HuggingFace open-llm-leaderboard (https://huggingfaceh4-open-llm-leaderboard.hf.space)...')
# scrape.scrape() # update leaderboard
print()
print('Scraping HuggingFace open-llm-leaderboard Finish!')
print('See best-models.txt and best-models-deduplicate.txt for best models for each size and each kind.')
print("=" * 110, end='\n\n\n')

print("=" * 110)
print('Data Cleaning starts...')

# Read the full leaderboard export from disk.
data = pd.read_csv('open-llm-leaderboard.csv')

# Manual corrections for parameter counts that are wrong in the leaderboard:
# microsoft/phi-1_5 and roneneldan/TinyStories-1M.
data.loc[data['Model_name_for_query'].eq('microsoft/phi-1_5'), '#Params (B)'] = 1.3
data.loc[data['Model_name_for_query'].eq('roneneldan/TinyStories-1M'), '#Params (B)'] = 0.001

# Discard every model whose reported size is 0.
data = data[data['#Params (B)'].ne(0)]

# LaMini-GPT-774M is relabelled as a fine-tuned model.
data.loc[data['Model_name_for_query'].eq('MBZUAI/LaMini-GPT-774M'), 'Type'] = "fine-tuned"

print()
print('Data Cleaning Finish!')
print("=" * 110, end='\n\n\n')

Measuring Local Resources...

---------------------------------------- CPU Info ----------------------------------------
Physical cores: 10
Total cores: 10
CPU frequency information not available via psutil, trying sysctl...

Unexpected output format from sysctl.
---------------------------------------- Memory Information ----------------------------------------
Total: 16.00GB
Available: 6.25GB
No NVIDIA GPU detected
---------------------------------------- Python Version ----------------------------------------
3.11.5
---------------------------------------- PyTorch Version ----------------------------------------
2.1.0
---------------------------------------- TensorFlow Version ----------------------------------------
2.14.0

Measuring Local Resources Finish!


Scraping HuggingFace open-llm-leaderboard (https://huggingfaceh4-open-llm-leaderboard.hf.space)...

Scraping HuggingFace open-llm-leaderboard Finish!
See best-models.txt and best-models-deduplicate.txt for best models for each

In [4]:
# Keep only the rows whose 'Available_on_the_hub' flag equals True.
data = data[data['Available_on_the_hub'].eq(True)]

In [6]:
# Show the leaderboard row(s) for microsoft/phi-1_5.
data[data['Model_name_for_query'].eq("microsoft/phi-1_5")]

Unnamed: 0,Type,Model_repo,Model_experiment_details,Average,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,DROP,Precision,Hub_License,#Params (B),Hub_like,Available_on_the_hub,Model_sha,Model_name_for_query
1025,pretrained,https://huggingface.co/microsoft/phi-1_5,https://huggingface.co/datasets/open-llm-leade...,41.6,52.9,63.79,43.89,40.89,72.22,12.43,5.04,torch.bfloat16,other,1.3,916.0,True,ea95720a352172db6fcbcd89032bfb1cb8481797,microsoft/phi-1_5


In [44]:
# Prompt the user for the task configuration that drives the memory estimate.
# Choice prompts repeat until a valid option is typed. Numeric prompts are
# validated so that a typo cannot crash the cell.
# TODO: Refine the logic: skip character and more while loops
print(f'{"="*110}\nPlease input your desired task types...')
train_or_inference = input("Train or Inference: ")
while (train_or_inference not in {'inf', 'inf_vLLM', 'inf_ggml', 'trn'}):
    train_or_inference = input("Invalid input. Please pick from {'inf', 'inf_vLLM', 'inf_ggml', 'trn'}.\nTrain or Inference: ")
if train_or_inference == 'trn':
    train_method = input("Train method: ")
    while (train_method not in {'full_trn','lora_trn','qlora'}):
        train_method = input("Invalid input. Please pick from {'full_trn','lora_trn','qlora'}.\nTrain method: ")
    optimizer = input("Optimizer: ")
    while (optimizer not in {'adam_opt', 'sgd_opt'}):
        optimizer = input("Invalid input. Please pick from {'adam_opt', 'sgd_opt'}.\nOptimizer: ")
    # Anything other than a literal "y" counts as "no".
    gradient_checkpointing = input("Gradient checkpointing? {'y', 'n'} ") == "y"
    # Inference-only settings are not used when training.
    quant, prompt_len, tokens_to_generate = None, None, 1
else: # inference
    quant = input("Quantization method: ")
    while (quant not in {'no_quant', 'bnb_int8', 'bnb_q4', 'ggml_Q2_K', 'ggml_Q3_K_L','ggml_Q3_K_M', 'ggml_QK4_0','ggml_QK4_1','ggml_QK4_K_M','ggml_QK4_K_S', 'ggml_QK5_0', 'ggml_QK5_1', 'ggml_QK5_K_M', 'ggml_Q6_K', 'ggml_QK8_0'}):
        quant = input("Invalid input. Please pick from {'no_quant', 'bnb_int8', 'bnb_q4', 'ggml_Q2_K', 'ggml_Q3_K_L','ggml_Q3_K_M', 'ggml_QK4_0','ggml_QK4_1','ggml_QK4_K_M','ggml_QK4_K_S', 'ggml_QK5_0', 'ggml_QK5_1', 'ggml_QK5_K_M', 'ggml_Q6_K', 'ggml_QK8_0'}.\nQuantization method: ")
    try:
        prompt_len = int(input("Prompt length in tokens: "))
        tokens_to_generate = int(input("Output length in tokens: "))
        # The message below promises positive ints, so zero and negative
        # values are rejected as well.
        if prompt_len <= 0 or tokens_to_generate <= 0:
            raise ValueError("token lengths must be positive")
    except ValueError:
        print(f'prompt_len and output_len should be positive int. We will proceed with default value:{300, 300}')
        prompt_len, tokens_to_generate = 300, 300
    # Training-only settings are not used for inference.
    train_method, optimizer, gradient_checkpointing = None, None, None

# Batch size applies to both modes; keep asking until a positive integer is given.
while True:
    try:
        batch_size = int(input("Batch Size: "))
    except ValueError:
        batch_size = 0  # forces another round of the loop
    if batch_size > 0:
        break
    print('Invalid input. Batch size should be a positive int.')

print(f'\nBased on your input, the task variables are \n\
      train_or_inference: {train_or_inference}, \n\
      train_method: {train_method},\n\
      optimizer: {optimizer}, \n\
      gradient_checkpointing: {gradient_checkpointing},\n\
      quant: {quant}, \n\
      prompt_len: {prompt_len}, \n\
      tokens_to_generate: {tokens_to_generate}, \n\
      batch_size: {batch_size} \n\
      \n{"="*110}')

Please input your desired task types...
Train or Inference: inf
Quantization method: no_quant
Prompt length in tokens: 300
Output length in tokens: 300
Batch Size: 1

Based on your input, the task variables are 
      train_or_inference: inf, 
      train_method: None,
      optimizer: None, 
      gradient_checkpointing: None,
      quant: no_quant, 
      prompt_len: 300, 
      tokens_to_generate: 300, 
      batch_size: 1 
      


In [45]:
print("=" * 110)
print('Data Augmentation using get_model_memory starts...')
# Wrapper function for applying get_model_memory with error handling
# TODO: enable users to choose training or inference
def apply_get_model_memory(row):
    """Return the 'Total' memory estimate for one leaderboard row.

    The estimate comes from ``get_model_memory.findMemoryRequirement``, called
    with the row's ``Model_name_for_query`` and the task settings held in the
    notebook's global variables.

    Returns:
        The value stored under ``'Total'`` in the estimator's result, or ``-1``
        as a sentinel when anything in the lookup raises an exception.
    """
    try:
        estimate = get_model_memory.findMemoryRequirement(
            row['Model_name_for_query'],
            train_or_inference,
            train_method,
            optimizer,
            quant,
            prompt_len,
            tokens_to_generate,
            batch_size,
            gradient_checkpointing,
        )
        total = estimate['Total']
    except Exception:
        # A negative value marks rows for which no estimate could be produced.
        total = -1
    return total

# Compute the memory estimate row by row and store it in a new 'Memory' column.
data['Memory'] = data.apply(apply_get_model_memory, axis='columns')
print()
print('Data Augmentation Finish!')
print("=" * 110, end='\n\n\n')

Data Augmentation using get_model_memory starts...

Data Augmentation Finish!




In [46]:
# Drop the rows flagged with the -1 sentinel (no memory estimate available).
# TODO: give estimates to models not in the list
data = data[data['Memory'].ne(-1)]

In [47]:
def find_best_model(models, boundaries=None):
    """Find the highest-'Average' model inside each parameter-size bucket.

    Buckets are the half-open ranges ``[low, high)`` formed by consecutive
    boundaries, starting at 0 and ending with an unbounded last bucket
    (``high`` is ``None``). Buckets that contain no model are skipped.

    Args:
        models: DataFrame with '#Params (B)', 'Average' and
            'Model_name_for_query' columns.
        boundaries: Bucket boundaries in '#Params (B)' units, expected in
            ascending order. When omitted, the notebook-level variable
            ``seperation`` is used, which must then be defined elsewhere in the
            notebook (it is not visible in this part of the file).

    Returns:
        A set of ``((low, high), model_name)`` tuples, one per non-empty bucket.
        Each winner is also printed.
    """
    if boundaries is None:
        boundaries = seperation
    # Consecutive edge pairs describe the buckets: 0..b0, b0..b1, ..., bN..None.
    edges = [0, *boundaries, None]

    best_model_names = set()
    for low, high in zip(edges, edges[1:]):
        in_bucket = models['#Params (B)'] >= low
        # Compare against None explicitly so that a boundary of 0 is not
        # mistaken for "no upper bound".
        if high is not None:
            in_bucket = in_bucket & (models['#Params (B)'] < high)
        sub_models = models[in_bucket]
        if sub_models.empty:
            continue  # skip if there are no models within this size range
        max_score_index = sub_models['Average'].idxmax()
        best_name = sub_models.loc[max_score_index]["Model_name_for_query"]
        print(f'({low},{high})', best_name)
        best_model_names.add(((low, high), best_name))
    return best_model_names

In [48]:
print(f'{"="*110}\nSearching for best models...')
# The memory budget is the free GPU memory when a GPU is used, otherwise the
# available system memory, scaled by 1024 before comparing with 'Memory'.
# NOTE(review): the scaling presumably converts GB to the MB-like unit of the
# 'Memory' column - confirm that gpu_memory_free uses the same unit as mem_available.
memory_budget = (gpu_memory_free if have_gpu else mem_available) * 1024
deployable_models = data[data['Memory'] < memory_budget]

if deployable_models.empty:
    # idxmax() raises on an empty frame, so report the situation and leave
    # model_name unset (None) for the following cells.
    model_name = None
    print(f'\nNo model fits in the available memory.\n{"="*110}\n\n')
else:
    max_score_index = deployable_models['Average'].idxmax()
    model_name = deployable_models.loc[max_score_index]["Model_name_for_query"]
    print(f'\nBest Model Found!\n{"="*110}\n\n')

Searching for best models...

Best Model Found!




In [49]:
# Display the name of the model chosen by the best-model search.
model_name

'microsoft/phi-1_5'

In [53]:
# Show the row(s) for microsoft/phi-1_5 again, now that 'Memory' has been added.
data[data['Model_name_for_query'].eq("microsoft/phi-1_5")]

Unnamed: 0,Type,Model_repo,Model_experiment_details,Average,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,DROP,Precision,Hub_License,#Params (B),Hub_like,Available_on_the_hub,Model_sha,Model_name_for_query,Memory
1025,pretrained,https://huggingface.co/microsoft/phi-1_5,https://huggingface.co/datasets/open-llm-leade...,41.6,52.9,63.79,43.89,40.89,72.22,12.43,5.04,torch.bfloat16,other,1.3,916.0,True,ea95720a352172db6fcbcd89032bfb1cb8481797,microsoft/phi-1_5,4381
