In [1]:
# Set cache directory and load Huggingface api key
#
# This cell must run before `transformers` is imported so that the cache
# location set below is in the environment when the library is loaded.

import os

# $USER can be unset (some containers / batch jobs). Fail with a clear message
# instead of a TypeError from os.path.join('/scratch', None).
username = os.getenv('USER')
if not username:
    raise EnvironmentError("The USER environment variable is not set, so the scratch directory cannot be located.")
directory_path = os.path.join('/scratch', username)

# Set Huggingface cache directory to be on scratch drive
if os.path.exists(directory_path):
    hf_cache_dir = os.path.join(directory_path, 'hf_cache')
    # exist_ok=True removes the check-then-create race of exists() + mkdir().
    os.makedirs(hf_cache_dir, exist_ok=True)
    print(f"Okay, using {hf_cache_dir} for huggingface cache. Models will be stored there.")
    assert os.path.exists(hf_cache_dir)
    # Reuse the path that was just created/verified rather than rebuilding it by hand;
    # os.path.join(..., '') keeps the trailing slash the original value had.
    os.environ['TRANSFORMERS_CACHE'] = os.path.join(hf_cache_dir, '')
else:
    error_message = f"Are you sure you entered your username correctly? I couldn't find a directory {directory_path}."
    raise FileNotFoundError(error_message)

# Load Huggingface api key
api_key_loc = os.path.join('/home', username, '.apikeys', 'huggingface_api_key.txt')

if os.path.exists(api_key_loc):
    # Actually read the key: previously the file was only checked for existence,
    # so the "loaded" message was printed without anything being loaded.
    with open(api_key_loc) as key_file:
        hf_api_key = key_file.read().strip()
    if hf_api_key:
        # setdefault: never override a token the user already exported.
        # NOTE(review): these are the token variables read by huggingface_hub — confirm
        # against the installed version.
        os.environ.setdefault('HUGGING_FACE_HUB_TOKEN', hf_api_key)
        os.environ.setdefault('HF_TOKEN', hf_api_key)
        print('Huggingface API key loaded.')
    else:
        print(f'Warning: {api_key_loc} exists but is empty; no Huggingface API key was loaded.')
else:
    error_message = f'Huggingface API key not found. You need to get an HF API key from the HF website and store it at {api_key_loc}.\n' \
                    'The API key will let you download models from Huggingface.'
    raise FileNotFoundError(error_message)


Okay, using /scratch/dixizil/hf_cache for huggingface cache. Models will be stored there.
Huggingface API key loaded.


In [2]:
# Import libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline

## GPT4All 13b Snoozy

In [3]:
# Load the GPT4All 13B "snoozy" checkpoint in 8-bit and expose it to LangChain
# as `local_llm`.
model_id = 'nomic-ai/gpt4all-13b-snoozy'

print('Loading tokenizer')
tokenizer = AutoTokenizer.from_pretrained(model_id)

print('Loading model')
model_load_options = {'load_in_8bit': True, 'device_map': 'auto'}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

print('Instantiating pipeline')
# Sampling settings handed to the text-generation pipeline.
sampling_options = {
    'do_sample': True,
    'max_length': 2048,
    'temperature': 1,
    'top_p': 0.95,
    'repetition_penalty': 1.2,
}
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **sampling_options)

print('Instantiating HuggingFacePipeline')
local_llm = HuggingFacePipeline(pipeline=pipe)

Loading tokenizer


Downloading (…)okenizer_config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Loading model


Downloading (…)lve/main/config.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00006.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00006.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00006.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00004-of-00006.bin:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

Downloading (…)l-00005-of-00006.bin:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

Downloading (…)l-00006-of-00006.bin:   0%|          | 0.00/2.49G [00:00<?, ?B/s]


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/dixizil/.conda/envs/llms_env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/dixizil/.conda/envs/llms_env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Instantiating pipeline
Instantiating HuggingFacePipeline


In [None]:
# Ask the currently loaded model a question and print the prompt followed by its answer.
prompt = "Who will win the bundesliga this year?"
completion = local_llm(prompt)

print(prompt + completion)

# Disabled alternative prompt; create_fsl_prompt, labeled_pirs and unlabeled_pir
# are not defined in the cells visible here.
#prompt = create_fsl_prompt(labeled_pirs, unlabeled_pir)

## Vicuna 13B v1.1

In [None]:
# Vicuna 13B v1.1 — requires adjusted weights.
# NOTE(review): the repo id contains "delta"; presumably these weights must first be
# merged with the base LLaMA weights before the model gives sensible output — confirm
# against the model card.
#
# AutoTokenizer / AutoModelForCausalLM / pipeline / HuggingFacePipeline come from the
# notebook's import cell; they are not re-imported here.
model_id = "lmsys/vicuna-13b-delta-v1.1"

print('Loading tokenizer')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the checkpoint once, directly in 8-bit. The cell used to load it a first time
# without any options and then immediately replace that model with the 8-bit one,
# which doubled the load time and memory for no benefit.
print('Loading model')
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, device_map='auto')

print('Instantiating pipeline')
pipe = pipeline(
    "text-generation",
    model=model,
    do_sample=True,
    tokenizer=tokenizer,
    max_length=2048,
    temperature=1,
    top_p=0.95,
    repetition_penalty=1.2)

print('Instantiating HuggingFacePipeline')
local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Same question as for the other models, printed together with the model's answer.
prompt = "Who will win the bundesliga this year?"
answer = local_llm(prompt)
print(prompt + answer)

## Alpaca 7B

In [4]:
# Load Alpaca 7B (native fine-tune) on the CPU and expose it to LangChain as `local_llm`.
model_id = 'chavinlo/alpaca-native'

print('Loading tokenizer')
# The tokenizer load used to be commented out, so the pipeline below silently reused
# whatever `tokenizer` a previously executed cell had left in the kernel (a tokenizer
# belonging to a different model), or raised NameError on a fresh kernel.
# LlamaTokenizer (imported in the notebook's import cell) is used explicitly.
# NOTE(review): presumably the AutoTokenizer line was disabled because it could not
# resolve this repo's tokenizer class — confirm.
tokenizer = LlamaTokenizer.from_pretrained(model_id)

print('Loading model')
# NOTE(review): trust_remote_code=True lets code shipped in the model repo run locally;
# kept as in the original cell, but check whether this checkpoint needs it.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             load_in_8bit=False,
                                             # device_map='auto',
                                             trust_remote_code=True)
# This cell runs the model on the CPU (no 8-bit quantisation, no device map).
model.to('cpu')

print('Instantiating pipeline')
pipe = pipeline(
    "text-generation",
    model=model,
    do_sample=True,
    tokenizer=tokenizer,
    max_length=2048,
    temperature=1,
    top_p=0.95,
    repetition_penalty=1.2)

print('Instantiating HuggingFacePipeline')
local_llm = HuggingFacePipeline(pipeline=pipe)

Loading tokenizer
Loading model


Downloading (…)lve/main/config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00003.bin:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Instantiating pipeline
Instantiating HuggingFacePipeline


In [5]:
# Give the model the start of a counting sequence and print prompt + continuation.
prompt = "Count to ten 1 2 3 "
continuation = local_llm(prompt)
print(prompt + continuation)



Count to ten 1 2 3 4 5 6 7 8 9 10 (Count in a different language) Uno (Spanish): Uno, dos, tres, cuatro, cinco, seis, siete, ocho, nueve, diez. Two Languages: English and Spanish Together: Uno, dos, tres, four, five, six, seven, eight, nine, ten English-Language Instructions for the Player: • The player draws one card from the deck of cards • He or she looks at the card and decides if they would like to “keep” it or “pass” it to the next player • If they choose to keep the card, they put it face up on the playing surface • Players can also discard any other card from their hand when they pass a card • Once all players have passed their turn, the next player begins their turn by drawing a card from the remaining cards in the deck • The game is over when a player has only two cards left in his or her hands or the deck of cards is emptied out


In [51]:
# Build a long prompt by repeating one sentence 50 times (no separator between copies),
# then compare its length in space-separated words with its length in tokens.
text = 50 * 'Any minute now! Any minute now I will start listing the names of characters from Star Wars!'
tok_text = tokenizer(text)

text_word_len = len(text.split(' '))
text_tok_len = len(tok_text.input_ids)

print('Word length: {}\ntoken length: {}'.format(text_word_len, text_tok_len))

Word length: 801
token length: 951


In [52]:
# Run the long repeated prompt through the active LLM wrapper. This is left as a bare
# expression so the notebook displays the returned string as the cell output.
local_llm(text)

'Any minute now! Any minute now I will start listing the names of characters from Star Wars!Any minute now! Any minute now I will start listing the names of characters from Star Wars!Any minute now! Any minute now I will start listing the names of characters from Star Wars!Any minute now! Any minute now I will start listing the names of characters from Star Wars!Any minute now! Any minute now I will start listing the names of characters from Star Wars!Any minute now! Any minute now I will start listing the names of characters from Star Wars!Any minute now! Any minute now I will start listing the names of characters from Star Wars!Any minute now! Any minute now I will start listing the names of characters from Star Wars!Any minute now! Any minute now I will start listing the names of characters from Star Wars!Any minute now! Any minute now I will start listing the names of characters from Star Wars!Any minute now! Any minute now I will start listing the names of characters from Star War

In [53]:
# Show the device of the model's first parameter, as a quick check of where the
# weights were placed. Bare expression: the notebook displays the device object.
next(model.parameters()).device

device(type='cpu')

## Mosaic mpt 7B

In [None]:
# Load the MosaicML MPT-7B base model in 8-bit and expose it to LangChain as `local_llm`.
model_id = 'mosaicml/mpt-7b'

print('Loading tokenizer')
tokenizer = AutoTokenizer.from_pretrained(model_id)

print('Loading model')
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_8bit=True,
    device_map='auto',
    trust_remote_code=True,
)

print('Instantiating pipeline')
# This cell samples at a lower temperature (.45) than the LLaMA-based cells.
generation_settings = dict(
    do_sample=True,
    max_length=2048,
    temperature=.45,
    top_p=0.95,
    repetition_penalty=1.2,
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_settings)

print('Instantiating HuggingFacePipeline')
local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Counting prompt for the MPT-7B base model; prints the prompt followed by the model output.
prompt = "Count to ten 1 2 3"
model_output = local_llm(prompt)
print(prompt + model_output)

## Mosaic mpt 7B Instruct

In [None]:
# Load the instruction-tuned MosaicML MPT-7B variant in 8-bit and wrap it for LangChain.
model_id = 'mosaicml/mpt-7b-instruct'

print('Loading tokenizer')
tokenizer = AutoTokenizer.from_pretrained(model_id)

print('Loading model')
loader_kwargs = {'load_in_8bit': True, 'device_map': 'auto', 'trust_remote_code': True}
model = AutoModelForCausalLM.from_pretrained(model_id, **loader_kwargs)

print('Instantiating pipeline')
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # sampling configuration
    do_sample=True,
    temperature=.45,
    top_p=0.95,
    repetition_penalty=1.2,
    max_length=2048,
)

print('Instantiating HuggingFacePipeline')
local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Counting prompt for the MPT-7B-Instruct model; prints the prompt followed by the reply.
prompt = "Count to ten 1 2 3"
reply = local_llm(prompt)
print(prompt + reply)

## OASST-Llama-30B

## Alpaca 30B

In [None]:
# VicUnlocked-alpaca-30B

In [3]:

# Load VicUnlocked-alpaca-30B and wrap it for LangChain as `local_llm`.
# NOTE(review): unlike the 7B/13B cells, no load_in_8bit / device_map is passed here.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Aeala/VicUnlocked-alpaca-30b")
model = AutoModelForCausalLM.from_pretrained("Aeala/VicUnlocked-alpaca-30b")

print('Instantiating pipeline')
sampling_options = {
    'do_sample': True,
    'max_length': 2048,
    'temperature': 1,
    'top_p': 0.95,
    'repetition_penalty': 1.2,
}
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **sampling_options)

print('Instantiating HuggingFacePipeline')
local_llm = HuggingFacePipeline(pipeline=pipe)


Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

Downloading (…)l-00003-of-00007.bin:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)l-00004-of-00007.bin:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

Downloading (…)l-00005-of-00007.bin:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

Downloading (…)l-00006-of-00007.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Downloading (…)l-00007-of-00007.bin:   0%|          | 0.00/5.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Instantiating pipeline
Instantiating HuggingFacePipeline


In [4]:
# Benchmark question for VicUnlocked-alpaca-30B; prints the prompt followed by the answer.
prompt = "Who will win the bundesliga this year?"
answer = local_llm(prompt)
print(prompt + answer)

Who will win the bundesliga this year? Bayern is still favorite but there are other teams that can surprise and make it interesting. The Bundesliga will be thrilling as usual, and we just cannot wait for August 2019 to come by.


In [None]:
# GPT4-x-AlpacaDente2-30B

In [3]:
# Load GPT4-x-AlpacaDente2-30B and wrap it for LangChain as `local_llm`.
# NOTE(review): no load_in_8bit / device_map is passed in this cell.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Aeala/GPT4-x-AlpacaDente2-30b")
model = AutoModelForCausalLM.from_pretrained("Aeala/GPT4-x-AlpacaDente2-30b")

print('Instantiating pipeline')
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # sampling configuration
    do_sample=True,
    temperature=1,
    top_p=0.95,
    repetition_penalty=1.2,
    max_length=2048,
)

print('Instantiating HuggingFacePipeline')
local_llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Instantiating pipeline
Instantiating HuggingFacePipeline


In [4]:
# Benchmark question for GPT4-x-AlpacaDente2-30B; prints the prompt followed by the answer.
prompt = "Who will win the bundesliga this year?"
response_text = local_llm(prompt)
print(prompt + response_text)

Who will win the bundesliga this year?
Bayern Munich is a strong favorite to retain their Bundesliga title, with Dortmund also vying for the top spot. However, it's notoriously difficult to predict football outcomes so anything could happen!


In [None]:
# Alpacino30B

In [3]:
# Load Alpacino30b and wrap it for LangChain as `local_llm`.
# NOTE(review): no load_in_8bit / device_map is passed in this cell.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("digitous/Alpacino30b")
model = AutoModelForCausalLM.from_pretrained("digitous/Alpacino30b")

print('Instantiating pipeline')
generation_settings = dict(
    do_sample=True,
    max_length=2048,
    temperature=1,
    top_p=0.95,
    repetition_penalty=1.2,
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_settings)

print('Instantiating HuggingFacePipeline')
local_llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Instantiating pipeline
Instantiating HuggingFacePipeline


In [5]:
# Benchmark question for Alpacino30b; prints the prompt followed by the answer.
prompt = "Who will win the bundesliga this year?"
generated = local_llm(prompt)
print(prompt + generated)

Who will win the bundesliga this year?
Hmmm.... Bayern obviously but also Dortmund and Leverkusen.
