In [1]:
%%capture
!pip install transformers==4.39.3
!pip install accelerate==0.28.0

# Downloading a Specific File

In [2]:
from huggingface_hub import hf_hub_download

# Which repository, which revision, and which file inside it to fetch.
model_id = "microsoft/Mistral-7B-v0.1-onnx"
revision = "main"  # a specific commit SHA can be used here instead
filename = "README.md"

# Fetch that single file; the call's return value (the local path of the
# downloaded file) is displayed as the cell output.
hf_hub_download(
    repo_id=model_id,
    filename=filename,
    revision=revision,
)

README.md:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

'/root/.cache/huggingface/hub/models--microsoft--Mistral-7B-v0.1-onnx/snapshots/3d5ce13f65bd660957092e9e9d703f7f2d512f7e/README.md'

# Downloading The Entire Model Repo

`snapshot_download()` downloads an entire repository at a given revision. Internally it uses `hf_hub_download()`, which means all downloaded files are also cached on your local disk. Downloads are made concurrently to speed up the process.

In [3]:
!mkdir phi-2

In [4]:
from huggingface_hub import snapshot_download

# Folder that receives the phi-2 repository snapshot.
phi2_folder = "phi-2"

# NOTE(review): cache_dir and local_dir point at the same folder, so the hub
# cache tree (models--microsoft--phi-2) is created alongside the model files.
# Confirm this layout is intended.
snapshot_download(
    repo_id="microsoft/phi-2",
    repo_type="model",
    revision="main",
    cache_dir=phi2_folder,
    local_dir=phi2_folder,
    # local_files_only=True,  # optional; presumably reuses files already on disk -- verify
)

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

NOTICE.md:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

SECURITY.md:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

CODE_OF_CONDUCT.md:   0%|          | 0.00/444 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.04k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

modeling_phi.py:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

'/kaggle/working/phi-2'

In [5]:
# List what the snapshot download placed in the phi-2 folder.
!ls phi-2

CODE_OF_CONDUCT.md  configuration_phi.py	      models--microsoft--phi-2
LICENSE		    generation_config.json	      special_tokens_map.json
NOTICE.md	    merges.txt			      tokenizer.json
README.md	    model-00001-of-00002.safetensors  tokenizer_config.json
SECURITY.md	    model-00002-of-00002.safetensors  vocab.json
added_tokens.json   model.safetensors.index.json
config.json	    modeling_phi.py


In [6]:
# Load phi-2 from the local snapshot folder and wrap it in a high-level
# text-generation pipeline.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


model = AutoModelForCausalLM.from_pretrained(phi2_folder)
tokenizer = AutoTokenizer.from_pretrained(phi2_folder)

# https://github.com/huggingface/transformers/blob/09f9f566de83eef1f13ee83b5a1bbeebde5c80c1/src/transformers/configuration_utils.py#L49
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Use the GPU when one is available; otherwise fall back to CPU so the
    # cell still runs on hosts without CUDA instead of raising.
    device="cuda" if torch.cuda.is_available() else "cpu",
    # device_map="auto",  # alternative to `device`; works with accelerate installed
    # NOTE(review): model and tokenizer are already instantiated above --
    # confirm whether trust_remote_code is still needed on this call.
    trust_remote_code=True,
)
# Last expression: display the pipeline object as the cell output.
pipe

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

2024-04-04 04:19:54.306041: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 04:19:54.306220: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 04:19:54.516073: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<transformers.pipelines.text_generation.TextGenerationPipeline at 0x7ff396f85300>

In [7]:
# Show which device the pipeline is running on.
pipe.device

device(type='cuda')

In [8]:
# Smoke test: generate a continuation for a one-word prompt; no generation
# arguments are passed, so the pipeline's defaults apply.
pipe("Hello")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, world!")\n\n# Call the function\nprint(greet("Alice"))\n'}]

In [9]:
# Generate a continuation for a natural-language sentence prefix, again with
# no generation arguments passed.
pipe("The weather in Melbourne is")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The weather in Melbourne is usually warm and sunny, with temperatures ranging from 20 to 30 degrees Celsius.'}]