In [13]:
!pip install autoawq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting autoawq
  Downloading autoawq-0.2.6-cp39-cp39-manylinux2014_x86_64.whl.metadata (18 kB)
Collecting torch==2.3.1 (from autoawq)
  Downloading torch-2.3.1-cp39-cp39-manylinux1_x86_64.whl.metadata (26 kB)
Collecting zstandard (from autoawq)
  Downloading zstandard-0.23.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting autoawq-kernels (from autoawq)
  Downloading autoawq_kernels-0.0.8-cp39-cp39-manylinux2014_x86_64.whl.metadata (2.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.1->autoawq)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.1->autoawq)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.1->autoawq)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collec

In [1]:
import os
os.chdir("/home/feline/master-generation")

# Cell 1: Import necessary libraries
import base64
import datetime
import hashlib
import json
import os
import re
from pathlib import Path
from time import sleep
import requests
import tqdm
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError, RequestException, Timeout
from tqdm.contrib.concurrent import thread_map

base = os.environ.get("HF_ENDPOINT") or "https://huggingface.co"

# Cell 2: Define the ModelDownloader class

class ModelDownloader:
    def __init__(self, max_retries=5):
        self.max_retries = max_retries
        self.session = self.get_session()

    def get_session(self):
        session = requests.Session()
        if self.max_retries:
            session.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=self.max_retries))
            session.mount('https://huggingface.co', HTTPAdapter(max_retries=self.max_retries))

        if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None:
            session.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))

        try:
            from huggingface_hub import get_token
            token = get_token()
        except ImportError:
            token = os.getenv("HF_TOKEN")

        if token is not None:
            session.headers = {'authorization': f'Bearer {token}'}

        return session

    def sanitize_model_and_branch_names(self, model, branch):
        if model[-1] == '/':
            model = model[:-1]

        if model.startswith(base + '/'):
            model = model[len(base) + 1:]

        model_parts = model.split(":")
        model = model_parts[0] if len(model_parts) > 0 else model
        branch = model_parts[1] if len(model_parts) > 1 else branch

        if branch is None:
            branch = "main"
        else:
            pattern = re.compile(r"^[a-zA-Z0-9._-]+$")
            if not pattern.match(branch):
                raise ValueError(
                    "Invalid branch name. Only alphanumeric characters, period, underscore and dash are allowed.")

        return model, branch

    def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None):
        session = self.session
        page = f"/api/models/{model}/tree/{branch}"
        cursor = b""

        links = []
        sha256 = []
        classifications = []
        has_pytorch = False
        has_pt = False
        has_gguf = False
        has_safetensors = False
        is_lora = False
        while True:
            url = f"{base}{page}" + (f"?cursor={cursor.decode()}" if cursor else "")
            r = session.get(url, timeout=10)
            r.raise_for_status()
            content = r.content

            dict = json.loads(content)
            if len(dict) == 0:
                break

            for i in range(len(dict)):
                fname = dict[i]['path']
                if specific_file not in [None, ''] and fname != specific_file:
                    continue

                if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')):
                    is_lora = True

                is_pytorch = re.match(r"(pytorch|adapter|gptq)_model.*\.bin", fname)
                is_safetensors = re.match(r".*\.safetensors", fname)
                is_pt = re.match(r".*\.pt", fname)
                is_gguf = re.match(r'.*\.gguf', fname)
                is_tiktoken = re.match(r".*\.tiktoken", fname)
                is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname) or is_tiktoken
                is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer
                if any((is_pytorch, is_safetensors, is_pt, is_gguf, is_tokenizer, is_text)):
                    if 'lfs' in dict[i]:
                        sha256.append([fname, dict[i]['lfs']['oid']])

                    if is_text:
                        links.append(f"{base}/{model}/resolve/{branch}/{fname}")
                        classifications.append('text')
                        continue

                    if not text_only:
                        links.append(f"{base}/{model}/resolve/{branch}/{fname}")
                        if is_safetensors:
                            has_safetensors = True
                            classifications.append('safetensors')
                        elif is_pytorch:
                            has_pytorch = True
                            classifications.append('pytorch')
                        elif is_pt:
                            has_pt = True
                            classifications.append('pt')
                        elif is_gguf:
                            has_gguf = True
                            classifications.append('gguf')

            cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
            cursor = base64.b64encode(cursor)
            cursor = cursor.replace(b'=', b'%3D')

        if (has_pytorch or has_pt or has_gguf) and has_safetensors:
            has_gguf = False
            for i in range(len(classifications) - 1, -1, -1):
                if classifications[i] in ['pytorch', 'pt', 'gguf']:
                    links.pop(i)

        if has_gguf and specific_file is None:
            has_q4km = False
            for i in range(len(classifications) - 1, -1, -1):
                if 'q4_k_m' in links[i].lower():
                    has_q4km = True

            if has_q4km:
                for i in range(len(classifications) - 1, -1, -1):
                    if 'q4_k_m' not in links[i].lower():
                        links.pop(i)
            else:
                for i in range(len(classifications) - 1, -1, -1):
                    if links[i].lower().endswith('.gguf'):
                        links.pop(i)

        is_llamacpp = has_gguf and specific_file is not None
        return links, sha256, is_lora, is_llamacpp

    def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, model_dir=None):
        if model_dir:
            base_folder = model_dir
        else:
            base_folder = 'models' if not is_lora else 'loras'

        if is_llamacpp:
            return Path(base_folder)

        output_folder = f"{'_'.join(model.split('/')[-2:])}"
        if branch != 'main':
            output_folder += f'_{branch}'

        output_folder = Path(base_folder) / output_folder
        return output_folder

    def get_single_file(self, url, output_folder, start_from_scratch=False):
        filename = Path(url.rsplit('/', 1)[1])
        output_path = output_folder / filename

        max_retries = 7
        attempt = 0
        while attempt < max_retries:
            attempt += 1
            session = self.session
            headers = {}
            mode = 'wb'

            try:
                if output_path.exists() and not start_from_scratch:
                    r = session.get(url, stream=True, timeout=20)
                    total_size = int(r.headers.get('content-length', 0))
                    if output_path.stat().st_size >= total_size:
                        return

                    headers = {'Range': f'bytes={output_path.stat().st_size}-'}
                    mode = 'ab'

                with session.get(url, stream=True, headers=headers, timeout=30) as r:
                    r.raise_for_status()
                    total_size = int(r.headers.get('content-length', 0))
                    block_size = 1024 * 1024

                    filename_str = str(filename)

                    tqdm_kwargs = {
                        'total': total_size,
                        'unit': 'B',
                        'unit_scale': True,
                        'unit_divisor': 1024,
                        'bar_format': '{desc}{percentage:3.0f}%|{bar:50}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]',
                        'desc': f"{filename_str}: "
                    }

                    if 'COLAB_GPU' in os.environ:
                        tqdm_kwargs.update({
                            'position': 0,
                            'leave': True
                        })

                    with open(output_path, mode) as f:
                        with tqdm.tqdm(**tqdm_kwargs) as t:
                            count = 0
                            for data in r.iter_content(block_size):
                                f.write(data)
                                t.update(len(data))
                                if total_size != 0 and self.progress_bar is not None:
                                    count += len(data)
                                    self.progress_bar(float(count) / float(total_size), f"{filename_str}")

                    break
            except (RequestException, ConnectionError, Timeout) as e:
                print(f"Error downloading {filename}: {e}.")
                print(f"That was attempt {attempt}/{max_retries}.", end=' ')
                if attempt < max_retries:
                    print(f"Retry begins in {2 ** attempt} seconds.")
                    sleep(2 ** attempt)
                else:
                    print("Failed to download after the maximum number of attempts.")

    def start_download_threads(self, file_list, output_folder, start_from_scratch=False, threads=4):
        thread_map(lambda url: self.get_single_file(url, output_folder, start_from_scratch=start_from_scratch), file_list, max_workers=threads, disable=True)

    def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar=None, start_from_scratch=False, threads=4, specific_file=None, is_llamacpp=False):
        self.progress_bar = progress_bar

        output_folder.mkdir(parents=True, exist_ok=True)

        if not is_llamacpp:
            metadata = f'url: https://huggingface.co/{model}\n' \
                       f'branch: {branch}\n' \
                       f'download date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'

            sha256_str = '\n'.join([f'    {item[1]} {item[0]}' for item in sha256])
            if sha256_str:
                metadata += f'sha256sum:\n{sha256_str}'

            metadata += '\n'
            (output_folder / 'huggingface-metadata.txt').write_text(metadata)

        if specific_file:
            print(f"Downloading {specific_file} to {output_folder}")
        else:
            print(f"Downloading the model to {output_folder}")

        self.start_download_threads(links, output_folder, start_from_scratch=start_from_scratch, threads=threads)

    def check_model_files(self, model, branch, links, sha256, output_folder):
        validated = True
        for i in range(len(sha256)):
            fpath = (output_folder / sha256[i][0])

            if not fpath.exists():
                print(f"The following file is missing: {fpath}")
                validated = False
                continue

            with open(output_folder / sha256[i][0], "rb") as f:
                bytes = f.read()
                file_hash = hashlib.sha256(bytes).hexdigest()
                if file_hash != sha256[i][1]:
                    print(f'Checksum failed: {sha256[i][0]}  {sha256[i][1]}')
                    validated = False
                else:
                    print(f'Checksum validated: {sha256[i][0]}  {sha256[i][1]}')

        if validated:
            print('[+] Validated checksums of all model files!')
        else:
            print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.')

# Cell 3: Define input parameters and run the downloader

# Instead of argparse, directly define the arguments here
model = 'meta-llama/Meta-Llama-3.1-8B'  # Example: You can change this
branch = 'main'
threads = 4
text_only = False
specific_file = None
output = None
model_dir = None
clean = False
check = False
max_retries = 5

# Initialize the downloader
downloader = ModelDownloader(max_retries=max_retries)

# Clean up the model/branch names
model, branch = downloader.sanitize_model_and_branch_names(model, branch)

# Get the download links from Hugging Face
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=text_only, specific_file=specific_file)

# Get the output folder
output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp, model_dir=model_dir)

if check:
    # Check previously downloaded files
    downloader.check_model_files(model, branch, links, sha256, output_folder)
else:
    # Download files
    downloader.download_model_files(model, branch, links, sha256, output_folder, threads=threads, specific_file=specific_file, is_llamacpp=is_llamacpp)


Downloading the model to models/meta-llama_Meta-Llama-3.1-8B
Error downloading USE_POLICY.md: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/resolve/main/USE_POLICY.md.
That was attempt 1/7. Retry begins in 2 seconds.


README.md: 100%|██████████████████████████████████████████████████| 39.9k/39.9k [00:00<00:00, 411kB/s]


Error downloading config.json: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/resolve/main/config.json.
That was attempt 1/7. Retry begins in 2 seconds.
Error downloading generation_config.json: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/resolve/main/generation_config.json.
That was attempt 1/7. Retry begins in 2 seconds.
Error downloading model-00001-of-00004.safetensors: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/resolve/main/model-00001-of-00004.safetensors.
That was attempt 1/7. Retry begins in 2 seconds.
Error downloading USE_POLICY.md: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/resolve/main/USE_POLICY.md.
That was attempt 2/7. Retry begins in 4 seconds.
Error downloading config.json: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/resolve/main/conf

KeyboardInterrupt: 

In [12]:
from transformers import LlamaTokenizer, AutoTokenizer, LlamaTokenizerFast, PreTrainedTokenizerFast
import os
os.chdir("/home/feline/master-generation")

def count_tokens_starting_with_Q(tokenizer, tokenizer_name):
    count = 0
    
    # Loop over the tokenizer's vocabulary
    for token_id in range(tokenizer.vocab_size):
        token = tokenizer.convert_ids_to_tokens(token_id)
        
        # Check if the token starts with 'X'
        if token.startswith('Q') or token.startswith('ĠQ') or token.startswith('▁Q'):
            count += 1

    print(f"{count} tokens start with 'X' in {tokenizer_name}")
    return count

import re
from transformers import LlamaTokenizer, AutoTokenizer

def count_tokens_starting_with_X(tokenizer, tokenizer_name):
    count = 0
    
    # Define a regex to remove any non-alphabetic characters from the start of the token
    non_alpha_prefix = re.compile(r'^[^a-zA-Z]+')

    # Loop over the tokenizer's vocabulary
    for token_id in range(tokenizer.vocab_size):
        token = tokenizer.convert_ids_to_tokens(token_id)
        
        # Remove any non-alphabetic characters at the start of the token
        stripped_token = non_alpha_prefix.sub('', token)
        
        # Check if the cleaned token starts with 'X'
        if stripped_token.startswith('Q'):
            count += 1

    print(f"{count} tokens start with 'Q' in {tokenizer_name}")
    return count

# Define your text input
text_input = "This is a test sentence for tokenization comparison. How do you like it?"

# Load LlamaTokenizer (for LLaMA 2 or earlier)
llama_tokenizer = LlamaTokenizer.from_pretrained('models/dolphin-2.6-mistral-7b-Mistral-7B-Instruct-v0.1')


# Load AutoTokenizer (for LLaMA 3)
#auto_tokenizer = PreTrainedTokenizerFast.from_pretrained('models/Meta-Llama-3.1-8B')
auto_tokenizer = LlamaTokenizerFast.from_pretrained('models/dolphin-2.6-mistral-7b-Mistral-7B-Instruct-v0.1')
auto_tokenizer.encode_special_tokens = True

print(len(llama_tokenizer))
print(len(auto_tokenizer))


# Tokenize the input using LlamaTokenizer
llama_tokens = llama_tokenizer.tokenize(text_input)
llama_token_ids = llama_tokenizer.encode(text_input)

# Tokenize the input using AutoTokenizer
auto_tokens = auto_tokenizer.tokenize(text_input)
auto_token_ids = auto_tokenizer.encode(text_input)

# Print results for comparison
print("LlamaTokenizer (LLaMA 2 or earlier):")
print(f"Tokens: {llama_tokenizer.decode(llama_token_ids)}")
print(f"Token IDs: {llama_token_ids}")

print("\nAutoTokenizer (LLaMA 3):")
print(f"Tokens: {auto_tokenizer.encode(auto_tokenizer.decode(auto_token_ids))}")
print(f"Token IDs: {auto_token_ids}")


# Count tokens that start with "Q" in both tokenizers
count_tokens_starting_with_Q(llama_tokenizer, 'LlamaTokenizer (LLaMA 2)')
count_tokens_starting_with_Q(auto_tokenizer, 'AutoTokenizer (LLaMA 3)')

# Count tokens that start with "X" in both tokenizers
count_tokens_starting_with_X(llama_tokenizer, 'LlamaTokenizer (LLaMA 2)')
count_tokens_starting_with_X(auto_tokenizer, 'AutoTokenizer (LLaMA 3)')


llama_tokenizer.add_bos_token = False
auto_tokenizer.add_bos_token = False
vocab=list(llama_tokenizer.get_vocab().values())
vocabfast=list(auto_tokenizer.get_vocab().values())
print(len(vocab))
print(len(vocabfast))
if vocab == vocabfast:
    print("The lists are identical")
else:
    print("The lists are not identical")

print(vocab[:10])
print(vocabfast[:10])

print(vocab[-10:])
print(vocabfast[-10:])

32000
32000
LlamaTokenizer (LLaMA 2 or earlier):
Tokens: <s>This is a test sentence for tokenization comparison. How do you like it?
Token IDs: [1, 851, 349, 264, 1369, 12271, 354, 6029, 1837, 10367, 28723, 1602, 511, 368, 737, 378, 28804]

AutoTokenizer (LLaMA 3):
Tokens: [1, 1, 28705, 851, 349, 264, 1369, 12271, 354, 6029, 1837, 10367, 28723, 1602, 511, 368, 737, 378, 28804]
Token IDs: [1, 851, 349, 264, 1369, 12271, 354, 6029, 1837, 10367, 28723, 1602, 511, 368, 737, 378, 28804]
33 tokens start with 'X' in LlamaTokenizer (LLaMA 2)
33 tokens start with 'X' in AutoTokenizer (LLaMA 3)
33 tokens start with 'Q' in LlamaTokenizer (LLaMA 2)
33 tokens start with 'Q' in AutoTokenizer (LLaMA 3)
32000
32000
The lists are not identical
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[18674, 26125, 10886, 19441, 21894, 22135, 11377, 5580, 2682, 10510]
[31990, 31991, 31992, 31993, 31994, 31995, 31996, 31997, 31998, 31999]
[15297, 14026, 30131, 26791, 31362, 4817, 23476, 21378, 23568, 2371]
