# **Carrega o conjunto de dados fornecido (corpus) 💪**



In [None]:
import os
import zipfile

# Download and unpack the corpus only when the corpus/ folder is not there yet.
if not os.path.exists("corpus"):
  # Imported here so the cell stays self-contained.
  import shutil
  import urllib.request

  # Fetch corpus.zip with the standard library instead of the IPython-only
  # "! wget" shell escape: this also runs as plain Python, does not need wget
  # to be installed, and raises immediately if the download fails.
  url = "https://raw.githubusercontent.com/AngryLeaderBB/NLP_class_2024/refs/heads/main/Atividade_1/corpus.zip"
  with urllib.request.urlopen(url) as response, open("corpus.zip", "wb") as archive:
    shutil.copyfileobj(response, archive)

  try:
    # Extract corpus.zip into the corpus/ folder.
    with zipfile.ZipFile("corpus.zip", "r") as zip_ref:
      zip_ref.extractall("corpus")
  finally:
    # Delete the archive even when extraction fails, so a broken download
    # is not picked up again on the next run.
    os.remove("corpus.zip")



# **Importa as funções implementadas do algoritmo BPE ⬇️**

**Arquivo bpe.py**


In [None]:
import os
# bpe.py is the module that implements the BPE algorithm; fetch it only when
# it is not already present in the working directory.
if not os.path.exists("bpe.py"):
  # Imported here so the cell stays self-contained.
  import shutil
  import urllib.request

  url = "https://raw.githubusercontent.com/AngryLeaderBB/NLP_class_2024/refs/heads/main/Atividade_1/bpe.py"
  # Standard-library download instead of the IPython-only "! wget" escape.
  # The data goes to a temporary name and is renamed only on success, so an
  # interrupted download never leaves a truncated bpe.py that the existence
  # check above would accept on the next run.
  with urllib.request.urlopen(url) as response, open("bpe.py.part", "wb") as out:
    shutil.copyfileobj(response, out)
  os.replace("bpe.py.part", "bpe.py")


**Arquivo tokenizer.pkl (BPE já treinado)**

In [None]:
import os

# A pre-trained tokenizer with a vocabulary size of 500, for when you do not
# want to train it here.
# NOTE(review): this is a pickle file fetched from the network; unpickling can
# execute arbitrary code, so only load it if the source repository is trusted.
if not os.path.exists("tokenizer.pkl"):
  # Imported here so the cell stays self-contained.
  import shutil
  import urllib.request

  url = "https://raw.githubusercontent.com/AngryLeaderBB/NLP_class_2024/refs/heads/main/Atividade_1/tokenizer.pkl"
  # Standard-library download instead of the IPython-only "! wget" escape.
  # The data goes to a temporary name and is renamed only on success, so an
  # interrupted download never leaves a truncated tokenizer.pkl behind.
  with urllib.request.urlopen(url) as response, open("tokenizer.pkl.part", "wb") as out:
    shutil.copyfileobj(response, out)
  os.replace("tokenizer.pkl.part", "tokenizer.pkl")

# **Aplica o algoritmo BPE no conjunto de dados 🏋️‍♂️**

In [4]:
from bpe import Tokenizer
import pickle
import json

# Create the tokenizer.
bpe = Tokenizer()

string_buffer = []
# Collect the title and text of every .json document in corpus/.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the training text (and therefore the training run) reproducible.
for file in sorted(os.listdir("corpus")):
  if file.endswith(".json"):
    file_path = f"corpus/{file}"
    # Decode explicitly as UTF-8 instead of the platform's locale encoding
    # (assumes the corpus files are UTF-8, the usual JSON encoding).
    with open(file_path, encoding="utf-8") as f:
      json_file = json.load(f)

    string_buffer.extend((json_file["title"], json_file["text"]))

print("Json files have been read\nStarting training")

# Train BPE on all documents joined by blank lines.
# NOTE(review): 500 is presumably the target vocabulary size and True
# presumably turns on the progress output; confirm against bpe.py.
bpe.train("\n\n".join(string_buffer), 500, True)

print("BPE tokenizer has been trained")

# Save the trained tokenizer (overwrites any existing tokenizer.pkl).
with open("tokenizer.pkl", "wb") as f:
  pickle.dump(bpe, f)

print("Tokenizer has been saved")

Json files have been read
Starting training
[-                                                 ] 2.0491803278688523%, time = 198.835346698761
[--                                                ] 4.098360655737705%, time = 184.72049045562744
[---                                               ] 6.147540983606557%, time = 183.13305950164795
[----                                              ] 8.19672131147541%, time = 170.96518206596375
[-----                                             ] 10.245901639344263%, time = 166.09898376464844
[------                                            ] 12.295081967213115%, time = 166.48824381828308
[-------                                           ] 14.344262295081966%, time = 157.00936818122864
[--------                                          ] 16.39344262295082%, time = 157.08249497413635
[---------                                         ] 18.0327868852459%, time = 122.19477796554565
[----------                                        ] 20.081967213

# **Exibe os resultados da segmentação das subpalavras no final do processo 🖼️**

In [5]:
import pickle

# Load the saved Tokenizer object from tokenizer.pkl.
# NOTE(review): pickle.load can execute arbitrary code; tokenizer.pkl may have
# been downloaded rather than trained locally, so only load it from a trusted
# source.
with open("tokenizer.pkl", "rb") as f:
  bpe = pickle.load(f)

print("Vocabulary:\n", bpe.vocab, "\n")
print("Merges:\n", bpe.merges, "\n")

# Test text
text = """A blast furnace is a type of metallurgical furnace used for smelting to produce industrial
metals, generally pig iron, but also others such as lead or copper. Blast refers to the
combustion air being supplied above atmospheric pressure.

In a blast furnace, fuel (coke), ores, and flux (limestone) are continuously supplied through the
top of the furnace, while a hot blast of air (sometimes with oxygen enrichment) is blown into the
lower section of the furnace through a series of pipes called tuyeres, so that the chemical
reactions take place throughout the furnace as the material falls downward. The end products
are usually molten metal and slag phases tapped from the bottom, and waste gases (flue gas)
exiting from the top of the furnace.[1] The downward flow of the ore along with the flux in
contact with an upflow of hot, carbon monoxide-rich combustion gases is a countercurrent
exchange and chemical reaction process.[2]

In contrast, air furnaces (such as reverberatory furnaces) are naturally aspirated, usually by the
convection of hot gases in a chimney flue. According to this broad definition, bloomeries for
iron, blowing houses for tin, and smelt mills for lead would be classified as blast furnaces.
However, the term has usually been limited to those used for smelting iron ore to produce pig
iron, an intermediate material used in the production of commercial iron and steel, and the shaft
furnaces used in combination with sinter plants in base metals smelting.[3][4]

Blast furnaces are estimated to have been responsible for over 4% of global greenhouse gas emissions between 1900 and 2015, but
are difficult to decarbonize.[5] """

# Print the encoded form of the test text returned by bpe.encode.
encoded_text = bpe.encode(text)
print("Encoded text:\n", encoded_text, "\n")

# Check that decoding the encoded text gives back the original text.
decoded_text = bpe.decode(encoded_text)
print("The decoding of the encoding is the original text? (",
      decoded_text == text, ")\n")

# Print the segmented text in the form [ 'token' ].
print("Segmented text:\n", bpe.text_to_tokens(text))


Vocabulary:
 {0: b'\x00', 1: b'\x01', 2: b'\x02', 3: b'\x03', 4: b'\x04', 5: b'\x05', 6: b'\x06', 7: b'\x07', 8: b'\x08', 9: b'\t', 10: b'\n', 11: b'\x0b', 12: b'\x0c', 13: b'\r', 14: b'\x0e', 15: b'\x0f', 16: b'\x10', 17: b'\x11', 18: b'\x12', 19: b'\x13', 20: b'\x14', 21: b'\x15', 22: b'\x16', 23: b'\x17', 24: b'\x18', 25: b'\x19', 26: b'\x1a', 27: b'\x1b', 28: b'\x1c', 29: b'\x1d', 30: b'\x1e', 31: b'\x1f', 32: b' ', 33: b'!', 34: b'"', 35: b'#', 36: b'$', 37: b'%', 38: b'&', 39: b"'", 40: b'(', 41: b')', 42: b'*', 43: b'+', 44: b',', 45: b'-', 46: b'.', 47: b'/', 48: b'0', 49: b'1', 50: b'2', 51: b'3', 52: b'4', 53: b'5', 54: b'6', 55: b'7', 56: b'8', 57: b'9', 58: b':', 59: b';', 60: b'<', 61: b'=', 62: b'>', 63: b'?', 64: b'@', 65: b'A', 66: b'B', 67: b'C', 68: b'D', 69: b'E', 70: b'F', 71: b'G', 72: b'H', 73: b'I', 74: b'J', 75: b'K', 76: b'L', 77: b'M', 78: b'N', 79: b'O', 80: b'P', 81: b'Q', 82: b'R', 83: b'S', 84: b'T', 85: b'U', 86: b'V', 87: b'W', 88: b'X', 89: b'Y', 90: b'