# **Baixa e divide os arquivos em treino (80%) e teste (20%)** 🎲

**Baixa e extrai o corpus localmente**

In [1]:
import os
import zipfile

if not os.path.exists("corpus"):
  # Baixa arquivo corpus.zip
  url = "https://raw.githubusercontent.com/AngryLeaderBB/NLP_class_2024/refs/heads/main/Atividade_1/corpus.zip"
  cmd = ! wget {url}

  # Extrai corpus.zip na pasta corpus
  with zipfile.ZipFile("corpus.zip", 'r') as zip_ref:
      zip_ref.extractall("corpus")

  # Exclui arquivo corpus.zip
  os.remove("corpus.zip")

print("Corpus disponível!")

Corpus disponível!


**Divide arquivos 80% treino e 20% teste**

In [2]:
import json
import os
import random

# Lê nome de arquivos
files = os.listdir("corpus")
# Permuta a lista de nomes
random.shuffle(files)

# Corta em aproximadamente 80% treino e 20% teste
cut_index = int(len(files) * 0.8)
train_file_names, test_file_names = files[:cut_index], files[cut_index:]
files.clear()

# Função de leitura de arquivos (retorna string)
def read_files(file_names):
  string_buffer = []
  for file in file_names:
    if file.endswith(".json"):
      file_path = f"corpus/{file}"
      with open(file_path) as f:
        json_file = json.load(f)

      string_buffer.append(json_file["title"])
      string_buffer.append("\n\n")
      string_buffer.append(json_file["text"])
      string_buffer.append("<|endoftext|>")

  return "".join(string_buffer)

# Lê arquivos pela lista de nomes
train = read_files(train_file_names)
test = read_files(test_file_names)



# Imprime resultados -----------------------------
print("-"*15 + " Treino " + "-"*15)
print(f"Primeiros 5 de {len(train_file_names)} arquivos:")
print("",train_file_names[:5])
print("\nTexto:\n\n", train[:40]+"...")
print(f"{'-'*37}\n")

print("-"*15 + " Teste " + "-"*15)
print(f"Primeiros 5 de {len(test_file_names)} arquivos:")
print("",test_file_names[:5])
print("\nTexto:\n\n", test[:40]+"...")
print("-"*37)

# Clear names
train_file_names.clear()
test_file_names.clear()


--------------- Treino ---------------
Primeiros 5 de 8000 arquivos:
 ['51723.json', '101925.json', '35149.json', '87064.json', '25829.json']

Texto:

 Marinha

A marinha constitui — no seu se...
-------------------------------------

--------------- Teste ---------------
Primeiros 5 de 2000 arquivos:
 ['95788.json', '113446.json', '45162.json', '79773.json', '32616.json']

Texto:

 Coracora

Coracora é uma embarcação típi...
-------------------------------------


# **Baixa as funções implementadas do algoritmo de bigrama ⬇️**

Baixa bigram.py

In [4]:
import os
# Arquivo .py que implementa o algoritmo bigrama
if not os.path.exists("bigram.py"):
  url = "https://raw.githubusercontent.com/AngryLeaderBB/NLP_class_2024/refs/heads/main/Atividade_2/bigram.py"
  cmd = ! wget {url}

Baixa o tiktoken

In [5]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


Treina e salva modelo bigrama

In [5]:
import tiktoken
import pickle
from bigram import Bigram

# Gera bigrama com limitação de tamanho da
# lista de entrada tokens (3*10^5 tokens)
gen = Bigram(300000)

# Treina bigrama com debug ativo
gen.train(train, True)

# Salva bigrama
with open("bigram.pkl", "wb") as f:
  pickle.dump(gen, f)

[-                                                 ] 2.000006666688889%, time = 0.0973658561706543
[--                                                ] 4.000013333377778%, time = 0.08343172073364258
[---                                               ] 6.000020000066667%, time = 0.08260130882263184
[----                                              ] 8.000026666755556%, time = 0.09054684638977051
[-----                                             ] 10.000033333444446%, time = 0.08206939697265625
[------                                            ] 12.000040000133334%, time = 0.08269476890563965
[-------                                           ] 14.000046666822223%, time = 0.10077881813049316
[--------                                          ] 16.00005333351111%, time = 0.08959794044494629
[---------                                         ] 18.0000600002%, time = 0.08018326759338379
[----------                                        ] 20.00006666688889%, time = 0.08922338485717773
[-

Carrega bigrama pré-treinado

In [6]:
import pickle

with open("bigram.pkl", "rb") as f:
  gen = pickle.load(f)

print(gen.idx_to_token)
print(gen.probabilities.shape)

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 131097, 28: 28, 29: 30, 30: 31, 31: 32, 32: 33, 33: 34, 34: 35, 35: 36, 36: 37, 37: 38, 38: 39, 39: 40, 40: 41, 41: 42, 42: 43, 43: 44, 44: 45, 45: 46, 46: 47, 47: 48, 48: 49, 49: 50, 50: 51, 51: 52, 52: 53, 53: 54, 54: 55, 55: 56, 56: 57, 57: 58, 58: 59, 59: 60, 60: 61, 61: 62, 62: 64, 63: 65, 64: 66, 65: 67, 66: 68, 67: 69, 68: 70, 69: 71, 70: 72, 71: 73, 72: 74, 73: 75, 74: 76, 75: 77, 76: 78, 77: 79, 78: 80, 79: 81, 80: 82, 81: 83, 82: 84, 83: 85, 84: 86, 85: 87, 86: 88, 87: 89, 88: 90, 89: 91, 90: 92, 91: 131162, 92: 131187, 93: 115, 94: 116, 95: 121, 96: 123, 97: 131200, 98: 135, 99: 131223, 100: 131242, 101: 131256, 102: 131288, 103: 220, 104: 131293, 105: 233, 106: 236, 107: 237, 108: 238, 109: 239, 110: 249, 111: 250, 112: 251, 113: 131325, 114: 258, 115: 259, 116: 260, 117: 261,

Gera texto aleatória

In [27]:
print(gen.generate_text(30))

 Física Teidecanede Santo Agostino do Brasil * Anysio (2022, foi enviada para a Academia Militar. 4 e em


Gera texto via seed

In [34]:
seed = 1234567
print(gen.generate_text(30, seed))

 provavelmente devido à prosseguir as cem anos, Rio de recursos minerais mais sutrasão, Condado Barrette, aquelas em 2003, variante


Calcula perplexidade

In [31]:
# perplexity of generated text
print(gen.perplexity(gen.generate_text(1000)))
print()

# perplexity of test up to 5 token
for i in range(1, 11):
  print(gen.perplexity(test[:i]))

print()
# perplexity of compiled test
print(gen.perplexity(test))


2.3025369710696597

20261.00390625
142.34115600585938
27.261747360229492
inf
inf
inf
21.356151580810547
inf
inf
inf

inf
