In [1]:
import torch

import matplotlib.pyplot as plt

from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset
from wordcloud import WordCloud

In [2]:
# Carregar o modelo pré-treinado e o tokenizador
model_name = "facebook/bart-large-cnn"

In [3]:
# Tokenização 
tokenizer = BartTokenizer.from_pretrained(model_name)

In [4]:
# Carregando modelo treinado
model = BartForConditionalGeneration.from_pretrained(model_name)

In [5]:
from faker import Faker

# Inicializar o gerador de texto aleatório
fake = Faker()

# Gerar 20 textos aleatórios
random_texts = [fake.text() for _ in range(20)]

# Mostrar os textos gerados
for i, text in enumerate(random_texts, 1):
    print(f"Texto {i}: {text}")
    print("="*50)

Texto 1: Direction claim oil someone heart above. Buy lay water early agree research thus wonder. Pay walk himself student threat marriage.
Texto 2: Work stand rate important. Good there later billion wear could table.
Never town career card.
Texto 3: Should sign risk vote paper. Man full nation sometimes technology school.
Texto 4: But meeting herself stuff. Far move idea. Many toward senior to keep prove game.
Think manager service possible everyone. Yet child within recently.
Texto 5: Pull human feeling what all born. Situation change hotel help.
White rather his message democratic move. Present if media college treat beat. Out goal marriage remember I.
Texto 6: Professional actually ability land prove tell down top. Wait public in speech pattern. Conference report ability hotel decision politics.
Texto 7: Prove thing another future she magazine.
Least across candidate. Development back traditional actually then find guess. Once deep president against.
Texto 8: Majority physical who

In [6]:
# Inicializar o gerador de texto aleatório
fake = Faker()

# Gerar 20 textos aleatórios
random_texts = [fake.text() for _ in range(20)]

# Criar o arquivo do dataset
with open("fake_dataset.txt", "w", encoding="utf-8") as file:
    file.writelines("\n".join(random_texts))

In [7]:
from faker import Faker
from datasets import load_dataset
from datasets import Dataset

# Carregar o dataset a partir do arquivo 'fake_dataset.txt'
dataset = load_dataset("text", data_files={"train": "fake_dataset.txt"})

# Mostrar os primeiros 5 exemplos do dataset
print(dataset["train"][:5])

Downloading and preparing dataset text/default to C:/Users/rafae/.cache/huggingface/datasets/text/default-8934ad665618eb34/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to C:/Users/rafae/.cache/huggingface/datasets/text/default-8934ad665618eb34/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

{'text': ['Thousand off operation feel for because. Choice federal chair. Southern song spring wish study star.', 'And show quality stage tell. Like upon wait car attorney eye establish.', 'Baby down page vote. Environment begin main common each race.', 'Much national hear drive. Several race hope of Republican once though give.', 'Really across walk spend close.']}


In [8]:
from faker import Faker
from datasets import Dataset

# Inicializar o gerador de texto aleatório
fake = Faker()

# Gerar 20 textos aleatórios
random_texts = [fake.text() for _ in range(20)]

# Criar o dataset customizado
dataset_dict = {
    "text": random_texts
}

# Criar o objeto Dataset
custom_dataset = Dataset.from_dict(dataset_dict)

# Mostrar os primeiros 5 exemplos do dataset
print(custom_dataset["text"][:5])

['Region remain responsibility usually start program. Keep improve one effect. Per product put itself together leader tell.', 'From president cause start husband example guy. Individual four radio current. Social work actually painting live raise nature.', 'Best store yes. Show fine step eat summer develop sure. Family owner young generation. Success room will together.\nConference model couple type.', 'Certain know set. Eat office manager beautiful eye executive successful front. Within position major task.\nPut open wonder himself heart. Particular business away born get him no.', 'Oil turn wrong significant detail. Interest try order bag. Decision by if collection growth style rich money.']


In [9]:
# Carregar o dataset a partir do arquivo 'fake_dataset.txt'
dataset = load_dataset("text", data_files={"train": "fake_dataset.txt"})
dataset

Found cached dataset text (C:/Users/rafae/.cache/huggingface/datasets/text/default-8934ad665618eb34/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 35
    })
})

In [10]:
# Acessar a coluna "text" do dataset
texts = dataset["train"]["text"]
texts

['Thousand off operation feel for because. Choice federal chair. Southern song spring wish study star.',
 'And show quality stage tell. Like upon wait car attorney eye establish.',
 'Baby down page vote. Environment begin main common each race.',
 'Much national hear drive. Several race hope of Republican once though give.',
 'Really across walk spend close.',
 'Chair account ever another month could cup. Sense across board decision at. Land purpose state response main certain.',
 'Drop five bad exactly nature. Also lose indeed service. Summer nice record.',
 'Color woman structure visit bad often group break. Good want interview same land voice actually. Around because discussion live body attack serve.',
 'Mrs former before we door letter whom. They woman born.',
 'Question development lay front.',
 'Performance cost campaign prepare expect ago indeed. Economy PM himself because. Charge culture dinner stage those early.',
 'Network establish where popular wall provide theory. Suddenl

In [11]:
# Tokenizar os textos e garantir que eles sejam tratados como um lote (batch)
inputs = tokenizer(texts, 
                   max_length=100, 
                   return_tensors="pt", 
                   padding=True, 
                   truncation=True)
inputs

{'input_ids': tensor([[    0, 11329, 37292,  ...,     1,     1,     1],
        [    0,  2409,   311,  ...,     1,     1,     1],
        [    0, 30047,   159,  ...,     1,     1,     1],
        ...,
        [    0, 40866,   171,  ...,     1,     1,     1],
        [    0, 36088,  8559,  ...,     1,     1,     1],
        [    0, 33867,  3392,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [12]:
# Realizar a inferência com o modelo
with torch.no_grad():
    outputs = model.generate(**inputs)



In [13]:
# Decodificar as sequências geradas pelo modelo
predicted_topics = tokenizer.batch_decode(outputs, skip_special_tokens=True)
predicted_topics

['Thousand off operation feel for because. Choice federal chair. Southern song spring wish study star. "I want to be a singer-songwriter. I want to write a song about my life" "I don\'t know why I\'m here. I just want to live my life. I don\'t want to die."',
 'And show quality stage tell. Like upon wait car attorney eye establish. And show quality of the car. LikeUpon wait car lawyer eye establish, and show quality on the road. And like upon waiting car attorney attorney eye established, and the road ahead. And the road before.',
 'Baby down page vote. Environment begin main common each race. Baby up page vote for the first time. Baby down page for the second time for the third time. baby up for the fourth time for both candidates. baby down for the fifth time for each candidate. baby on the left for the sixth.',
 'Much national hear drive. Several race hope of Republican once though give. Many race hope for Republican once again. Much national hearing drive. Some hope for GOP once th

In [14]:
# Juntar todos os tópicos em um único texto para a nuvem de palavras
all_topics_text = " ".join(predicted_topics)
all_topics_text

'Thousand off operation feel for because. Choice federal chair. Southern song spring wish study star. "I want to be a singer-songwriter. I want to write a song about my life" "I don\'t know why I\'m here. I just want to live my life. I don\'t want to die." And show quality stage tell. Like upon wait car attorney eye establish. And show quality of the car. LikeUpon wait car lawyer eye establish, and show quality on the road. And like upon waiting car attorney attorney eye established, and the road ahead. And the road before. Baby down page vote. Environment begin main common each race. Baby up page vote for the first time. Baby down page for the second time for the third time. baby up for the fourth time for both candidates. baby down for the fifth time for each candidate. baby on the left for the sixth. Much national hear drive. Several race hope of Republican once though give. Many race hope for Republican once again. Much national hearing drive. Some hope for GOP once though. Many ho

In [17]:
# Mostrar os resultados
for i, topic in enumerate(predicted_topics):
    print(f"Tópico {i+1}: {topic}")
    print("="*50)

Tópico 1: Thousand off operation feel for because. Choice federal chair. Southern song spring wish study star. "I want to be a singer-songwriter. I want to write a song about my life" "I don't know why I'm here. I just want to live my life. I don't want to die."
Tópico 2: And show quality stage tell. Like upon wait car attorney eye establish. And show quality of the car. LikeUpon wait car lawyer eye establish, and show quality on the road. And like upon waiting car attorney attorney eye established, and the road ahead. And the road before.
Tópico 3: Baby down page vote. Environment begin main common each race. Baby up page vote for the first time. Baby down page for the second time for the third time. baby up for the fourth time for both candidates. baby down for the fifth time for each candidate. baby on the left for the sixth.
Tópico 4: Much national hear drive. Several race hope of Republican once though give. Many race hope for Republican once again. Much national hearing drive. So

In [47]:
from transformers import BartTokenizer, BartForCausalLM

# Carregar o modelo pré-treinado e o tokenizador
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForCausalLM.from_pretrained(model_name)

# Treinar o modelo ou realizar outras operações
# Salvar o modelo e o tokenizador em um diretório específico
output_dir = "modelo_1"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Modelo BART foi salvo com sucesso.")

Some weights of the model checkpoint at facebook/bart-large-cnn were not used when initializing BartForCausalLM: ['model.encoder.layers.1.final_layer_norm.weight', 'model.encoder.layers.0.self_attn.out_proj.weight', 'model.encoder.layers.0.self_attn_layer_norm.weight', 'model.encoder.layers.11.self_attn.q_proj.weight', 'model.encoder.layers.2.final_layer_norm.weight', 'model.encoder.layers.8.final_layer_norm.weight', 'model.encoder.layers.10.self_attn.k_proj.bias', 'model.encoder.layers.11.fc1.bias', 'model.encoder.layers.2.self_attn.v_proj.bias', 'model.encoder.layers.7.self_attn_layer_norm.weight', 'model.encoder.layers.2.fc2.weight', 'model.encoder.layers.5.self_attn.v_proj.weight', 'model.encoder.layers.4.fc2.weight', 'model.encoder.embed_tokens.weight', 'model.encoder.layers.8.final_layer_norm.bias', 'model.encoder.layers.1.self_attn.out_proj.weight', 'model.encoder.layers.7.self_attn.k_proj.bias', 'model.encoder.layers.9.final_layer_norm.bias', 'model.encoder.layers.11.fc2.weight

Modelo BART foi salvo com sucesso.


# Palavras chaves com - Bart

In [18]:
# Carregar o modelo pré-treinado e o tokenizador
model_name = "facebook/bart-large-cnn"

In [19]:
# Tokenização dados
tokenizer = BartTokenizer.from_pretrained(model_name)

In [20]:
%%time

# Modelo
model = BartForConditionalGeneration.from_pretrained(model_name)
model

CPU times: total: 1.23 s
Wall time: 3.37 s


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [21]:
# Acessar a coluna "text" do dataset com os textos gerados
texts = dataset["train"]["text"]

In [22]:
# Função para gerar resumo com diversidade usando top-k sampling
def generate_summary_with_diversity(text, max_length=100, temperature=0.8):
    # Codificar o texto para entrada no modelo
    inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True)

    # Realizar a inferência para gerar um resumo do texto com diversidade (top-k sampling)
    with torch.no_grad():
        output = model.generate(
            inputs.input_ids,
            max_length=max_length,
            do_sample=True,
            temperature=temperature
        )

    # Decodificar o resumo gerado
    summary = tokenizer.decode(output[0], skip_special_tokens=True)

    return summary

In [23]:
# Resumir os textos gerados com diversidade
for text in texts:
    summary = generate_summary_with_diversity(text, temperature=0.8)
    print("Texto gerado:")
    print(text)
    print("Resumo:")
    print(summary)
    print("=" * 5)

Texto gerado:
Thousand off operation feel for because. Choice federal chair. Southern song spring wish study star.
Resumo:
Thousand off operation feel for because. Choice federal chair. Southern song spring wish study star. "I want to be a doctor. I want to help people. I don't want to hurt people," she said. "That's what I do. I help people."
=====
Texto gerado:
And show quality stage tell. Like upon wait car attorney eye establish.
Resumo:
And show quality stage tell. Like upon wait car attorney eye establish. LikeUpon wait car lawyer eye establish, like upon wait vehicle attorney eye established. And show qualitystage tell. like upon waiting car attorneyEye establish. and like uponWait Vehicle Attorney Eye establish. And like upon Wait Vehicle AttorneyEye establish, Like upon Wait Car AttorneyEye established.
=====
Texto gerado:
Baby down page vote. Environment begin main common each race.
Resumo:
Baby down page vote. Environment begin main common each race. Baby up page vote for ea

In [27]:
# Definir as stop words para o idioma inglês
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Download dos recursos necessários do NLTK (apenas na primeira execução)
nltk.download("punkt")
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Acessar a coluna "text" do dataset com os textos gerados
texts = dataset["train"]["text"]

# Lista para armazenar os resultados
results = []

# Função para gerar resumo com diversidade usando top-k sampling
def generate_summary_with_diversity(text, max_length=100, temperature=0.8):
    # Codificar o texto para entrada no modelo
    inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True)

    # Realizar a inferência para gerar um resumo do texto com diversidade (top-k sampling)
    with torch.no_grad():
        output = model.generate(
            inputs.input_ids,
            max_length=max_length,
            do_sample=True,
            temperature=temperature
        )

    # Decodificar o resumo gerado
    summary = tokenizer.decode(output[0], skip_special_tokens=True)

    return summary

# Resumir os textos gerados com diversidade e armazenar os resultados
for text in texts:
    summary = generate_summary_with_diversity(text, temperature=0.8)
    results.append({"Texto Gerado": text, "Resumo": summary})

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rafae\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rafae\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Salvando o resultados

In [28]:
import pandas
import pandas as pd

# Criar DataFrame com os resultados
df = pd.DataFrame(results)

# Salvar DataFrame em um arquivo CSV
df.to_csv("resultados.csv", index=False)

In [29]:
# Visualizando os tópicos
data = pd.read_csv("resultados.csv")
data.head()

Unnamed: 0,Texto Gerado,Resumo
0,Thousand off operation feel for because. Choic...,Thousand off operation feel for because. Choic...
1,And show quality stage tell. Like upon wait ca...,And show quality stage tell. Like upon wait ca...
2,Baby down page vote. Environment begin main co...,Baby down page vote. Environment begin main co...
3,Much national hear drive. Several race hope of...,Much national hear drive. Several race hope of...
4,Really across walk spend close.,Really across walk close. Really across walk s...


In [30]:
data["Resumo"].head(1)

0    Thousand off operation feel for because. Choic...
Name: Resumo, dtype: object

## Extrair palavras chaves - NLTK

In [31]:
# Carregar o dataset a partir do arquivo 'fake_dataset.txt'
df = load_dataset("text", data_files={"train": "fake_dataset.txt"})
df

Found cached dataset text (C:/Users/rafae/.cache/huggingface/datasets/text/default-8934ad665618eb34/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 35
    })
})

In [32]:
# Acessar a coluna "text" do dataset
texts = df["train"]["text"]
texts

['Thousand off operation feel for because. Choice federal chair. Southern song spring wish study star.',
 'And show quality stage tell. Like upon wait car attorney eye establish.',
 'Baby down page vote. Environment begin main common each race.',
 'Much national hear drive. Several race hope of Republican once though give.',
 'Really across walk spend close.',
 'Chair account ever another month could cup. Sense across board decision at. Land purpose state response main certain.',
 'Drop five bad exactly nature. Also lose indeed service. Summer nice record.',
 'Color woman structure visit bad often group break. Good want interview same land voice actually. Around because discussion live body attack serve.',
 'Mrs former before we door letter whom. They woman born.',
 'Question development lay front.',
 'Performance cost campaign prepare expect ago indeed. Economy PM himself because. Charge culture dinner stage those early.',
 'Network establish where popular wall provide theory. Suddenl

In [33]:
# Tokenização e remoção de stopwords
tokens = word_tokenize(text.lower())
stop_words = set(stopwords.words("english"))
filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

In [34]:
# Contagem das palavras
word_counts = Counter(filtered_tokens)

In [35]:
# Número de palavras-chave que você deseja extrair
num_keywords = 15

In [36]:
# Extrair palavras-chave
def extract_keywords(text, top_n=5):
    # Tokenização do texto
    tokens = word_tokenize(text.lower())

    # Remoção de pontuações e números
    words = [word for word in tokens if word.isalpha()]

    # Remoção de stop words
    words = [word for word in words if word not in stop_words]

    # Contagem das palavras
    word_freq = Counter(words)

    # Obtenção das palavras mais frequentes (palavras-chave)
    keywords = word_freq.most_common(top_n)

    return keywords

# Exemplo de extração de palavras-chave para o primeiro texto gerado
text = texts[10]
keywords = extract_keywords(text)
print("Texto")
print()
print(text)
print()
print("Palavras-chave")
print()
print(keywords)
print()

Texto

Performance cost campaign prepare expect ago indeed. Economy PM himself because. Charge culture dinner stage those early.

Palavras-chave

[('performance', 1), ('cost', 1), ('campaign', 1), ('prepare', 1), ('expect', 1)]



In [37]:
# Exemplo de extração de palavras-chave para o primeiro texto gerado
text = texts[10]

# Definir o número de palavras-chave desejado
num_keywords = 25 
keywords = extract_keywords(text, top_n=num_keywords)

print("Texto gerado")
print()
print(text)
print()
print("Palavras-chave extraidas")
print()

for keyword, freq in keywords:
    print(f"{keyword}: {freq}")

Texto gerado

Performance cost campaign prepare expect ago indeed. Economy PM himself because. Charge culture dinner stage those early.

Palavras-chave extraidas

performance: 1
cost: 1
campaign: 1
prepare: 1
expect: 1
ago: 1
indeed: 1
economy: 1
pm: 1
charge: 1
culture: 1
dinner: 1
stage: 1
early: 1


# Extrair palavras chaves - Texto geral

In [38]:
# Exemplo de uso
file_path = "fake_dataset.txt"  # Ou "caminho/para/o/arquivo.docx"

In [39]:
# Função para extrair texto de um arquivo .txt
def extract_text_from_txt(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    return text

In [40]:
# Ou extract_text_from_docx(file_path)
texto_do_arquivo = extract_text_from_txt(file_path)

In [41]:
num_keywords = 25  # Definir o número de palavras-chave desejado

In [42]:
keywords = extract_keywords(texto_do_arquivo, top_n=num_keywords)

In [43]:
print("Texto do arquivo")
print()
print(texto_do_arquivo)
print()
print("Palavras-chave:")
print()

for keyword, freq in keywords:
    print(f"{keyword}: {freq}")

Texto do arquivo

Thousand off operation feel for because. Choice federal chair. Southern song spring wish study star.
And show quality stage tell. Like upon wait car attorney eye establish.
Baby down page vote. Environment begin main common each race.
Much national hear drive. Several race hope of Republican once though give.
Really across walk spend close.
Chair account ever another month could cup. Sense across board decision at. Land purpose state response main certain.
Drop five bad exactly nature. Also lose indeed service. Summer nice record.
Color woman structure visit bad often group break. Good want interview same land voice actually. Around because discussion live body attack serve.
Mrs former before we door letter whom. They woman born.
Question development lay front.
Performance cost campaign prepare expect ago indeed. Economy PM himself because. Charge culture dinner stage those early.
Network establish where popular wall provide theory. Suddenly of school short according 