In [26]:
import json

# Ruta del archivo corpus.jsonl
ruta_corpus = r"corpus.jsonl"

# Leer los documentos
with open(ruta_corpus, 'r', encoding='utf-8') as archivo:
    for i, linea in enumerate(archivo):
        documento = json.loads(linea)
        doc_id = documento.get("_id", "")
        doc_text = documento.get("text", "")
        
        print(f"Documento {i+1}")
        print(f"ID: {doc_id}")
        print(f"Texto: {doc_text[:200]}...")  # Mostrar primeros 200 caracteres
        print("-" * 50)

        if i == 2:  # Muestra solo los 3 primeros documentos
            break


Documento 1
ID: 11542
Texto: What's your Supreme Commander 2 build order. I don't just want "6 mass extractors, 2 power and a factory". List of building and units out to the second or third factory, please....
--------------------------------------------------
Documento 2
ID: 89376
Texto: You can view how many bone shards, paintings and runes you've found when you are done with your mission. Can I view how much of the items I've already found while doing the mission?...
--------------------------------------------------
Documento 3
ID: 11545
Texto: Does that mean that if no one screws up and everybody gets his daily ration, our water sources will be depleted in 2.85 more days? (The city started existing less than a day ago at 23:36.) Even if we ...
--------------------------------------------------


In [27]:

#imprime el número de documentos totales del corpus
with open(ruta_corpus, 'r', encoding='utf-8') as archivo:
    total_documentos = sum(1 for _ in archivo)
print(f"Número total de documentos en el corpus: {total_documentos}")

Número total de documentos en el corpus: 45301


In [28]:
#imprimir documentos en un dataframe
import pandas as pd
# Leer el corpus y convertirlo en un DataFrame
pd.read_json(ruta_corpus, lines=True)


Unnamed: 0,_id,title,text,metadata
0,11542,Supreme Commander 2 - Build Orders,What's your Supreme Commander 2 build order. I...,{'tags': ['supreme-commander-2']}
1,89376,Can I see how many mission items I've found du...,"You can view how many bone shards, paintings a...",{'tags': ['dishonored']}
2,11545,Our city has 40 citizens. Our well has 114 wat...,Does that mean that if no one screws up and ev...,{'tags': ['die2nite']}
3,89379,How do I use rags to stop bleeding?,I have some clean rags that I found in a first...,{'tags': ['neo-scavenger']}
4,11549,Are there any gameplay relevant benefits of pr...,I know of the extra class slots and the variou...,{'tags': ['call-of-duty-black-ops']}
...,...,...,...,...
45296,88975,80 tabards in 80 dungeons at level 85/90,I know you can't go to an 85 or 90 dungeon and...,{'tags': ['world-of-warcraft']}
45297,88974,What is a research credit?,Interrogating a sectoid resulted in my earning...,{'tags': ['xcom-enemy-unknown']}
45298,38341,What is the maximum level for assassins?,"In Assassin's Creed: Revelations, what is the ...",{'tags': ['ac-revelations']}
45299,103093,"When you prestige, do you always start at leve...",A friend of mine didn't realise you had to do ...,{'tags': ['call-of-duty-black-ops-2']}


In [29]:
#tokenizar los documentos
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
# Tokenizar todos los documentos y guardarlos en una lista
tokenized_documents = []
with open(ruta_corpus, 'r', encoding='utf-8') as archivo:
    for linea in archivo:
        documento = json.loads(linea)
        doc_text = documento.get("text", "")
        tokens = word_tokenize(doc_text)
        tokenized_documents.append(tokens)
# Imprimir los primeros 3 documentos tokenizados
for i, tokens in enumerate(tokenized_documents[:3]):
    print(f"Documento {i+1} tokenizado:")
    print(tokens)
    print("-" * 50)
    #mostrar numero de documentos tokenizados
print(f"Número total de documentos tokenizados: {len(tokenized_documents)}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ELI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Documento 1 tokenizado:
['What', "'s", 'your', 'Supreme', 'Commander', '2', 'build', 'order', '.', 'I', 'do', "n't", 'just', 'want', '``', '6', 'mass', 'extractors', ',', '2', 'power', 'and', 'a', 'factory', "''", '.', 'List', 'of', 'building', 'and', 'units', 'out', 'to', 'the', 'second', 'or', 'third', 'factory', ',', 'please', '.']
--------------------------------------------------
Documento 2 tokenizado:
['You', 'can', 'view', 'how', 'many', 'bone', 'shards', ',', 'paintings', 'and', 'runes', 'you', "'ve", 'found', 'when', 'you', 'are', 'done', 'with', 'your', 'mission', '.', 'Can', 'I', 'view', 'how', 'much', 'of', 'the', 'items', 'I', "'ve", 'already', 'found', 'while', 'doing', 'the', 'mission', '?']
--------------------------------------------------
Documento 3 tokenizado:
['Does', 'that', 'mean', 'that', 'if', 'no', 'one', 'screws', 'up', 'and', 'everybody', 'gets', 'his', 'daily', 'ration', ',', 'our', 'water', 'sources', 'will', 'be', 'depleted', 'in', '2.85', 'more', 'days'

In [30]:
#imprimir los tokens del primer documento en array
print("Tokens del primer documento:")
print(tokenized_documents[0])

Tokens del primer documento:
['What', "'s", 'your', 'Supreme', 'Commander', '2', 'build', 'order', '.', 'I', 'do', "n't", 'just', 'want', '``', '6', 'mass', 'extractors', ',', '2', 'power', 'and', 'a', 'factory', "''", '.', 'List', 'of', 'building', 'and', 'units', 'out', 'to', 'the', 'second', 'or', 'third', 'factory', ',', 'please', '.']


In [31]:
#convertir los tokenizados a lower case
tokenized_documents_lower = [[token.lower() for token in tokens] for tokens in tokenized_documents]
# Imprimir el primer documento tokenizado en minúsculas
print("Tokens del primer documento en minúsculas:")
print(tokenized_documents_lower[0])

Tokens del primer documento en minúsculas:
['what', "'s", 'your', 'supreme', 'commander', '2', 'build', 'order', '.', 'i', 'do', "n't", 'just', 'want', '``', '6', 'mass', 'extractors', ',', '2', 'power', 'and', 'a', 'factory', "''", '.', 'list', 'of', 'building', 'and', 'units', 'out', 'to', 'the', 'second', 'or', 'third', 'factory', ',', 'please', '.']


In [32]:
from nltk import regexp_tokenize

texto = " ".join(tokenized_documents_lower[0])
# Aplicar la expresión regular para volver a tokenizar
regex_tokens = regexp_tokenize(texto, pattern=r'\w[a-z]+')

print(regex_tokens)


['what', 'your', 'supreme', 'commander', 'build', 'order', 'do', 'just', 'want', 'mass', 'extractors', 'power', 'and', 'factory', 'list', 'of', 'building', 'and', 'units', 'out', 'to', 'the', 'second', 'or', 'third', 'factory', 'please']


In [33]:
for doc_text in tokenized_documents_lower:
    print(doc_text[:2])  # Imprimir los primeros 10 tokens de cada documento

['what', "'s"]
['you', 'can']
['does', 'that']
['i', 'have']
['i', 'know']
['i', "'ve"]
['at', 'the']
['i', 'have']
['i', "'ve"]
['what', 'are']
['we', "'re"]
['in', '_the']
['i', 'downloaded']
['i', 'like']
['in', '_mario']
['i', 'recently']
['in', 'pixel']
['in', 'fantasica']
['i', "'m"]
['in', 'minecraft']
['why', 'did']
['why', 'has']
['i', "'ve"]
['on', 'ftl']
['yesterday', 'i']
['so', 'yesterday']
['i', 'noticed']
['i', 'keep']
['auctioning', 'in']
['i', 'tried']
['when', 'you']
['i', "'d"]
['i', 'sold']
['i', "'m"]
['>', '*']
['>', '*']
['i', "'ve"]
['>', '*']
['>', '*']
['is', 'it']
['i', 'had']
['i', 'have']
['i', 'just']
['i', "'m"]
['>', '*']
['i', "'m"]
['>', '*']
['>', '*']
['>', '*']
['i', 'placed']
['i', 'have']
['i', 'owned']
['i', 'tried']
['most', 'player-created']
['a', 'dragon']
['i', 'was']
['it', "'s"]
['i', 'would']
['so', 'the']
['version', '0.18']
['around', '0:40']
['according', 'to']
['the', 'rofup1']
['i', "'ve"]
['i', "'m"]
['another', 'question']
['assault