In [15]:
import json

# Path to the corpus.jsonl file
ruta_corpus = r"corpus.jsonl"

# Read the documents: preview only the first three lines of the file
with open(ruta_corpus, 'r', encoding='utf-8') as archivo:
    for i in range(3):
        linea = archivo.readline()
        if not linea:  # readline() returns "" once the end of the file is reached
            break

        # Each line is parsed as one JSON document; missing keys fall back to ""
        documento = json.loads(linea)
        doc_id = documento.get("_id", "")
        doc_text = documento.get("text", "")

        print(f"Documento {i+1}")
        print(f"ID: {doc_id}")
        print(f"Texto: {doc_text[:200]}...")  # Show the first 200 characters
        print("-" * 50)


Documento 1
ID: 11542
Texto: What's your Supreme Commander 2 build order. I don't just want "6 mass extractors, 2 power and a factory". List of building and units out to the second or third factory, please....
--------------------------------------------------
Documento 2
ID: 89376
Texto: You can view how many bone shards, paintings and runes you've found when you are done with your mission. Can I view how much of the items I've already found while doing the mission?...
--------------------------------------------------
Documento 3
ID: 11545
Texto: Does that mean that if no one screws up and everybody gets his daily ration, our water sources will be depleted in 2.85 more days? (The city started existing less than a day ago at 23:36.) Even if we ...
--------------------------------------------------


In [16]:

# Print the total number of documents in the corpus (one document per line of the file)
total_documentos = 0
with open(ruta_corpus, 'r', encoding='utf-8') as archivo:
    for _ in archivo:
        total_documentos += 1
print(f"Número total de documentos en el corpus: {total_documentos}")

Número total de documentos en el corpus: 45301


In [17]:
# Print the documents as a DataFrame
import pandas as pd
# Read the corpus and convert it into a DataFrame;
# lines=True makes read_json parse the file as one JSON object per line.
corpusdf= pd.read_json(ruta_corpus, lines=True)
print(corpusdf) 

          _id                                              title  \
0       11542                 Supreme Commander 2 - Build Orders   
1       89376  Can I see how many mission items I've found du...   
2       11545  Our city has 40 citizens. Our well has 114 wat...   
3       89379                How do I use rags to stop bleeding?   
4       11549  Are there any gameplay relevant benefits of pr...   
...       ...                                                ...   
45296   88975           80 tabards in 80 dungeons at level 85/90   
45297   88974                         What is a research credit?   
45298   38341           What is the maximum level for assassins?   
45299  103093  When you prestige, do you always start at leve...   
45300   38346               How do I make my companions friends?   

                                                    text  \
0      What's your Supreme Commander 2 build order. I...   
1      You can view how many bone shards, paintings a...   
2  

In [18]:
# Tokenize the corpus
from nltk.tokenize import word_tokenize
# Make sure the NLTK 'punkt' package has been downloaded
import nltk
nltk.download('punkt')
# Tokenize the text of a single document
def tokenizar_documento(texto):
    """Return the result of NLTK's ``word_tokenize`` applied to ``texto``."""
    return word_tokenize(texto)
# Tokenize the "text" field of every line in the corpus file
# (a document without a "text" key is tokenized as the empty string)
with open(ruta_corpus, 'r', encoding='utf-8') as archivo:
    tokens = [
        tokenizar_documento(json.loads(linea).get("text", ""))
        for linea in archivo
    ]
# Add the per-document tokens to corpusdf as a new column and show it next to the text.
# The list is in file order; this presumably matches the row order of corpusdf,
# which was read from the same file - verify if the file may have changed in between.
corpusdf['tokens'] = tokens
print(corpusdf[['text', 'tokens']])



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ELI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                    text  \
0      What's your Supreme Commander 2 build order. I...   
1      You can view how many bone shards, paintings a...   
2      Does that mean that if no one screws up and ev...   
3      I have some clean rags that I found in a first...   
4      I know of the extra class slots and the variou...   
...                                                  ...   
45296  I know you can't go to an 85 or 90 dungeon and...   
45297  Interrogating a sectoid resulted in my earning...   
45298  In Assassin's Creed: Revelations, what is the ...   
45299  A friend of mine didn't realise you had to do ...   
45300  _But you can't have more than one companion._ ...   

                                                  tokens  
0      [What, 's, your, Supreme, Commander, 2, build,...  
1      [You, can, view, how, many, bone, shards, ,, p...  
2      [Does, that, mean, that, if, no, one, screws, ...  
3      [I, have, some, clean, rags, that, I

In [19]:
# Tokenize with NLTK's regexp_tokenize
from nltk.tokenize import regexp_tokenize
# Lower-case the text first, then keep every match of the pattern:
# one word character followed by one or more letters a-z.
# NOTE(review): this pattern never yields single-character tokens and lets a token
# start with a digit or underscore - confirm that this is intended.
texto_minusculas = corpusdf['text'].str.lower()
corpusdf['regex_tokens'] = texto_minusculas.apply(regexp_tokenize, pattern=r'\w[a-z]+')
print(corpusdf)

          _id                                              title  \
0       11542                 Supreme Commander 2 - Build Orders   
1       89376  Can I see how many mission items I've found du...   
2       11545  Our city has 40 citizens. Our well has 114 wat...   
3       89379                How do I use rags to stop bleeding?   
4       11549  Are there any gameplay relevant benefits of pr...   
...       ...                                                ...   
45296   88975           80 tabards in 80 dungeons at level 85/90   
45297   88974                         What is a research credit?   
45298   38341           What is the maximum level for assassins?   
45299  103093  When you prestige, do you always start at leve...   
45300   38346               How do I make my companions friends?   

                                                    text  \
0      What's your Supreme Commander 2 build order. I...   
1      You can view how many bone shards, paintings a...   
2  

In [20]:
from nltk.corpus import stopwords
nltk.download('stopwords')
def remove_stopwords(tokens, stop_words=None):
    """Return a new list with every stop word filtered out of ``tokens``.

    Unlike ``list.remove``, which deletes only the first matching item, this
    drops *all* occurrences of each stop word. The input list is left
    untouched, so the column the tokens came from is not modified as a side
    effect.

    Args:
        tokens: Iterable of token strings.
        stop_words: Optional iterable of words to drop. When omitted, NLTK's
            English stop-word list is used. Passing a prebuilt collection
            avoids reloading that list on every call.

    Returns:
        A new list holding the tokens that are not stop words, in their
        original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership checks while filtering.
    excluded = set(stop_words)
    return [token for token in tokens if token not in excluded]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ELI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Run remove_stopwords on each document's regex tokens and store the result in 'sw_tokens'
corpusdf['sw_tokens'] = corpusdf['regex_tokens'].apply(remove_stopwords)

In [None]:
# Print the DataFrame to inspect its current columns
print(corpusdf)

In [None]:
# WordNetLemmatizer is imported under the short alias `wnl`
from nltk.stem import WordNetLemmatizer as wnl
# Download the NLTK 'wordnet' package
nltk.download('wordnet')


In [None]:
def lemmatized(tokens, lemmatizer=None):
    """Return the lemma of every token in ``tokens``.

    Args:
        tokens: Iterable of word strings.
        lemmatizer: Optional object exposing ``lemmatize(word)``. When
            omitted, a single ``WordNetLemmatizer`` (imported in this
            notebook as ``wnl``) is created for the call.

    Returns:
        A new list with one lemma per input token, in the same order.
    """
    if lemmatizer is None:
        # Build the lemmatizer once per call rather than once per token.
        lemmatizer = wnl()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [None]:
# Run lemmatized on each document's 'sw_tokens' list and store the result in 'lem_tokens'
corpusdf['lem_tokens'] = corpusdf['sw_tokens'].apply(lemmatized)

In [None]:
# Print the DataFrame to inspect its current columns
print(corpusdf)

In [None]:
# Join each document's 'lem_tokens' list into one space-separated string
corpusdf['preprocesado'] = corpusdf['lem_tokens'].str.join(' ')

In [None]:
# Print the original text next to the 'preprocesado' column for comparison
print(corpusdf[['text', 'preprocesado']])