<a href="https://colab.research.google.com/github/AIAerospace/LLM/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP a la vieja usanza

Vamos a analizar textos sin utilizar LLMs.

Creamos un texto de ejemplo y lo cargamos en spacy

In [11]:
# Word tokenization
from spacy.lang.es import Spanish

# Cargar tokenizer, tagger, parser, NER y embedings
nlp = spacy.load("es_core_news_sm")
nlp = Spanish()

text = """María tenía un corderito blanco como la nieve. Los tipos que fuman puro tienen cara de canguro.
Nunca vi a un corderito fumar en puro. La nieve es blanca y suave."""

doc = nlp(text)



OSError: [E050] Can't find model 'es_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [6]:
# Tokens de palabra
token_list = []
for token in doc:
    token_list.append(token.text)
print(token_list)

['María', 'tenía', 'un', 'corderito', 'blanco', 'como', 'la', 'nieve', '.', 'Los', 'tipos', 'que', 'fuman', 'puro', 'tienen', 'cara', 'de', 'canguro', '.', '\n', 'Nunca', 'vi', 'a', 'un', 'corderito', 'fumar', 'en', 'puro', '.', 'La', 'nieve', 'es', 'blanca', 'y', 'suave', '.']


In [8]:
#Stop words

import spacy
spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS


print('Numero de stop words: %d' % len(spacy_stopwords))
print('priemros 10 stop words: %s' % list(spacy_stopwords)[:10])


#Lo aplicamos a nuestro texto
texto_filtrado=[]

# filtrando
for word in doc:
    if word.is_stop==False:
        texto_filtrado.append(word)
print("Resultado:",texto_filtrado)

Numero de stop words: 521
priemros 10 stop words: ['mucho', 'cuanta', 'debe', 'su', 'tres', 'otro', 'día', 'ese', 'sí', 'temprano']
Resultado: [María, corderito, blanco, nieve, ., tipos, fuman, puro, cara, canguro, ., 
, vi, corderito, fumar, puro, ., nieve, blanca, suave, .]


In [9]:
# Implementing lemmatization
lem = nlp("run runs running runner")
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

run 
runs 
running 
runner 


# Clasificación de textos

Cargamos el dataset. Contiene textos escritos por tres autores diferentes de la misma época y estilo literario:

* Edgar Alan Poe (EAP)
* H.P. Lovecraft (HPL)
* Mary Shelley (MWS)

El objetivo es entrenar un modelo que se capaz de reconocer entre estos tres el autor de un texto

In [None]:
import pandas as pd
train = pd.read_csv("Data/train.csv")
test = pd.read_csv("Data/test.csv")
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [None]:
print(train.shape)
print(test.shape)
print(train['author'].value_counts())

(19579, 3)
(8392, 2)
author
EAP    7900
MWS    6044
HPL    5635
Name: count, dtype: int64


### Tokenizar

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

corpus = [
          'María tenía un corderito blanco como la nieve',
          'Los tipos que fuman puro tienen cara de canguro',
          'Nunca vi a un corderito fumar en puro',
          'La nieve es blanca y suave',
 ]

vectorizer = TfidfVectorizer()
vectorizer2 = CountVectorizer()

# TD-IDF Matrix
X = vectorizer.fit_transform(corpus)
X2 = vectorizer2.fit_transform(corpus)

# extracting feature names
tfidf_tokens = vectorizer.get_feature_names_out()


In [None]:
X2.toarray()

array([[0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
        1, 0],
       [0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
        0, 0],
       [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        1, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0]])

In [None]:
X.toarray()

array([[0.        , 0.39264414, 0.        , 0.        , 0.39264414,
        0.30956515, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.30956515, 0.        , 0.39264414, 0.30956515,
        0.        , 0.        , 0.        , 0.        , 0.39264414,
        0.        , 0.        , 0.30956515, 0.        ],
       [0.        , 0.        , 0.34056989, 0.34056989, 0.        ,
        0.        , 0.34056989, 0.        , 0.        , 0.34056989,
        0.        , 0.        , 0.34056989, 0.        , 0.        ,
        0.        , 0.26850921, 0.34056989, 0.        , 0.        ,
        0.34056989, 0.34056989, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.32555709, 0.        , 0.41292788, 0.        , 0.        ,
        0.41292788, 0.        , 0.        , 0.        , 0.        ,
        0.41292788, 0.32555709, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.32555709, 0.41292788],
       [0.485

In [None]:
tfidf_tokens

array(['blanca', 'blanco', 'canguro', 'cara', 'como', 'corderito', 'de',
       'en', 'es', 'fuman', 'fumar', 'la', 'los', 'maría', 'nieve',
       'nunca', 'puro', 'que', 'suave', 'tenía', 'tienen', 'tipos', 'un',
       'vi'], dtype=object)