<a href="https://colab.research.google.com/github/AIAerospace/LLM/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP a la vieja usanza

Vamos a analizar textos sin utilizar LLMs.

# 1 - Analisis de textos

Creamos un texto de ejemplo y lo cargamos en spacy

In [1]:
!python -m spacy download es_core_news_sm
import spacy
import es_core_news_sm
sp = es_core_news_sm.load()

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
texto = """María tenía un corderito blanco como la nieve. Los tipos que fuman puro tienen cara de canguro.
Nunca vi a un corderito fumar en puro. La nieve es blanca y suave."""

doc = sp(texto)



In [4]:
# Tokens de palabra
token_list = []
for token in doc:
    token_list.append(token.text)
print(token_list)


# Tokens de frase
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

['María', 'tenía', 'un', 'corderito', 'blanco', 'como', 'la', 'nieve', '.', 'Los', 'tipos', 'que', 'fuman', 'puro', 'tienen', 'cara', 'de', 'canguro', '.', '\n', 'Nunca', 'vi', 'a', 'un', 'corderito', 'fumar', 'en', 'puro', '.', 'La', 'nieve', 'es', 'blanca', 'y', 'suave', '.']
['María tenía un corderito blanco como la nieve.', 'Los tipos que fuman puro tienen cara de canguro.\n', 'Nunca vi a un corderito fumar en puro.', 'La nieve es blanca y suave.']


In [5]:
#Stop words

import spacy
spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS


print('Numero de stop words: %d' % len(spacy_stopwords))
print('Primeros 10 stop words: %s' % list(spacy_stopwords)[:10])


#Lo aplicamos a nuestro texto
texto_filtrado=[]

# filtrando
for word in doc:
    if word.is_stop==False:
        texto_filtrado.append(word)
print("Resultado:",texto_filtrado)

Numero de stop words: 521
Primeros 10 stop words: ['nuestros', 'asi', 'ahí', 'grandes', 'nuestra', 'e', 'eso', 'saber', 'encima', 'tal']
Resultado: [María, corderito, blanco, nieve, ., tipos, fuman, puro, cara, canguro, ., 
, vi, corderito, fumar, puro, ., nieve, blanca, suave, .]


In [6]:
total_lemmas=[]
for word in doc:
        print(word.text + '  ===>', word.lemma_)
        lemmas=word.lemma_
        total_lemmas.append(lemmas)

María  ===> María
tenía  ===> tener
un  ===> uno
corderito  ===> corderito
blanco  ===> blanco
como  ===> como
la  ===> el
nieve  ===> nieve
.  ===> .
Los  ===> el
tipos  ===> tipo
que  ===> que
fuman  ===> fumar
puro  ===> puro
tienen  ===> tener
cara  ===> cara
de  ===> de
canguro  ===> canguro
.  ===> .

  ===> 

Nunca  ===> nunca
vi  ===> ver
a  ===> a
un  ===> uno
corderito  ===> corderito
fumar  ===> fumar
en  ===> en
puro  ===> puro
.  ===> .
La  ===> el
nieve  ===> nieve
es  ===> ser
blanca  ===> blanco
y  ===> y
suave  ===> suave
.  ===> .


In [7]:
for word in doc:
    print(word.text,word.pos_)

María PROPN
tenía VERB
un DET
corderito NOUN
blanco ADJ
como SCONJ
la DET
nieve NOUN
. PUNCT
Los DET
tipos NOUN
que PRON
fuman VERB
puro ADJ
tienen VERB
cara NOUN
de ADP
canguro NOUN
. PUNCT

 SPACE
Nunca ADV
vi VERB
a ADP
un DET
corderito NOUN
fumar VERB
en ADP
puro NOUN
. PUNCT
La DET
nieve NOUN
es AUX
blanca ADJ
y CCONJ
suave ADJ
. PUNCT


In [8]:
from spacy import displacy
import en_core_web_sm
nlp = en_core_web_sm.load()

nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities


[(New York City, 'GPE', 384),
 (Tuesday, 'DATE', 391),
 (At least 285, 'CARDINAL', 397),
 (September, 'DATE', 391),
 (Brooklyn, 'GPE', 384),
 (Williamsburg, 'GPE', 384),
 (four, 'CARDINAL', 397),
 (Zip, 'PERSON', 380),
 (Bill de Blasio, 'PERSON', 380),
 (Tuesday, 'DATE', 391),
 (Orthodox, 'NORP', 381),
 (Jews, 'NORP', 381),
 (as young as 6 months old, 'DATE', 391),
 (up to $1,000, 'MONEY', 394)]

In [9]:
displacy.render(nytimes, style = "ent",jupyter = True)

In [10]:

nlp = en_core_web_sm.load()



docp = nlp ("La casa de mi madre es vieja y enorme")

for chunk in docp.noun_chunks:
   print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

displacy.render(docp, style="dep", jupyter= True)

La casa casa ROOT casa
de mi mi appos casa
madre es es appos casa


#2 - Clasificación de textos

Cargamos el dataset. Contiene textos escritos por tres autores diferentes de la misma época y estilo literario:

* Edgar Alan Poe (EAP)
* H.P. Lovecraft (HPL)
* Mary Shelley (MWS)

El objetivo es entrenar un modelo que se capaz de reconocer entre estos tres el autor de un texto

In [10]:
import pandas as pd
train = pd.read_csv("train.csv")

train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [12]:
print(train.shape)

print(train['author'].value_counts())

(19579, 3)
author
EAP    7900
MWS    6044
HPL    5635
Name: count, dtype: int64


### Calcular los embeddings

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

corpus = [
          'María tenía un corderito blanco como la nieve',
          'Los tipos que fuman puro tienen cara de canguro',
          'Nunca vi a un corderito fumar en puro',
          'La nieve es blanca y suave',
 ]

vectorizer = TfidfVectorizer()
vectorizer2 = CountVectorizer()

# TD-IDF Matrix
X = vectorizer.fit_transform(corpus)
X2 = vectorizer2.fit_transform(corpus)

# extracting feature names
count_tokens = vectorizer.get_feature_names_out()
tfidf_tokens = vectorizer2.get_feature_names_out()

print(count_tokens)
print(tfidf_tokens)


['blanca' 'blanco' 'canguro' 'cara' 'como' 'corderito' 'de' 'en' 'es'
 'fuman' 'fumar' 'la' 'los' 'maría' 'nieve' 'nunca' 'puro' 'que' 'suave'
 'tenía' 'tienen' 'tipos' 'un' 'vi']
['blanca' 'blanco' 'canguro' 'cara' 'como' 'corderito' 'de' 'en' 'es'
 'fuman' 'fumar' 'la' 'los' 'maría' 'nieve' 'nunca' 'puro' 'que' 'suave'
 'tenía' 'tienen' 'tipos' 'un' 'vi']


In [14]:
X2.toarray()

array([[0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
        1, 0],
       [0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
        0, 0],
       [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        1, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0]])

In [15]:
X.toarray()

array([[0.        , 0.39264414, 0.        , 0.        , 0.39264414,
        0.30956515, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.30956515, 0.        , 0.39264414, 0.30956515,
        0.        , 0.        , 0.        , 0.        , 0.39264414,
        0.        , 0.        , 0.30956515, 0.        ],
       [0.        , 0.        , 0.34056989, 0.34056989, 0.        ,
        0.        , 0.34056989, 0.        , 0.        , 0.34056989,
        0.        , 0.        , 0.34056989, 0.        , 0.        ,
        0.        , 0.26850921, 0.34056989, 0.        , 0.        ,
        0.34056989, 0.34056989, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.32555709, 0.        , 0.41292788, 0.        , 0.        ,
        0.41292788, 0.        , 0.        , 0.        , 0.        ,
        0.41292788, 0.32555709, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.32555709, 0.41292788],
       [0.485

In [16]:
tfidf_tokens

array(['blanca', 'blanco', 'canguro', 'cara', 'como', 'corderito', 'de',
       'en', 'es', 'fuman', 'fumar', 'la', 'los', 'maría', 'nieve',
       'nunca', 'puro', 'que', 'suave', 'tenía', 'tienen', 'tipos', 'un',
       'vi'], dtype=object)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train['text'])
Y=train['author']
# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

# Crear y entrenar el modelo de regresión logística
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Predecir en el conjunto de prueba
y_pred = classifier.predict(X_test)

# Evaluar el modelo
accuracy = accuracy_score(y_test, y_pred)
print(f"Precisión del modelo: {accuracy * 100:.2f}%")

Precisión del modelo: 81.12%


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

vectorizer2 = TfidfVectorizer()
X = vectorizer2.fit_transform(train['text'])
Y=train['author']
# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

# Crear y entrenar el modelo de regresión logística
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Predecir en el conjunto de prueba
y_pred = classifier.predict(X_test)

# Evaluar el modelo
accuracy = accuracy_score(y_test, y_pred)
print(f"Precisión del modelo: {accuracy * 100:.2f}%")

Precisión del modelo: 80.76%


In [28]:
!pip install transformers torch datasets scikit-learn


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder

# Assuming you already have a DataFrame named `df`
# df = pd.read_csv('your_file.csv') # if loading from a CSV

# Encode labels ('author' column) to integers
label_encoder = LabelEncoder()
train['label'] = label_encoder.fit_transform(train['author'])

# Convert Pandas DataFrame to HuggingFace Dataset
dataset = Dataset.from_pandas(train[['text', 'label']])


In [18]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import classification_report

# Load dataset
#dataset = load_dataset("ag_news", split='train[:2000]')

dataset = dataset.train_test_split(test_size=0.2)

# Initialize tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize dataset
def preprocess(example):
    return tokenizer(example['text'], truncation=True, padding='max_length', max_length=128)

tokenized_data = dataset.map(preprocess, batched=True)

# Set format for PyTorch
tokenized_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Load model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_steps=10,
    save_steps=10,
    logging_dir='./logs',
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
)

# Train model
trainer.train()

# Evaluate
predictions = trainer.predict(tokenized_data['test'])
pred_labels = np.argmax(predictions.predictions, axis=1)

print(classification_report(tokenized_data['test']['label'], pred_labels))


Map:   0%|          | 0/15663 [00:00<?, ? examples/s]

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4657,0.433972
2,0.2225,0.40697


              precision    recall  f1-score   support

           0       0.88      0.85      0.87      1598
           1       0.90      0.88      0.89      1114
           2       0.83      0.89      0.86      1204

    accuracy                           0.87      3916
   macro avg       0.87      0.87      0.87      3916
weighted avg       0.87      0.87      0.87      3916

