In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torchtext.legacy import data
import spacy
import random
#from sklearn.metrics import *
import torch.optim as optim
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
torch.backends.cudnn.benchmark = True

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, confusion_matrix, classification_report
import regex

Defino un campo (`Field`) para el texto (TEXT) y otro para las etiquetas a predecir (LABEL). El campo de texto tokeniza con el paquete spaCy. Luego separo los datos en entrenamiento y validación en proporción 70/30.

In [None]:
# Data loading and preparation with torchtext's legacy `data` module
# (https://torchtext.readthedocs.io/en/latest/data.html).

SEED = 40
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# NOTE(review): the import cell sets torch.backends.cudnn.benchmark = True, which may work
# against the determinism requested here — confirm which of the two is intended.

# Placeholder: replace with your own collection of stop words. It must be a collection of
# strings; a bare string (the previous `('## your STOP WORDS')`, which is not a tuple) makes
# Field build its stop-word set from the individual characters and silently drops
# single-character tokens such as "y" or "o".
STOP_WORDS = ['## your STOP WORDS']

# Text field: spaCy tokenization, lower-casing, stop-word removal; batches come out as
# (token_ids, lengths) because include_lengths=True.
TEXT = data.Field(sequential=True, lower=True, tokenize='spacy', tokenizer_language='es',
                  batch_first=True, include_lengths=True, stop_words=STOP_WORDS)
# Label field stored as float so it can be fed to a binary cross-entropy loss.
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

In [None]:
# CSV layout: the first column is ignored, followed by the text and its label.
fields = [(None, None), ('text', TEXT), ('label', LABEL)]
training_data = data.TabularDataset(path = '## your file ## ',format = 'csv',fields = fields,skip_header = True)

# 70/30 train/validation split. `split` expects `random_state` to be a state tuple such as the
# one returned by random.getstate(); random.seed() returns None, so seed explicitly first and
# then hand over the resulting state.
random.seed(SEED)
train_data, valid_data = training_data.split(split_ratio=0.7, random_state=random.getstate())
# print(vars(training_data.examples[3]))

# Vocabularies are built from the training split only.
TEXT.build_vocab(train_data, min_freq=20)  # minimum number of occurrences to keep a token
LABEL.build_vocab(train_data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 30  # mini-batches add stochasticity, which helps to escape local minima

# Batches group examples of similar length and are sorted within the batch by text length.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)

# Handy inspection snippets:
# print("Size of TEXT vocabulary:", len(TEXT.vocab))    # vocabulary size
# print("Size of LABEL vocabulary:", len(LABEL.vocab))  # number of distinct labels
# print(TEXT.vocab.freqs.most_common(10))               # most frequent tokens
# print(TEXT.vocab.stoi)                                # token -> index mapping
# print(TEXT.vocab.lookup_indices('supervielle'))
# print(TEXT.vocab.itos[0:30])
# print(LABEL.vocab.itos)

# Export the vocabulary (index -> token) to a CSV file.
corpus = pd.DataFrame(TEXT.vocab.itos[:], columns=['palabras'])
corpus.to_csv('corpusNN.csv', encoding='utf-8')



In [None]:
class classifier(nn.Module):
    """Embedding -> (stacked, optionally bidirectional) LSTM -> linear -> sigmoid classifier.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding (LSTM input size).
        hidden_dim: LSTM hidden state size per direction.
        output_dim: number of output units of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability applied between stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):
        super().__init__()

        # Embedding layer https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        # Inter-layer dropout is only meaningful with more than one layer; passing a non-zero
        # value for a single layer just triggers a warning.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)

        # Dense layer https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
        # Its input is the final hidden state of every direction, so the width depends on
        # whether the LSTM is bidirectional (previously hard-coded to hidden_dim * 2).
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        # Output activation: squashes the logits into (0, 1).
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Return sigmoid outputs of shape (batch, output_dim).

        Args:
            text: integer tensor of token ids, shape (batch, seq_len).
            text_lengths: true length of each sequence; sequences must be sorted by
                decreasing length (the default requirement of pack_padded_sequence).
        """
        embedded = self.embedding(text)

        # pack_padded_sequence requires the lengths on the CPU, even when the batch is on GPU.
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)

        _, (hidden, _) = self.lstm(packed_embedded)

        if self.lstm.bidirectional:
            # Last layer: forward direction is hidden[-2], backward direction is hidden[-1].
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: only the last layer's final state is used.
            hidden = hidden[-1, :, :]

        return self.act(self.fc(hidden))

Arquitectura de la red

In [None]:
# Network hyperparameters
size_of_vocab = len(TEXT.vocab)
# NOTE(review): a 1-dimensional embedding feeding a 10-layer LSTM looks unusual — confirm
# these values are intentional.
embedding_dim = 1  # embedding output size = LSTM input size
num_hidden_nodes = 100
num_output_nodes = 1
num_layers = 10  # number of stacked LSTM layers
bidirection = True
dropout = 0.3  # randomly switches units off during training; helps against overfitting

# Instantiate the model. `bidirection` is now actually passed through (it used to be ignored
# in favour of a hard-coded True).
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers,
                   bidirectional=bidirection, dropout=dropout)

# Architecture summary
print(model)

Cantidad de parámetros entrenables

In [None]:
# Trainable parameters
def count_parameters(model):
    """Return the total number of elements across the model's parameters that require gradients."""
    trainable_sizes = [param.numel() for param in model.parameters() if param.requires_grad]
    return sum(trainable_sizes)

# Report the number of trainable weights with thousands separators.
trainable_total = count_parameters(model)
print(f'The model has {trainable_total:,} trainable parameters')

Optimizador, función de pérdida y accuracy

In [None]:
# Optimizer and loss function.
# Move the model to its device before building the optimizer, as the torch.optim
# documentation recommends (repeating the move later is a harmless no-op).
model = model.to(device)

# Adam: a variant of stochastic gradient descent. The learning rate controls how much the
# weights change in response to the estimated error at each update.
optimizer = optim.Adam(model.parameters())
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Binary cross-entropy on probabilities (the model already applies a sigmoid).
criterion = nn.BCELoss()

# Accuracy
def binary_accuracy(preds, y):
    """Return the fraction of entries where `preds`, rounded to the nearest integer, equals `y`."""
    hits = torch.eq(torch.round(preds), y).float()
    return hits.sum() / len(hits)

# Place both the network and the loss module on the selected device.
model, criterion = model.to(device), criterion.to(device)

Función para entrenar

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return ``(mean_loss, mean_accuracy)``.

    Args:
        model: module called as ``model(text, text_lengths)``.
        iterator: sized iterable of batches exposing ``batch.text`` (a ``(text, lengths)``
            pair) and ``batch.label``.
        optimizer: optimizer whose ``zero_grad``/``step`` are called once per batch.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        Tuple with the loss and the accuracy, each averaged over the number of batches
        (every batch weighs the same regardless of its size).
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        text, text_lengths = batch.text

        # squeeze(1) drops only the output dimension. A bare squeeze() would also drop the
        # batch dimension when the batch holds a single example, and the loss would then
        # fail on a shape mismatch with batch.label.
        # Assumes the model returns (batch, 1), as `classifier` does with output_dim=1.
        predictions = model(text, text_lengths).squeeze(1)

        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

Función para evaluar (desactiva dropout y el cálculo de gradientes; no actualiza los pesos)

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate the model over `iterator` and return ``(mean_loss, mean_accuracy)``.

    The model is switched to eval mode and gradients are not tracked; no weights are updated.

    Args:
        model: module called as ``model(text, text_lengths)``.
        iterator: sized iterable of batches exposing ``batch.text`` (a ``(text, lengths)``
            pair) and ``batch.label``.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        Tuple with the loss and the accuracy, each averaged over the number of batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text

            # squeeze(1) keeps the batch dimension even for a single-example batch; a bare
            # squeeze() would produce a scalar and break the loss on a shape mismatch.
            # Assumes the model returns (batch, 1), as `classifier` does with output_dim=1.
            predictions = model(text, text_lengths).squeeze(1)

            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

Entrenamiento de la red

In [None]:
N_EPOCHS = 100
MIN_EPOCHS_BEFORE_STOP = 10  # early stopping is only considered after more than this many epochs
STOP_WINDOW = 5              # number of most recent validation losses the current one is compared to

best_valid_loss = float('inf')
iters = []
train_losses = []
val_losses = []
train_accuars = []
valid_accuars = []

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    train_losses.append(float(train_loss))  # training loss history
    train_accuars.append(float(train_acc))  # training accuracy history

    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    val_losses.append(float(valid_loss))    # validation loss history
    valid_accuars.append(float(valid_acc))  # validation accuracy history

    # Log before any early exit so the last epoch recorded in the history is also printed.
    print(f'epoch {epoch + 1}: Train Loss: {train_loss:.5f}  Valid Loss: {valid_loss:.5f} | Train Acc: {train_acc*100:.2f}%  Valid Acc: {valid_acc*100:.2f}%')

    # Keep the weights of the epoch with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # Early stopping: stop once the current validation loss is not lower than any of the
    # losses in the recent window (the window includes the current epoch itself).
    if len(val_losses) > MIN_EPOCHS_BEFORE_STOP and all(valid_loss >= loss for loss in val_losses[-STOP_WINDOW:]):
        print('Stopping early')
        break

In [None]:
# Per-epoch training history.
data_entrenamiento = pd.DataFrame({
    'train_losses': train_losses,
    'val_losses': val_losses,
    'train_accuars': train_accuars,
    'valid_accuars': valid_accuars,
})

# Metrics of the epoch whose weights were saved: the checkpoint is written at the first epoch
# reaching the lowest validation loss, so pick that row rather than the second-to-last one.
best_epoch = data_entrenamiento['val_losses'].idxmin()
data_del_modelo = data_entrenamiento.loc[[best_epoch]]
data_del_modelo

In [None]:
# Validation and training loss per epoch.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Training and Validation Loss")
ax.plot(val_losses, label="valid")
ax.plot(train_losses, label="train")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
# Validation and training accuracy per epoch.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Training and Validation Accuracy")
ax.plot(valid_accuars, label="valid")
ax.plot(train_accuars, label="train")
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.legend()
plt.show()

Predigo sobre todo el dataset. Armo una función para predecir con el modelo entrenado (pesos guardados).

In [None]:

# Restore the checkpoint written during training (the epoch with the lowest validation loss).
path = 'saved_weights.pt'
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine (and vice versa).
model.load_state_dict(torch.load(path, map_location=device))
model.eval()

# Inference: spaCy Spanish pipeline (https://spacy.io/models/es).
# `spacy` is already imported in the notebook's first cell.
nlp = spacy.load('es_core_news_sm')

def predict(model, sentence):
    """Return the model output (a float) for a single raw sentence.

    The sentence goes through ``TEXT.preprocess`` so that inference applies the same
    tokenization, lower-casing and stop-word removal that the TEXT field applied to the
    training data; tokenizing by hand skipped the lower-casing and the stop words, so any
    capitalised word was looked up as an unknown token.

    Args:
        model: trained module called as ``model(token_ids, lengths)``.
        sentence: raw text to classify.

    Returns:
        The single value produced by the model for this sentence.
    """
    tokens = TEXT.preprocess(sentence)
    indexed = [TEXT.vocab.stoi[token] for token in tokens]  # token -> vocabulary index

    # An empty sequence cannot be packed; fall back to a single unknown token so that empty
    # (or all-stop-word) inputs still yield a prediction instead of raising.
    if not indexed:
        indexed = [TEXT.vocab.stoi[TEXT.unk_token]]

    tensor = torch.LongTensor(indexed).unsqueeze(0).to(device)  # shape (1, n_tokens)
    length_tensor = torch.LongTensor([len(indexed)])            # lengths stay on the CPU

    # No gradients are needed for inference.
    with torch.no_grad():
        prediction = model(tensor, length_tensor)
    return prediction.item()

# NOTE(review): `control` is not defined in any visible cell; it is expected to be a DataFrame
# with 'review' and 'label' columns — confirm where it is loaded so the notebook survives
# Restart & Run All.
# Work on a fresh, 0..n-1 indexed copy so the positional Id is valid whatever the incoming index.
control = control[['review', 'label']].reset_index(drop=True)
control['Id'] = range(len(control))

# Predicted probability for every review, in row order.
clasf_report = [predict(model, t) for t in control['review']]
control['Predicc'] = clasf_report

# Hard class obtained by rounding the probability.
control['y_pred'] = control['Predicc'].round(0)

# NOTE(review): LABEL.vocab assigns indices to the label strings itself; presumably label 1
# maps to index 1 so the model output is P(label == 1) — verify with LABEL.vocab.stoi.
y_true = control['label'].to_numpy()
y_pred = control['y_pred'].to_numpy()
y_score = control['Predicc'].to_numpy()

# The AUC must be computed from the scores, not from the rounded classes, so that it matches
# the ROC curve plotted below.
roc = roc_auc_score(y_true, y_score)
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true,y_pred))
print('ROC del modelo:', roc)

fpr, tpr, thresholds = roc_curve(y_true, y_score)
plt.figure(figsize=(5,5))
plt.plot(fpr, tpr, label='Roc (area = %0.2f)' % roc)
plt.plot([0, 1], [0, 1],'r--')  # chance line
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.savefig('Roc Red NN')
plt.show()

In [None]:

# Rebuild plain-text frames from the tokenized examples of each split: tokens are joined
# with '. ' and every character that is neither a word character nor whitespace is removed.
save_text_train = [example.text for example in train_data.examples]
save_label_train = [example.label for example in train_data.examples]

save_train = pd.DataFrame({'text': save_text_train})
joined_train = save_train['text'].apply('. '.join)
save_train['text'] = joined_train
save_train["text"] = joined_train.str.replace(r'[^\w\s]', "", regex=True)
save_train['label'] = save_label_train

save_text_valid = [example.text for example in valid_data.examples]
save_label_valid = [example.label for example in valid_data.examples]

save_valid = pd.DataFrame({'text': save_text_valid})
joined_valid = save_valid['text'].apply('. '.join)
save_valid['text'] = joined_valid
save_valid["text"] = joined_valid.str.replace(r'[^\w\s]', "", regex=True)
save_valid['label'] = save_label_valid

In [None]:
# Predicted probability for every validation text, in row order.
clasf_report = [predict(model, t) for t in save_valid['text']]

# Attach the predictions positionally on a fresh copy. This replaces the in-place
# reset_index + merge, which made the cell fail when executed a second time.
save_valid = save_valid[['text', 'label']].reset_index(drop=True)
save_valid['Predicc'] = clasf_report
save_valid['y_pred'] = save_valid['Predicc'].round(0)
save_valid['label'] = save_valid['label'].astype('int')

y_true = save_valid['label'].to_numpy()
y_pred = save_valid['y_pred'].to_numpy()
y_score = save_valid['Predicc'].to_numpy()

# The AUC must be computed from the scores, not from the rounded classes, so that it matches
# the ROC curve plotted below.
roc = roc_auc_score(y_true, y_score)
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true,y_pred))
print('ROC del modelo:', roc)

fpr, tpr, thresholds = roc_curve(y_true, y_score)
plt.figure(figsize=(5,5))
plt.plot(fpr, tpr, label='Roc (area = %0.2f)' % roc)
plt.plot([0, 1], [0, 1],'r--')  # chance line
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
# NOTE(review): every evaluation cell saves to this same file name, so each run overwrites
# the previous figure — confirm whether separate files are wanted.
plt.savefig('Roc Red NN')
plt.show()

In [None]:
# Class balance of the true labels in the validation split.
save_valid['label'].value_counts()

In [None]:
# Distribution of the predicted classes on the validation split.
save_valid['y_pred'].value_counts()

In [None]:
# Predicted probability for every training text, in row order.
clasf_report = [predict(model, t) for t in save_train['text']]

# Attach the predictions positionally on a fresh copy. This replaces the in-place
# reset_index + merge, which made the cell fail when executed a second time.
save_train = save_train[['text', 'label']].reset_index(drop=True)
save_train['Predicc'] = clasf_report
save_train['y_pred'] = save_train['Predicc'].round(0)
save_train['label'] = save_train['label'].astype('int')

y_true = save_train['label'].to_numpy()
y_pred = save_train['y_pred'].to_numpy()
y_score = save_train['Predicc'].to_numpy()

# The AUC must be computed from the scores, not from the rounded classes, so that it matches
# the ROC curve plotted below.
roc = roc_auc_score(y_true, y_score)
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true,y_pred))
print('ROC del modelo:', roc)

fpr, tpr, thresholds = roc_curve(y_true, y_score)
plt.figure(figsize=(5,5))
plt.plot(fpr, tpr, label='Roc (area = %0.2f)' % roc)
plt.plot([0, 1], [0, 1],'r--')  # chance line
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
# NOTE(review): every evaluation cell saves to this same file name, so each run overwrites
# the previous figure — confirm whether separate files are wanted.
plt.savefig('Roc Red NN')
plt.show()


In [None]:
# Class balance of the true labels in the training split.
save_train['label'].value_counts()

In [None]:
# Distribution of the predicted classes on the training split.
save_train['y_pred'].value_counts()