##**Análisis de sentimientos - texto**##

###Instrucciones:

El notebook se puede ejecutar linealmente con el archivo train.csv, el cual se encuentra en la carpeta del drive llamada DATASET o se puede consultar en el siguiente link de la competencia de Kaggle: https://www.kaggle.com/c/petfinder-adoption-prediction/data

##**Importe de librerías**##

In [None]:
#Importando las librerías
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence as t2ws
from keras.preprocessing.text import one_hot
from __future__ import print_function
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

##**Lectura de los archivos**##

In [None]:
#Importando el drive al colab
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#Importando el dataset de la carpeta del drive
!ls '/content/gdrive/My Drive/MONOGRAFIA/DATASET'

 breed_labels.csv		 petfinder-adoption-prediction.zip
 BreedLabels.csv		 PetFinder-BreedLabels.csv
'Clasificación imagenes.ipynb'	 PetFinder-ColorLabels.csv
 color_labels.csv		 PetFinder-StateLabels.csv
 ColorLabels.csv		 state_labels.csv
'Copia de BreedLabels.csv'	 StateLabels.csv
'Copia de ColorLabels.csv'	 test
'Copia de state_labels.csv'	 test_sentiment
'Copia de StateLabels.csv'	 train
 fc9cf8b8d-1.jpg		 train_images
 ImagenesMuestra		 train_metadata
 Imagenes_Org			 train_sentiment


In [None]:
#Lectura del archivo de datos para el entrenamiento
train = pd.read_csv('/content/gdrive/My Drive/MONOGRAFIA/DATASET/train/train.csv') 

In [None]:
#Tipos de datos de las variables del dataset de entrenamiento
train.dtypes

Type               int64
Name              object
Age                int64
Breed1             int64
Breed2             int64
Gender             int64
Color1             int64
Color2             int64
Color3             int64
MaturitySize       int64
FurLength          int64
Vaccinated         int64
Dewormed           int64
Sterilized         int64
Health             int64
Quantity           int64
Fee                int64
State              int64
RescuerID         object
VideoAmt           int64
Description       object
PetID             object
PhotoAmt         float64
AdoptionSpeed      int64
dtype: object

In [None]:
#Dimensión de la variable descripción (Description)
Description_pet = train[['Description']]
Description_pet.shape

(14993, 1)

In [None]:
#Lectura de la variable objetivo
y_pets = train[['AdoptionSpeed']]
y_pets.head(80)

Unnamed: 0,AdoptionSpeed
0,2
1,0
2,3
3,2
4,2
...,...
75,0
76,4
77,3
78,1


##**Preprocesamiento de los textos**##

In [None]:
#Partiendo los datos en entrenamiento y test
x_train, x_test = train_test_split(Description_pet, test_size = 0.2)
y_train, y_test = train_test_split(y_pets, test_size = 0.2)

In [None]:
#Lectura de la fila 2000 de x_train
x_train.iloc[2000,0]

'He was dumped by his irresponsible owner when just few days after birth. Healthy and happy puppy. If you are willing to give him a home. Pls call me.'

In [None]:
#Tokenizando la variable de entrenamiento y prueba
t1 = Tokenizer()
t2 = Tokenizer()

#From dataframe to list
x_train_list = []
for i in range(x_train.shape[0]):
   x_train_list.append(x_train.iloc[i,0])
  
t1.fit_on_texts(str(x_train_list))

x_test_list = []
for j in range(x_test.shape[0]):
  x_test_list.append(x_test.iloc[j,0])

t2.fit_on_texts(str(x_test_list))

In [None]:
#Lectura de la variable x_train tokenizada
x_train_list

['Friendly',
 "The stray has wondered into our home and we have been taking care of it since. Am looking for a new home for Fei Chai as we can't afford to have two dogs due to space limitations.",
 'One month old kittens to be given away! They are very cute, playful and active. These 4 kittens were found nearby my house area. Can eat solid food and toilet trained. Kindly contact me ASAP via SMS or whatsapp for who are interested. I will contact you as soon as I can.',
 "Please call Marie at if you can help. Ideally they should be adopted together. ******************* URGENT: Foster carer or adopter needed for 2 mixed breed dogs - one male one female. They were rescued by Marie, have been spayed and vaccinated. Unfortunately, Marie lives in a condo and yesterday, while her maid was walking the dogs, one of them got loose and approached a resident who happened to be a Malay lady. She was not bitten and the dog didn't even touch her. But she lodged a report to MPAJ and MPAJ has given Mari

In [None]:
#Resumen de lo aprendido
print(t1.word_counts)
print(t1.document_count)
print(t1.word_index)
print(t1.word_docs)

OrderedDict([("'", 25174), ('f', 67316), ('r', 177108), ('i', 206164), ('e', 393967), ('n', 210737), ('d', 134090), ('l', 146750), ('y', 81722), ('t', 257236), ('h', 151001), ('s', 180079), ('a', 279319), ('w', 60202), ('o', 255289), ('u', 90947), ('m', 90535), ('v', 41780), ('b', 53609), ('k', 39793), ('g', 68201), ('c', 80305), ('p', 80509), ('4', 1608), ('2', 3907), ('x', 4248), ('j', 6322), ('q', 2364), ('1', 3383), ('0', 921), ('9', 388), ('3', 2498), ('5', 1464), ('6', 1114), ('7', 617), ('8', 486), ('z', 1664), ('’', 277), ('😊', 36), ('♥', 288), ('请', 74), ('好', 78), ('心', 73), ('人', 171), ('帮', 42), ('忙', 20), ('！', 45), ('他', 145), ('在', 164), ('我', 278), ('公', 12), ('司', 1), ('附', 14), ('近', 18), ('，', 695), ('它', 90), ('的', 516), ('主', 49), ('月', 57), ('尾', 7), ('要', 80), ('搬', 2), ('了', 139), ('们', 130), ('没', 54), ('地', 43), ('方', 20), ('躲', 2), ('。', 451), ('～', 9), ('◘', 8), ('•', 211), ('請', 14), ('求', 13), ('一', 227), ('個', 43), ('溫', 4), ('暖', 13), ('家', 161), ('是', 1

In [None]:
#Resumen de lo aprendido
print(t2.word_counts)
print(t2.document_count)
print(t2.word_index)
print(t2.word_docs)

OrderedDict([("'", 6235), ('5', 368), ('s', 44083), ('i', 50433), ('b', 13070), ('l', 35712), ('n', 51610), ('g', 16559), ('w', 14682), ('e', 96449), ('r', 43396), ('v', 10212), ('a', 67995), ('y', 20390), ('c', 19493), ('o', 62189), ('f', 16539), ('h', 36684), ('p', 19770), ('t', 62432), ('u', 22171), ('k', 9799), ('j', 1492), ('d', 32958), ('m', 22202), ('1', 805), ('7', 156), ('6', 279), ('x', 1072), ('3', 619), ('8', 124), ('2', 972), ('0', 212), ('4', 409), ('q', 563), ('9', 94), ('z', 433), ('’', 60), ('“', 18), ('”', 18), ('♥', 62), ('大', 19), ('小', 15), ('便', 1), ('笼', 2), ('子', 10), ('🍭', 1), ('❤', 2), ('💖', 8), ('√', 3), ('有', 18), ('人', 27), ('可', 18), ('以', 17), ('給', 2), ('孩', 7), ('們', 2), ('一', 27), ('個', 2), ('家', 31), ('嗎', 2), ('？', 3), ('剛', 4), ('在', 25), ('救', 10), ('的', 69), ('如', 8), ('果', 8), ('不', 26), ('他', 17), ('们', 24), ('会', 15), ('受', 5), ('风', 2), ('吹', 2), ('雨', 2), ('打', 9), ('，', 108), ('晚', 2), ('上', 6), ('还', 14), ('要', 8), ('见', 3), ('鬼', 4), ('（',

In [None]:
#Separando la variable x_train_list y haciendo conteo de las palabras
x_train_1 = []
for doc in x_train_list:
  d = str(doc).split()
  doc_idx = []
  for p in d:
    try:
      idx = t1.word_counts[p]
      doc_idx.append(idx)
    except:
      continue
  x_train_1.append(doc_idx)
x_train_1

[[],
 [279319],
 [1608],
 [3907,
  279319,
  279319,
  279319,
  279319,
  279319,
  279319,
  279319,
  279319,
  279319,
  279319],
 [],
 [279319],
 [],
 [279319, 279319, 279319, 279319],
 [210737, 206164],
 [279319, 279319],
 [3907],
 [279319, 279319],
 [279319, 90947, 210737, 210737, 279319],
 [1608, 279319],
 [],
 [279319],
 [3907, 3383, 3907],
 [2498, 279319],
 [1608],
 [],
 [],
 [279319, 279319, 279319],
 [279319, 279319, 279319, 279319, 279319, 2498, 279319, 279319],
 [],
 [279319, 90947],
 [279319, 279319, 279319, 279319, 279319],
 [279319, 1464],
 [2498, 279319, 279319],
 [],
 [3907],
 [3907, 279319, 279319],
 [279319],
 [279319, 279319, 279319, 279319, 279319],
 [3907, 1114],
 [279319, 279319, 279319],
 [279319, 279319, 2498, 279319, 2498, 1464],
 [279319, 279319, 206164, 279319],
 [3907, 3907, 3907],
 [279319, 3907, 279319, 1114, 2498],
 [279319, 90947],
 [],
 [1608, 3907, 279319],
 [],
 [279319, 279319, 279319],
 [279319],
 [210737, 279319],
 [279319, 279319],
 [279319],
 

In [None]:
#Convirtiendo en un array la variable x_train_1
x_train_1 = np.asarray(x_train_1)
print(type(x_train_1))
print(x_train_1[11000])

<class 'numpy.ndarray'>
[279319, 279319]


  return array(a, dtype, copy=False, order=order)


In [None]:
#Separando la variable x_test_list y haciendo conteo de las palabras
x_test_1 = []
for doc in x_test_list:
  d = str(doc).split()
  doc_idx = []
  for p in d:
    try:
      idx = t1.word_counts[p]
      doc_idx.append(idx)
    except:
      continue
  x_test_1.append(doc_idx)
x_test_1

[[1464, 279319, 279319, 279319],
 [279319, 206164, 279319, 279319, 279319],
 [1464, 279319, 279319],
 [],
 [206164, 279319, 279319, 206164, 2498],
 [279319, 279319, 279319, 206164, 279319, 2498, 1608],
 [2498, 279319],
 [3907, 279319, 206164, 279319, 279319],
 [],
 [210737, 210737, 210737, 210737, 279319, 279319, 206164, 279319],
 [],
 [279319],
 [1114, 279319, 3383, 1464, 2498, 279319, 279319, 1114, 279319],
 [],
 [279319, 1608, 1608, 279319, 279319],
 [279319, 279319, 279319],
 [2498, 279319, 279319, 279319, 279319, 279319, 279319],
 [279319, 279319],
 [1608, 279319, 3907],
 [279319, 279319, 279319, 279319],
 [],
 [279319, 279319, 279319, 279319, 279319, 279319, 279319],
 [],
 [],
 [3907, 2498],
 [],
 [279319, 279319, 90947],
 [279319, 279319, 279319, 3907],
 [],
 [2498],
 [279319, 279319, 279319],
 [],
 [279319],
 [279319],
 [3383, 3907, 3907, 279319],
 [2498, 279319],
 [288, 1464, 288, 288, 288],
 [279319, 279319],
 [],
 [],
 [1608, 279319, 3383],
 [279319, 279319, 279319],
 [68201

In [None]:
#Convirtiendo en un array la variable x_test_1
x_test_1 = np.asarray(x_test_1)
print(type(x_test_1))
print(x_test_1[100])

<class 'numpy.ndarray'>
[]


  return array(a, dtype, copy=False, order=order)


In [None]:
#Dimensión de la variable de entrenamiento
x_train_1.shape

(11994,)

In [None]:
#Dimensión de la variable de prueba
x_test_1.shape

(2999,)

In [None]:
#Aplicando one-hot encoding a la salida
from tensorflow.keras.utils import to_categorical
y_train_1 = to_categorical(y_train)
y_test_1 = to_categorical(y_test)

In [None]:
#Dimensión de la variable de salida de entrenamiento
y_train_1.shape

(11994, 5)

In [None]:
#Dimensión de la variable de salida de prueba
y_test_1.shape

(2999, 5)

##**Modelo de deep learning**##

In [None]:
#Máximo de la variable de entrenamiento
x_train_1.max()

[393967, 279319, 279319, 1608]

In [None]:
#Arquitectura del modelo
# Embedding
max_features = 392175
maxlen = 10
embedding_size = 50

# Convolution
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
lstm_output_size = 10

# Training
batch_size = 100
epochs = 20

print(len(x_train_1), 'train sequences')
print(len(x_test_1), 'test sequences')

print('Pad sequences (samples x time)')
x_train_1 = sequence.pad_sequences(x_train_1, maxlen=maxlen)
x_test_1 = sequence.pad_sequences(x_test_1, maxlen=maxlen)
print('x_train shape:', x_train_1.shape)
print('x_test shape:', x_test_1.shape)

11994 train sequences
2999 test sequences
Pad sequences (samples x time)
x_train shape: (11994, 10)
x_test shape: (2999, 10)


In [None]:
#Construcción del modelo
print('Build model...')

model = Sequential()
model.add(Embedding(max_features+1, embedding_size, input_length=maxlen))    
model.add(Dropout(0.25))
model.add(LSTM(lstm_output_size))
model.add(Dense(5))
model.add(Activation('softmax'))

#Compilando el modelo
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train_1, y_train_1,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test_1, y_test_1))
score, acc = model.evaluate(x_test_1, y_test_1, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

y_pred = model.predict(x_test_1)

Build model...
Train...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test score: 1.465680480003357
Test accuracy: 0.28109368681907654


In [None]:
#Predicción del modelo
y_pred = model.predict(x_test_1)
y_pred

array([[0.02110533, 0.20808537, 0.25681517, 0.2406977 , 0.27329642],
       [0.02513768, 0.20255645, 0.25461185, 0.24272425, 0.2749698 ],
       [0.02244226, 0.20611896, 0.2601594 , 0.23952277, 0.27175662],
       ...,
       [0.02318174, 0.20554358, 0.2623585 , 0.23895267, 0.2699636 ],
       [0.02146439, 0.20753327, 0.2575005 , 0.24054882, 0.27295294],
       [0.02208347, 0.22406909, 0.24548033, 0.23852742, 0.26983967]],
      dtype=float32)

In [None]:
#Probando la predicción del modelo
print(y_pred[2988])
y_test_1[2988]

[0.02513768 0.20255645 0.25461185 0.24272425 0.2749698 ]


array([0., 0., 0., 0., 1.], dtype=float32)

In [None]:
#Índice mayor de la predicción
np.argmax(y_pred[2988])

4

In [None]:
#Array de los máximos de la predicción y el valor real
yt = []; yp = []
for i,j in zip(y_test_1, y_pred):
  y_t, y_p = np.argmax(i), np.argmax(j)
  yt.append(y_t)
  yp.append(y_p)
y_t = np.asarray(yt)
y_p = np.asarray(yp)

In [None]:
#Resultado métrica f1_score
print('Macro F1: ',f1_score(y_t, y_p, average='macro'))

Macro F1:  0.13369832665580308


In [None]:
#Matriz de confusión de la predicción y el valor real con la métrica del f1_score
cm = confusion_matrix(y_t, y_p, labels=[0,1,2])
cm

array([[ 0,  0, 12],
       [ 0,  2, 72],
       [ 0,  2, 79]])

In [None]:
#Resultado del porcentaje de matriz de confusión sobre el y_t
traza = np.trace(cm)
total = y_t.shape[0]
acc = traza/total
acc

0.027009003001000332