### Primeiro modelo tensorflow com Keras.

In [9]:
import pandas as pd

filepath_dict = {
  'yelp':   './data/yelp_labelled.txt',
  'amazon': './data/amazon_cells_labelled.txt',
  'imdb':   './data/imdb_labelled.txt'
}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])


sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


### Baseline Model


In [10]:
#from sklearn.feature_extraction.text import CountVectorizer # vetoriza as sentenças do texto.
# Suponha as seguintes sentenças
#sentences = ['John likes ice cream', 'John hates chocolate.']

#vectorizer = CountVectorizer(min_df=0, lowercase=False)
#vectorizer.fit(sentences)
#vectorizer.vocabulary_                      # em tuplas
#vectorizer.transform(sentences).toarray()   # em arrays

### Criando um modelo Simples com SKlearn kit

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

df_yelp = df[df['source'] == 'yelp']

sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

vectorizer = CountVectorizer()  # Tokenização das palavras.
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)

print(X_train)

  (0, 125)	1
  (0, 145)	1
  (0, 201)	1
  (0, 597)	1
  (0, 600)	1
  (0, 710)	1
  (0, 801)	2
  (0, 888)	1
  (0, 973)	1
  (0, 1042)	1
  (0, 1308)	1
  (0, 1345)	1
  (0, 1360)	1
  (0, 1494)	2
  (0, 1524)	2
  (0, 1587)	1
  (0, 1622)	1
  (0, 1634)	1
  (1, 63)	1
  (1, 136)	1
  (1, 597)	1
  (1, 616)	1
  (1, 638)	1
  (1, 725)	1
  (1, 1001)	1
  :	:
  (746, 1634)	1
  (747, 42)	1
  (747, 654)	1
  (747, 1193)	2
  (747, 1237)	1
  (747, 1494)	1
  (747, 1520)	1
  (748, 600)	1
  (748, 654)	1
  (748, 954)	1
  (748, 1001)	1
  (748, 1494)	1
  (749, 14)	1
  (749, 15)	1
  (749, 57)	1
  (749, 108)	1
  (749, 347)	1
  (749, 553)	1
  (749, 675)	1
  (749, 758)	1
  (749, 801)	1
  (749, 1010)	1
  (749, 1105)	1
  (749, 1492)	1
  (749, 1634)	2


### Utilizando regreção linear para classificar.

In [32]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

for source in df['source'].unique():
  df_source = df[df['source'] == source]
  sentences = df_source['sentence'].values
  y = df_source['label'].values

  sentences_train, sentences_test, y_train, y_test = train_test_split(
      sentences, y, test_size=0.25, random_state=1000)

  vectorizer = CountVectorizer()
  vectorizer.fit(sentences_train)
  X_train = vectorizer.transform(sentences_train)
  X_test  = vectorizer.transform(sentences_test)

  classifier = LogisticRegression()
  classifier.fit(X_train, y_train)
  score = classifier.score(X_test, y_test)
  print(f"Precisão de {score} para os dados: {source}")

  

Accuracy: 0.796
Precisão de 0.796 para os dados: yelp
Precisão de 0.796 para os dados: amazon
Precisão de 0.7486631016042781 para os dados: imdb


### Introdução ao Keras

In [39]:
import tensorflow as tf
from tensorflow import keras
from keras.backend import clear_session

# clear_session() -> limpa os pesos da antiga analise
clear_session()

input_dim = X_train.shape[1] # quantidade de features

# modelo baseado em uma pilha de layers, utilizando o layer mais comum Dense.
model = keras.Sequential()
model.add(keras.layers.Dense(10, input_dim = input_dim, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))
# Configurando o modelo de treinamento.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary() # Mostra os paramestros disponíveis para treinar

"""
history -> callback automático do treinamento do modelo.
epochs -> quantas iterações de treinamento devem ser desenvolvidas
batch_size -> tamanho de amostras disponíveis para utilizar
validation_data -> data de treinamento.
"""
history = model.fit(
  X_train,
  y_train,
  epochs=100,
  verbose=False,
  validation_data=(X_test, y_test),
  batch_size=10
)

print(history)

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print(f"Prediction no treinamento: {accuracy}")
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print(f"Prediction no teste: {accuracy}")



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                25060     
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 25,071
Trainable params: 25,071
Non-trainable params: 0
_________________________________________________________________




<keras.callbacks.History object at 0x0000014A836F44C0>
Prediction no treinamento: 1.0
Prediction no teste: 0.7700534462928772


## Embedding de palavras

### One Hot Encoding

In [46]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

cities =["São Paulo", "Rio de Janeiro", "Presidente Prudente", "Brasília"]

encoder = LabelEncoder()
city_labels = encoder.fit_transform(cities)
print(city_labels)

# Agora com One Hot Encoding

encoder = OneHotEncoder(sparse=False)
city_labels = city_labels.reshape(len(cities), 1)
encoder.fit_transform(city_labels)



[3 2 1 0]


array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]])

In [47]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1 # adiconando 1 pois 0 é um index reservado.

print(sentences_train[2])
print(X_train[2])

I am a fan of his ... This movie sucked really bad.  
[7, 150, 2, 932, 4, 49, 6, 11, 563, 45, 30]
