In [173]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.corpus import stopwords
import re

In [174]:
df = pd.read_csv("tweet19.csv", sep=";")

In [175]:
df

Unnamed: 0,assunto,texto
0,abertura das praias,Não mudou absolutamente NADA!!! Eu já não tenh...
1,abertura das praias,"Na minha opinião, não mudou nada, as pessoas e..."
2,abertura das praias,Uol incentivando as pessoas irem para a praia ...
3,abertura das praias,"Nos ônibus e trens lotados tbm, desde sempre.S..."
4,abertura das praias,novo normal não sei onde... O mais engraçado e...
5,abertura das praias,A uol deveria publicar algo mais produtivo do ...
6,abertura das praias,Daqui exatas 3 semanas vamos ter um aumento na...
7,abertura das praias,"E nós se fode aqui né,sem aula,sem poder traba..."
8,volta as aulas,Irão colocar as vidas de crianças e familiares...
9,volta as aulas,O meu filho tem um pouquinho mais de dificulda...


In [176]:
#pre processamento
#REPLACE_BY_SPACE = re.compile('[/(){}\[\]\|@,;]')
#BAD_SYMBOLS = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('portuguese'))

In [177]:
df.columns = ['assunto','texto']
def limpar_texto(text):
    text = text.lower()
    text = re.sub(r'\W',' ',text)
    text = re.sub(r'\s+[a-zA-Z]\s+',' ',text)
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+',' ', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    text = re.sub(r'\W+', ' ', text)
    return text

df['texto'] = df['texto'].apply(limpar_texto)
df['texto'] = df['texto'].str.replace('\d+', '')

In [178]:
df.head()

Unnamed: 0,assunto,texto
0,abertura das praias,mudou absolutamente nada paciência papo novo n...
1,abertura das praias,opinião mudou nada pessoas agindo vírus sido d...
2,abertura das praias,uol incentivando pessoas irem praia plena pand...
3,abertura das praias,ônibus trens lotados tbm desde sempre praia po...
4,abertura das praias,novo normal sei onde engraçado povo posando fo...


In [179]:
#palavras únicas

In [180]:
df['texto'][2]

'uol incentivando pessoas irem praia plena pandemia sim plena pandemia isolamento social necessário caso contrário sairemos nunca dessa situação fiquememcasa'

In [181]:
#precisa-se retirar os acentos
#import unidecode
#import unicodedata

#sem_acentos = unidecode.unidecode(df['texto'])
#print(sem_acentos[0])

In [182]:
#stemming
#from nltk.stem import PorterStemmer
#ps = PorterStemmer()
#stemmed_words= ps.stem(X)
#print(df['texto'][5])

In [183]:
#Bag of words and Tokenize
from nltk.tokenize import RegexpTokenizer
#remover simbolos e números
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

vectorizer = CountVectorizer(min_df = 2, lowercase=True,ngram_range =(1,1),stop_words = STOPWORDS, tokenizer = token.tokenize)
text_counts = vectorizer.fit_transform(df['texto'])

  'stop_words.' % sorted(inconsistent))


In [184]:
X,y = df.iloc[:,1], df.iloc[:,0]

In [185]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

In [186]:
labelencoder.classes_

array(['abertura das praias', 'profissionais', 'transporte público',
       'vacina', 'volta as aulas'], dtype=object)

In [187]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1])

In [188]:
text_counts

<38x111 sparse matrix of type '<class 'numpy.int64'>'
	with 288 stored elements in Compressed Sparse Row format>

In [189]:
X_train, X_test, y_train, y_test = train_test_split(text_counts, y, test_size=0.2, random_state = 0)

In [190]:
X_train

<30x111 sparse matrix of type '<class 'numpy.int64'>'
	with 212 stored elements in Compressed Sparse Row format>

In [191]:
X_train = X_train.toarray()
X_test = X_test.toarray()

In [192]:
from sklearn.naive_bayes import GaussianNB
#Naive Bayes com somente Bag of Words
naivebayes = GaussianNB()
naivebayes.fit(X_train, y_train)
naive_predicted = naivebayes.predict(X_test)
print("Gaussian Acuracia:",accuracy_score(y_test, naive_predicted))

Gaussian Acuracia: 0.875


In [None]:
from sklearn.naive_bayes

In [193]:
#Arvore de decisão com somente Bag of Words

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)
forest_predict = text_classifier.predict(X_test)
print("Random Forest Acuracia:",accuracy_score(y_test, forest_predict))

Random Forest Acuracia: 0.75


In [194]:
#Feature Generation using TF-IDF
processed_features = X
vectorizer = TfidfVectorizer (max_features=500, min_df=2, max_df=0.8, stop_words=STOPWORDS)
processed_features = vectorizer.fit_transform(processed_features).toarray()

In [195]:
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    processed_features, y, test_size=0.3, random_state=123)

In [196]:
#Naive Bayes
naivebayes.fit(X_train2, y_train2)
naive_predicted = naivebayes.predict(X_test2)
print("Gaussian Acuracia:",accuracy_score(y_test2, naive_predicted))

Gaussian Acuracia: 0.75


In [197]:
#Arvore de decisão
text_classifier.fit(X_train2, y_train2)
forest_predict = text_classifier.predict(X_test2)
print("Random Forest Acuracia:",accuracy_score(y_test2, forest_predict))

Random Forest Acuracia: 0.25


In [199]:
#print(confusion_matrix(y_test2,forest_predict))
#print(classification_report(y_test2,forest_predict))
#print(accuracy_score(y_test2, florest_predict))

In [218]:
#Rede Neural
model = Sequential()
model.add(Dense(units=70, activation='relu',input_dim = 100))
model.add(Dense(units=30, activation='relu'))
model.add(Dense(units=15, activation='relu'))
model.add(Dense(units=5, activation='relu'))

In [219]:
model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_31 (Dense)             (None, 70)                7070      
_________________________________________________________________
dense_32 (Dense)             (None, 30)                2130      
_________________________________________________________________
dense_33 (Dense)             (None, 15)                465       
_________________________________________________________________
dense_34 (Dense)             (None, 5)                 80        
Total params: 9,745
Trainable params: 9,745
Non-trainable params: 0
_________________________________________________________________


In [220]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=["accuracy"])

In [221]:
X_train2.shape, X_test2.shape

((26, 100), (12, 100))

In [222]:
model.fit(X_train2, y_train2, validation_data=(X_test2, y_test2), epochs=5, batch_size=10)

Train on 26 samples, validate on 12 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1e92af17288>