In [1]:
import pandas as pd
import numpy as np
import re
import spacy
import unidecode
import unicodedata

In [2]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [3]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
STOPWORDS = set(stopwords.words('portuguese'))
from nltk.stem import RSLPStemmer

In [4]:
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
def loadData(dados):
    """Read a semicolon-separated CSV into a DataFrame.

    Parameters
    ----------
    dados : str, path or file-like object
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with ``read_csv`` defaults for everything except
        the ``;`` field separator.
    """
    return pd.read_csv(dados, sep=';')

In [6]:
def limpar_texto(text):
    """Normalize one raw text for vectorization.

    The text is lowercased, every non-word character is replaced by a space,
    isolated single ASCII letters are dropped (both between words and at the
    very start of the text), and tokens found in the module-level
    ``STOPWORDS`` set are removed.

    Parameters
    ----------
    text : str
        Raw text (e.g. one tweet).

    Returns
    -------
    str
        The remaining tokens joined by single spaces, with no leading or
        trailing whitespace.
    """
    # Lowercase so the stop-word comparison below is case-insensitive.
    text = text.lower()
    # Replace special (non-word) characters with spaces.
    text = re.sub(r'\W', ' ', text)
    # Drop single ASCII letters that stand alone between whitespace.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Drop a single ASCII letter at the very start of the text. The caret must
    # be an anchor here: the previous pattern escaped it (r'\^...'), which
    # looks for a literal '^' that the \W substitution above has already
    # removed, so it could never match.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    # split() already collapses any run of whitespace, and after the \W
    # substitution only word characters and spaces remain, so no further
    # whitespace/punctuation passes are needed.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
    

In [7]:
def sem_Acentos(sentence):
    """Return ``sentence`` with accents removed.

    The string is NFD-normalized, which splits accented characters into a
    base character plus combining marks, and then encoded to ASCII with
    errors ignored. That drops the combining marks and any other character
    that has no ASCII representation.

    Parameters
    ----------
    sentence : str
        Text to strip.

    Returns
    -------
    str
        ASCII-only version of the input.
    """
    return (
        unicodedata.normalize("NFD", sentence)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )

In [8]:
def Stemming(text):
    """Stem every token of ``text`` with the Portuguese Snowball stemmer.

    Parameters
    ----------
    text : str
        Text to tokenize with ``word_tokenize`` and stem.

    Returns
    -------
    str
        The stems in order, each followed by one space (so a non-empty
        result ends with a trailing space).
    """
    snowball = SnowballStemmer(language='portuguese')
    return "".join(snowball.stem(tok) + " " for tok in word_tokenize(text))

In [9]:

def Stemming2(text):
    """Stem every token of ``text`` with NLTK's RSLP stemmer.

    Parameters
    ----------
    text : str
        Text to tokenize with ``word_tokenize`` and stem.

    Returns
    -------
    str
        The stems in order, each followed by one space (so a non-empty
        result ends with a trailing space).
    """
    rslp = RSLPStemmer()
    stems = [rslp.stem(tok) for tok in word_tokenize(text)]
    return "".join(stem + " " for stem in stems)

In [10]:
def bag_of_words(texto_df, ngrams):
    """Build a bag-of-words count matrix with n-grams from 1 up to ``ngrams``.

    A new ``CountVectorizer`` is fitted on ``texto_df`` on every call and is
    not returned, so the learned vocabulary cannot be reused to transform
    other texts. Terms must occur in at least two documents (``min_df=2``).

    Parameters
    ----------
    texto_df : iterable of str
        The documents to vectorize.
    ngrams : int
        Upper bound of the n-gram range; the lower bound is always 1.

    Returns
    -------
    scipy.sparse matrix
        Document-term count matrix as returned by ``fit_transform``.
    """
    vectorizer = CountVectorizer(
        min_df=2,
        lowercase=True,
        ngram_range=(1, ngrams),
        # scikit-learn documents stop_words as a list; recent releases
        # validate the parameter type and reject a set, so convert here.
        stop_words=list(STOPWORDS),
        tokenizer=word_tokenize,
    )
    return vectorizer.fit_transform(texto_df)

In [11]:
def tfidf(X):
    """Compute a dense TF-IDF matrix for the documents in ``X``.

    A new ``TfidfVectorizer`` is fitted on ``X`` on every call and is not
    returned. The vocabulary is capped at 500 terms, and terms must appear
    in at least 2 documents and in at most 80% of them.

    Parameters
    ----------
    X : iterable of str
        The documents to vectorize.

    Returns
    -------
    numpy.ndarray
        Dense document-term TF-IDF matrix.
    """
    vectorizer = TfidfVectorizer(
        max_features=500,
        min_df=2,
        max_df=0.8,
        # scikit-learn documents stop_words as a list; recent releases
        # validate the parameter type and reject a set, so convert here.
        stop_words=list(STOPWORDS),
    )
    return vectorizer.fit_transform(X).toarray()

In [12]:
def preprocessing(X,y):
    """Split features and labels 80/20 and densify the feature matrices.

    Parameters
    ----------
    X : sparse matrix
        Feature matrix; both resulting splits must support ``.toarray()``.
    y : array-like
        Labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, matrix_X)`` where ``X_train``
        and ``X_test`` are dense arrays and ``matrix_X`` is the training
        split before densification.
    """
    # Fixed random_state keeps the split reproducible between runs.
    sparse_train, sparse_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=0
    )
    return sparse_train.toarray(), sparse_test.toarray(), y_train, y_test, sparse_train

In [13]:
def preprocessing2(X,y):
    """Split features and labels 80/20 without converting the features.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, matrix_X)``; ``matrix_X`` is the
        same object as ``X_train``.
    """
    # Fixed random_state keeps the split reproducible between runs.
    parts = train_test_split(X, y, test_size=0.20, random_state=0)
    return tuple(parts) + (parts[0],)

In [14]:
def randomForest(X_train, X_test, y_train, y_test, max_leaf_nodes=3):
    """Train a random forest and evaluate it on the given test split.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Evaluation features and labels.
    max_leaf_nodes : int or None, default 3
        Passed to ``RandomForestClassifier``. The default keeps the value
        that used to be hard-coded; it is exposed so callers can try larger
        trees without editing this function.

    Returns
    -------
    tuple
        ``(score, report, forest)``: accuracy as a percentage, the text
        produced by ``classification_report``, and the fitted classifier.
    """
    # random_state is fixed so repeated runs build the same forest.
    forest = RandomForestClassifier(max_leaf_nodes=max_leaf_nodes, random_state=0)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred) * 100
    report = classification_report(y_test, y_pred)

    return score, report, forest


In [15]:
def gaussian(X_train, X_test, y_train, y_test):
    """Train Gaussian Naive Bayes and evaluate it on the given test split.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Evaluation features and labels.

    Returns
    -------
    tuple
        ``(score, report, gnb)``: accuracy as a percentage, the text produced
        by ``classification_report``, and the fitted classifier.
    """
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracy_pct = metrics.accuracy_score(y_test, predictions) * 100
    return accuracy_pct, classification_report(y_test, predictions), classifier

In [16]:
def neural(X_train, X_test, y_train, y_test, input_valor, n_classes=5):
    """Build, compile and train a small dense classifier.

    The network has three hidden ReLU layers (50, 25 and 10 units) followed
    by a softmax output layer with one unit per class.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and integer-encoded labels.
    X_test, y_test : array-like
        Passed to ``fit`` as ``validation_data``.
    input_valor : int
        Number of input features (``input_dim`` of the first layer).
    n_classes : int, default 5
        Number of output units. The default keeps the value that used to be
        hard-coded.

    Returns
    -------
    tuple
        ``(summary, compiled, fitted)``. ``fitted`` is the object returned by
        ``model.fit``. ``summary`` and ``compiled`` hold the return values of
        ``model.summary()`` and ``model.compile()``; in Keras both are
        expected to be ``None`` (the summary is printed), and they are kept
        only so existing three-way unpacking keeps working.
    """
    model = Sequential()
    model.add(Dense(units=50, activation='relu', input_dim=input_valor))
    model.add(Dense(units=25, activation='relu'))
    model.add(Dense(units=10, activation='relu'))
    # The output layer must produce a probability distribution over classes:
    # sparse_categorical_crossentropy (with its default from_logits=False)
    # expects probabilities, which a ReLU output layer does not provide.
    model.add(Dense(units=n_classes, activation='softmax'))

    summary = model.summary()
    compiled = model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=["accuracy"],
    )
    fitted = model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=7,
        batch_size=10,
    )

    return summary, compiled, fitted
    

In [17]:
dataframe = loadData("tweet19.csv")

In [18]:
dataframe.columns = ['assunto','texto']

In [19]:
dataframe['texto'] = dataframe['texto'].apply(limpar_texto)

In [20]:
dataframe.head()

Unnamed: 0,assunto,texto
0,abertura das praias,mudou absolutamente nada paciência papo novo n...
1,abertura das praias,opinião mudou nada pessoas agindo vírus sido d...
2,abertura das praias,uol incentivando pessoas irem praia plena pand...
3,abertura das praias,ônibus trens lotados tbm desde sempre praia po...
4,abertura das praias,novo normal sei onde engraçado povo posando fo...


In [21]:
dataframe['texto'][2]

'uol incentivando pessoas irem praia plena pandemia sim plena pandemia isolamento social necessário caso contrário sairemos nunca dessa situação fiquememcasa'

In [22]:
# Unigram + bigram counts over the whole cleaned corpus.
# NOTE(review): the vectorizer is fitted here, before the train/test split,
# so the vocabulary and the min_df filter also see the rows that later end up
# in the test set.
text_counts = bag_of_words(dataframe['texto'],2) 

In [23]:
# Positional selection: column 1 is used as the text (X) and column 0 as the
# label (y); this relies on the two-column order set when the columns were
# renamed to ['assunto', 'texto'].
X,y = dataframe.iloc[:,1], dataframe.iloc[:,0]

In [24]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

In [25]:
labelencoder.classes_

array(['abertura das praias', 'profissionais', 'transporte público',
       'vacina', 'volta as aulas'], dtype=object)

In [26]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1])

In [27]:
text_counts

<38x113 sparse matrix of type '<class 'numpy.int64'>'
	with 267 stored elements in Compressed Sparse Row format>

In [28]:
X_train, X_test, y_train, y_test, matrix_train = preprocessing(text_counts,y)

In [29]:
matrix_train

<30x113 sparse matrix of type '<class 'numpy.int64'>'
	with 198 stored elements in Compressed Sparse Row format>

In [30]:
X_train

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [31]:
X_train.shape

(30, 113)

In [32]:
X_train.shape[1]

113

## Teste 1 - Apenas Bag of Words

In [33]:
gnb_score, gnb_report, gnb  = gaussian(X_train, X_test, y_train, y_test)
print("Accuracia Naive Bayes: ")
print(gnb_score,"%")
print("Report Naive Bayes: ")
print(gnb_report)

Accuracia Naive Bayes: 
87.5 %
Report Naive Bayes: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       1.00      0.50      0.67         2
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         3

    accuracy                           0.88         8
   macro avg       0.75      0.62      0.67         8
weighted avg       1.00      0.88      0.92         8



  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
forest_score, forest_report, forest  = randomForest(X_train, X_test, y_train, y_test)
print("Accuracia Random Forest: ")
print(forest_score,"%")
print("Report Random Forest: ")
print(forest_report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracia Random Forest: 
0.0 %
Report Random Forest: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00       2.0
           3       0.00      0.00      0.00       3.0
           4       0.00      0.00      0.00       3.0

    accuracy                           0.00       8.0
   macro avg       0.00      0.00      0.00       8.0
weighted avg       0.00      0.00      0.00       8.0



In [35]:
summary, compiled, fitted = neural(X_train, X_test, y_train, y_test, X_train.shape[1])


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                5700      
_________________________________________________________________
dense_2 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_3 (Dense)              (None, 10)                260       
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 55        
Total params: 7,290
Trainable params: 7,290
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 30 samples, validate on 8 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


## Teste 2 - Apenas TFIDF

In [36]:
processed_X = tfidf(X)
processed_X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.33150395, 0.36060372,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [37]:
# NOTE(review): this 80/20 split rebinds X_train/X_test/y_train/y_test from the
# bag-of-words test, but the TF-IDF models below are trained on
# X_train2/X_test2 from the next cell, so this result appears to be unused —
# confirm before removing.
X_train, X_test, y_train, y_test, matrix_train = preprocessing2(processed_X, y)

In [38]:
# train_test_split is already imported in the imports cell at the top of the
# notebook; this repeated import is redundant.
# NOTE(review): this split uses test_size=0.3 / random_state=123, whereas
# preprocessing() and preprocessing2() use test_size=0.20 / random_state=0, so
# the TF-IDF scores are measured on a different hold-out set than the other
# tests.
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    processed_X, y, test_size=0.3, random_state=123)

In [39]:
gnb_score, gnb_report, gnb  = gaussian(X_train2, X_test2, y_train2, y_test2)
print("Accuracia Naive Bayes: ")
print(gnb_score,"%")
print("Report Naive Bayes: ")
print(gnb_report)

Accuracia Naive Bayes: 
75.0 %
Report Naive Bayes: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      1.00      0.67         1
           2       0.67      1.00      0.80         2
           3       0.67      1.00      0.80         2
           4       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.57      0.76      0.63        12
weighted avg       0.68      0.75      0.69        12



  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
forest_score, forest_report, forest  = randomForest(X_train2, X_test2, y_train2, y_test2)
print("Accuracia Random Forest: ")
print(forest_score,"%")
print("Report Random Forest: ")
print(forest_report)

Accuracia Random Forest: 
25.0 %
Report Random Forest: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       1.00      1.00      1.00         1
           2       0.00      0.00      0.00         2
           3       0.18      1.00      0.31         2
           4       0.00      0.00      0.00         5

    accuracy                           0.25        12
   macro avg       0.24      0.40      0.26        12
weighted avg       0.11      0.25      0.13        12



  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
summary, compiled, fitted = neural(X_train2, X_test2, y_train2, y_test2, X_train2.shape[1])


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 50)                5100      
_________________________________________________________________
dense_6 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_7 (Dense)              (None, 10)                260       
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 55        
Total params: 6,690
Trainable params: 6,690
Non-trainable params: 0
_________________________________________________________________
Train on 26 samples, validate on 12 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


## Teste 3 - Stemming e Bag of Words

In [42]:
data = loadData("tweet19.csv")
data.columns = ['assunto','texto']

In [43]:
data['texto'] = data['texto'].apply(limpar_texto)
data['texto'][12]

'escolas públicas falta porta janela trinco ventilador papel sanitário água livros bibliotecas laboratório etc agora milagre brotará máscara álcool gel qm correrá risco adultos'

In [44]:
data['texto'] = data['texto'].apply(Stemming)
data['texto'][12]

'escol públic falt port janel trinc ventil papel sanitári águ livr bibliotec laboratóri etc agor milagr brot másc álcool gel qm corr risc adult '

In [45]:
data.head()

Unnamed: 0,assunto,texto
0,abertura das praias,mud absolut nad paciênc pap nov normal gal tá ...
1,abertura das praias,opiniã mud nad pesso agind vírus sid debel bra...
2,abertura das praias,uol incentiv pesso irem pra plen pandem sim pl...
3,abertura das praias,ônibus trens lot tbm desd sempr pra pod woman ...
4,abertura das praias,nov normal sei onde engrac pov pos fot másc ac...


In [46]:
X2,y2 = data.iloc[:,1], data.iloc[:,0]

In [47]:
labelencoder = LabelEncoder()
y2 = labelencoder.fit_transform(y2)

In [48]:
text_counts2 = bag_of_words(data['texto'],2) 

In [49]:
X2_train, X2_test, y2_train, y2_test, matrix2_train = preprocessing(text_counts2,y2)

In [50]:
X2_train.shape

(30, 135)

In [51]:
gnb_score, gnb_report, gnb  = gaussian(X2_train, X2_test, y2_train, y2_test)
print("Accuracia Naive Bayes: ")
print(gnb_score,"%")
print("Report Naive Bayes: ")
print(gnb_report)

Accuracia Naive Bayes: 
75.0 %
Report Naive Bayes: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         2
           3       1.00      1.00      1.00         3
           4       0.75      1.00      0.86         3

    accuracy                           0.75         8
   macro avg       0.44      0.50      0.46         8
weighted avg       0.66      0.75      0.70         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
forest_score, forest_report, forest  = randomForest(X2_train, X2_test, y2_train, y2_test)
print("Accuracia Random Forest: ")
print(forest_score,"%")
print("Report Random Forest: ")
print(forest_report)

  _warn_prf(average, modifier, msg_start, len(result))


Accuracia Random Forest: 
25.0 %
Report Random Forest: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       1.00      0.50      0.67         2
           3       1.00      0.33      0.50         3
           4       0.00      0.00      0.00         3

    accuracy                           0.25         8
   macro avg       0.50      0.21      0.29         8
weighted avg       0.62      0.25      0.35         8



  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
summary, compiled, fitted = neural(X2_train, X2_test, y2_train, y2_test, X2_train.shape[1])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 50)                6800      
_________________________________________________________________
dense_10 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_11 (Dense)             (None, 10)                260       
_________________________________________________________________
dense_12 (Dense)             (None, 5)                 55        
Total params: 8,390
Trainable params: 8,390
Non-trainable params: 0
_________________________________________________________________
Train on 30 samples, validate on 8 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


## Teste 4 - Sem acentuação, Stemming e Bag of Words

In [54]:
# NOTE(review): the frame loaded here is only used by the preview in the next
# cell; `data` is loaded again from the same file two cells below, which
# discards anything done to this copy.
data = loadData("tweet19.csv")
data.columns = ['assunto','texto']

In [55]:
# Preview of limpar_texto on row 12.
# NOTE(review): `data` is reloaded from the CSV in the following cell, so the
# cleaning applied here does not carry over to the rest of this test.
data['texto'] = data['texto'].apply(limpar_texto)
data['texto'][12]

'escolas públicas falta porta janela trinco ventilador papel sanitário água livros bibliotecas laboratório etc agora milagre brotará máscara álcool gel qm correrá risco adultos'

In [56]:
data = loadData("tweet19.csv")
data.columns = ['assunto','texto']

In [57]:
# Strip accents from the raw text, then show row 12.
# NOTE(review): this runs before limpar_texto, whose stop-word filter compares
# tokens against STOPWORDS entries that keep their accents (e.g. 'está',
# 'até'). Accented stop words therefore appear to survive the cleaning: 'nao'
# and 'ja' are still present in the data.head() output of this test, while
# they are absent from the earlier tests. Consider cleaning first and
# stripping accents afterwards.
data['texto'] = data['texto'].apply(sem_Acentos)
data['texto'][12]

'Em nossas escolas publicas falta porta, janela, trinco, ventilador, papel sanitario, agua, livros, bibliotecas, laboratorio etc Agora, por milagre, brotara mascara e alcool gel? Qm mais correra risco serao os adultos.'

In [58]:
data['texto'][10]

'Impossivel eu mandar meu filho de 4 anos para escola , uma crianca dessa idade nao sabe os riscos que pode ocorrer caso ande sem mascara , divida lanche com os amigos, Nessa fase eles brincam muito juntos , se abracam, se beijam . Agora me diz ,como evita isso ,impossivel !!!'

In [59]:
data['texto'] = data['texto'].apply(limpar_texto)
data['texto'][10]

'impossivel mandar filho 4 anos escola crianca dessa idade nao sabe riscos pode ocorrer caso ande mascara divida lanche amigos nessa fase brincam juntos abracam beijam agora diz evita impossivel'

In [60]:
data['texto'] = data['texto'].apply(Stemming2)

In [61]:
data['texto'][10]

'impossi mand filh 4 ano escol crianc dess idad nao sab risc pod ocorr cas and masc div lanch amig ness fas brinc junt abrac beij agor diz evit impossi '

In [62]:
data.head()

Unnamed: 0,assunto,texto
0,abertura das praias,nao mud absolut nad ja nao pacienc pap nov nor...
1,abertura das praias,opinia nao mud nad pesso esta agind viru sid d...
2,abertura das praias,uol incentiv pesso ir prai plen pandem sim ple...
3,abertura das praias,onibu tr lot tbm desd sempr so prai nao pod wo...
4,abertura das praias,nov norm nao sei ond engrac pov pos fot masc a...


In [63]:
X2,y2 = data.iloc[:,1], data.iloc[:,0]

In [64]:
labelencoder = LabelEncoder()
y2 = labelencoder.fit_transform(y2)

In [65]:
# NOTE(review): the n-gram upper bound is 3 here, while the earlier
# bag-of-words tests call bag_of_words(..., 2); this test therefore changes the
# n-gram range as well as the text preprocessing.
text_counts2 = bag_of_words(data['texto'],3) 

In [66]:
X2_train, X2_test, y2_train, y2_test, matrix2_train = preprocessing(text_counts2,y2)

In [67]:
X2_train.shape

(30, 146)

In [68]:
gnb_score, gnb_report, gnb  = gaussian(X2_train, X2_test, y2_train, y2_test)
print("Accuracia Naive Bayes: ")
print(gnb_score,"%")
print("Report Naive Bayes: ")
print(gnb_report)

Accuracia Naive Bayes: 
75.0 %
Report Naive Bayes: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         2
           3       1.00      1.00      1.00         3
           4       0.75      1.00      0.86         3

    accuracy                           0.75         8
   macro avg       0.44      0.50      0.46         8
weighted avg       0.66      0.75      0.70         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [69]:
forest_score, forest_report, forest  = randomForest(X2_train, X2_test, y2_train, y2_test)
print("Accuracia Random Forest: ")
print(forest_score,"%")
print("Report Random Forest: ")
print(forest_report)

Accuracia Random Forest: 
37.5 %
Report Random Forest: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       1.00      0.50      0.67         2
           3       1.00      0.67      0.80         3
           4       0.00      0.00      0.00         3

    accuracy                           0.38         8
   macro avg       0.50      0.29      0.37         8
weighted avg       0.62      0.38      0.47         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
STOPWORDS

{'a',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquilo',
 'as',
 'até',
 'com',
 'como',
 'da',
 'das',
 'de',
 'dela',
 'delas',
 'dele',
 'deles',
 'depois',
 'do',
 'dos',
 'e',
 'ela',
 'elas',
 'ele',
 'eles',
 'em',
 'entre',
 'era',
 'eram',
 'essa',
 'essas',
 'esse',
 'esses',
 'esta',
 'estamos',
 'estas',
 'estava',
 'estavam',
 'este',
 'esteja',
 'estejam',
 'estejamos',
 'estes',
 'esteve',
 'estive',
 'estivemos',
 'estiver',
 'estivera',
 'estiveram',
 'estiverem',
 'estivermos',
 'estivesse',
 'estivessem',
 'estivéramos',
 'estivéssemos',
 'estou',
 'está',
 'estávamos',
 'estão',
 'eu',
 'foi',
 'fomos',
 'for',
 'fora',
 'foram',
 'forem',
 'formos',
 'fosse',
 'fossem',
 'fui',
 'fôramos',
 'fôssemos',
 'haja',
 'hajam',
 'hajamos',
 'havemos',
 'hei',
 'houve',
 'houvemos',
 'houver',
 'houvera',
 'houveram',
 'houverei',
 'houverem',
 'houveremos',
 'houveria',
 'houveriam',
 'houvermos',
 'houverá',
 'houverão',
 'houveríamos',
 'houvesse',


In [71]:
summary, compiled, fitted = neural(X2_train, X2_test, y2_train, y2_test, X2_train.shape[1])

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 50)                7350      
_________________________________________________________________
dense_14 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_15 (Dense)             (None, 10)                260       
_________________________________________________________________
dense_16 (Dense)             (None, 5)                 55        
Total params: 8,940
Trainable params: 8,940
Non-trainable params: 0
_________________________________________________________________
Train on 30 samples, validate on 8 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
