In [46]:
import pandas as pd
import numpy as np
import re
import spacy

In [47]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [48]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
STOPWORDS = set(stopwords.words('portuguese'))

In [49]:
from keras.models import Sequential
from keras.layers import Dense

In [50]:
def loadData(dados):
    df = pd.read_csv(dados, sep=';')
    return df

In [51]:
def limpar_texto(text):
    #transformar em letras minusculas
    text = text.lower()
    #retirar 
    #retirar acentos
    #retirar characters especiais
    text = re.sub(r'\W',' ',text)
    text = re.sub(r'\s+[a-zA-Z]\s+',' ',text)
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+',' ', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    text = re.sub(r'\W+', ' ', text)
    
    return text
    

In [52]:
def Tokenize(sentence):
    sentence = sentence.lower()
    sentence = nltk.word_tokenize(sentence)
    return sentence

In [53]:
def Stemming(sentence):
    stemmer = SnowballStemmer(language='portuguese')
    phrase = []
    for word in sentence:
        phrase.append(stemmer.stem(word.lower()))
    return phrase

In [54]:
def Lemmatization(sentence):    
    for word in sentence:
        print(word.text, word.lemma_)

In [55]:
def bag_of_words(texto_df, ngrams):
    token = RegexpTokenizer(r'[a-zA-Z0-9]+')
    vectorizer = CountVectorizer(min_df = 2, lowercase=True,ngram_range =(1,ngrams),stop_words = STOPWORDS, tokenizer = token.tokenize)
    text_counts = vectorizer.fit_transform(texto_df)
    
    return text_counts

In [56]:
def tfidf(X):
    processed_features = X
    vectorizer = TfidfVectorizer (max_features=500, min_df=2, max_df=0.8, stop_words=STOPWORDS)
    processed_features = vectorizer.fit_transform(processed_features).toarray()
    
    return processed_features

In [74]:
def preprocessing(X,y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
    matrix_X = X_train
    X_train = X_train.toarray()
    X_test = X_test.toarray()

    return X_train, X_test, y_train, y_test, matrix_X

In [75]:
def randomForest(X_train, X_test, y_train, y_test):
    forest = RandomForestClassifier(max_leaf_nodes=3, random_state=0)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred) * 100
    report = classification_report(y_test, y_pred)
    
    return score, report, forest


In [76]:
def gaussian(X_train, X_test, y_train, y_test):
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred) * 100
    report = classification_report(y_test, y_pred)
    
    return score, report, gnb  

In [103]:
def neuralNetwork(X_train, X_test, y_train, y_test, input_valor):
    model = Sequential(X_train, X_test, y_train, y_test)
    model.add(Dense(units=50, activation='relu',input_dim = input_valor))
    model.add(Dense(units=25, activation='relu'))
    model.add(Dense(units=10, activation='relu'))
    model.add(Dense(units=5, activation='relu'))
    
    summary=model.summary()
    compiled = model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=["accuracy"])
    fitted = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=7, batch_size=10)
    
    return summary, compiled, fitted
    

In [104]:
dataframe = loadData("tweet19.csv")

In [105]:
dataframe.columns = ['assunto','texto']

In [106]:
dataframe['texto'] = dataframe['texto'].apply(limpar_texto)

In [107]:
dataframe.head()

Unnamed: 0,assunto,texto
0,abertura das praias,mudou absolutamente nada paciência papo novo n...
1,abertura das praias,opinião mudou nada pessoas agindo vírus sido d...
2,abertura das praias,uol incentivando pessoas irem praia plena pand...
3,abertura das praias,ônibus trens lotados tbm desde sempre praia po...
4,abertura das praias,novo normal sei onde engraçado povo posando fo...


In [108]:
dataframe['texto'][2]

'uol incentivando pessoas irem praia plena pandemia sim plena pandemia isolamento social necessário caso contrário sairemos nunca dessa situação fiquememcasa'

In [109]:
text_counts = bag_of_words(dataframe['texto'],2) 

  'stop_words.' % sorted(inconsistent))


In [110]:
y = dataframe.iloc[:,0]

In [111]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

In [112]:
labelencoder.classes_

array(['abertura das praias', 'profissionais', 'transporte público',
       'vacina', 'volta as aulas'], dtype=object)

In [113]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1])

In [114]:
text_counts

<38x133 sparse matrix of type '<class 'numpy.int64'>'
	with 335 stored elements in Compressed Sparse Row format>

In [115]:
X_train, X_test, y_train, y_test, matrix_train = preprocessing(text_counts,y)

In [116]:
matrix_train

<30x133 sparse matrix of type '<class 'numpy.int64'>'
	with 249 stored elements in Compressed Sparse Row format>

In [117]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [118]:
X_train.shape

(30, 133)

In [119]:
X_train.shape[1]

133

In [120]:
gnb_score, gnb_report, gnb  = gaussian(X_train, X_test, y_train, y_test)
print("Accuracia Naive Bayes: ")
print(gnb_score,"%")
print("Report Naive Bayes: ")
print(gnb_report)

Accuracia Naive Bayes: 
87.5 %
Report Naive Bayes: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       1.00      0.50      0.67         2
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         3

    accuracy                           0.88         8
   macro avg       0.75      0.62      0.67         8
weighted avg       1.00      0.88      0.92         8



  _warn_prf(average, modifier, msg_start, len(result))


In [121]:
forest_score, forest_report, forest  = gaussian(X_train, X_test, y_train, y_test)
print("Accuracia Random Forest: ")
print(forest_score,"%")
print("Report Random Forest: ")
print(forest_report)

Accuracia Random Forest: 
87.5 %
Report Random Forest: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       1.00      0.50      0.67         2
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         3

    accuracy                           0.88         8
   macro avg       0.75      0.62      0.67         8
weighted avg       1.00      0.88      0.92         8



  _warn_prf(average, modifier, msg_start, len(result))


In [122]:
summary, compiled, fitted = neuralNetwork(X_train, X_test, y_train, y_test)
print("Sumário Rede Neural:")
print(summary,"%")
print("Rede Neural....")
print(fitted)

TypeError: __init__() takes from 1 to 3 positional arguments but 5 were given