In [1]:
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report

import string
import re
from bs4 import BeautifulSoup
import string
from unicodedata import normalize as norm
from nltk.corpus import stopwords
import joblib
import pymongo
import socket
from datetime import datetime
import requests
import json

In [2]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
# DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
# DATASET_ENCODING = "ISO-8859-1"
# TRAIN_SIZE = 0.8

In [4]:
df = pd.read_csv('../data/portuguese-tweets-for-sentiment-analysis/NoThemeTweets.csv')

In [5]:
df.head()

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
0,1031761728445530112,@Tixaa23 14 para eu ir :),Tue Aug 21 04:35:39 +0000 2018,Positivo,:)
1,1031761040462278656,@drexalvarez O meu like eu já dei na época :),Tue Aug 21 04:32:55 +0000 2018,Positivo,:)
2,1031760962372689920,Eu só queria conseguir comer alguma coisa pra ...,Tue Aug 21 04:32:37 +0000 2018,Positivo,:)
3,1031760948250456066,:D que lindo dia !,Tue Aug 21 04:32:33 +0000 2018,Positivo,:)
4,1031760895985246208,"@Primo_Resmungao Pq da pr jeito!!é uma ""oferta...",Tue Aug 21 04:32:21 +0000 2018,Positivo,:)


In [12]:
def cleaning_text(text):
    
    text = text.lower()
    
    text = re.sub('(\\n)+', ' ', text)
    
    text = ' '.join([word for word in text.split() if word not in stopwords.words('portuguese') and word not in string.punctuation])
    
    text = norm('NFKD', text).encode('ascii', 'ignore').decode()
    
    text = re.sub('\@\S*', '', text)
    
#     text = ''.join([char for char in text if char not in string.punctuation])
    
    return text

In [6]:
Counter(df['sentiment'])
# 0 - Negative
# 4 - Positive

Counter({'Positivo': 263107, 'Negativo': 522707})

In [7]:
df['text_clean'] = df['tweet_text'].apply(cleaning_text)

NameError: name 'cleaning_text' is not defined

In [6]:
522707-263107

259600

In [7]:
index_to_remove = df[df['sentiment'] == 'Negativo'].head(259600).index

In [8]:
df_final = df.drop(index_to_remove)

In [9]:
df_final.reset_index(inplace=True, drop=True)

In [10]:
df_final.head()

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
0,1031761728445530112,@Tixaa23 14 para eu ir :),Tue Aug 21 04:35:39 +0000 2018,Positivo,:)
1,1031761040462278656,@drexalvarez O meu like eu já dei na época :),Tue Aug 21 04:32:55 +0000 2018,Positivo,:)
2,1031760962372689920,Eu só queria conseguir comer alguma coisa pra ...,Tue Aug 21 04:32:37 +0000 2018,Positivo,:)
3,1031760948250456066,:D que lindo dia !,Tue Aug 21 04:32:33 +0000 2018,Positivo,:)
4,1031760895985246208,"@Primo_Resmungao Pq da pr jeito!!é uma ""oferta...",Tue Aug 21 04:32:21 +0000 2018,Positivo,:)


In [28]:
df_final[df_final['sentiment'] == 'Negativo']

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
119690,1038869201522515970,"@tascmamoca Eiiiii :((( Vaaaa eu pago, é no MC...",Sun Sep 09 19:18:13 +0000 2018,Negativo,:(
119691,1038869049663516672,@KangjoonBrazil Aaaa eu fiz aula mas infelizme...,Sun Sep 09 19:17:37 +0000 2018,Negativo,:(
119692,1038868962874994688,@_fairygic_ eu tava comendo brigadeiro :(,Sun Sep 09 19:17:16 +0000 2018,Negativo,:(
119693,1038868838316687360,@eduarda_silvv vc deixou ele gritar cmg :(,Sun Sep 09 19:16:46 +0000 2018,Negativo,:(
119694,1038868667394674688,"oi biaaa, tudo bom? você vai fazer o seesaw ch...",Sun Sep 09 19:16:06 +0000 2018,Negativo,:(
...,...,...,...,...,...
524280,1049138434835714048,@tearsgio Meu estado me envergonhou demais :((...,Mon Oct 08 03:24:29 +0000 2018,Negativo,:(
524281,1049138371388493825,Acho que alguém me bloqueou mkkkkkkkkkkkkkkk :(,Mon Oct 08 03:24:14 +0000 2018,Negativo,:(
524282,1049138283199062017,@cirosugarpainho Ah painhoooo... Que saudades ...,Mon Oct 08 03:23:53 +0000 2018,Negativo,:(
524283,1049138232615735297,@jpedrx mulheres incríveis pra caralho. nossa ...,Mon Oct 08 03:23:41 +0000 2018,Negativo,:(


In [27]:
df_final['tweet_text'].values[[1,2,7]]

array(['@drexalvarez O meu like eu já dei na época :)',
       'Eu só queria conseguir comer alguma coisa pra poder dormir :)',
       'Aquela mina da limpeza, que tinha um marido com meu problema me adicionou aqui no face, esquisito a princípio mas ah, nega velha e pa, veio perguntar como eu tô :) Ela deve ter tomado as dor, me viu malzao e pá.'],
      dtype=object)

In [17]:
Counter(df_final['sentiment'])

Counter({'Positivo': 263107, 'Negativo': 263107})

In [29]:
X_train, X_test, y_train, y_test = train_test_split(df_final['tweet_text'].values[[1,2,3,4,5, 119690, 119691, 119692, 119692, 119693]], df_final['sentiment'].values[[1,2,3,4,5, 119690, 119691, 119692, 119692, 119693]], random_state=42, test_size=0.3)

In [30]:
pipeline= Pipeline([('vectorizer', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', RandomForestClassifier(n_estimators=30))])

In [31]:
pipeline

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [32]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [33]:
preds = pipeline.predict(X_test)

In [34]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

    Negativo       1.00      0.50      0.67         2
    Positivo       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



In [35]:
joblib.dump(pipeline, 'sentiment-classifier-v0.pkl')

['sentiment-classifier-v0.pkl']

In [37]:
mongo_client = pymongo.MongoClient('mongodb+srv://brunobelluzzo:bruno0911@twitteranalysis-zrfix.mongodb.net/test?retryWrites=true&w=majority')

In [38]:
db = mongo_client.Twitter

In [39]:
db.tweets.find_one({'hashtag': '#foragabi'})['tweets']

TypeError: 'NoneType' object is not subscriptable

In [46]:
for x in db.tweets.find({'hashtag': '#palmeiras', 'tweets': {'sentiment.preds': 'Positivo'}}):
    print(x)

# BIA NARRATIVAS

In [2]:
CONNECTION = pymongo.MongoClient("mongodb://{}:{}/".format("54.233.200.11", "7017"),
                                 username="narrativas",
                                 password="N4rr4Y!v!@#1",
                                 authSource="narrativas",
                                 connect=False)

In [3]:
DB = CONNECTION["narrativas"]

In [4]:
DB.collection_names()

  """Entry point for launching an IPython kernel.


['model_dev',
 'model_old',
 'trainning',
 'model_new_validated',
 'usuario',
 'model_backupOLD_29.06.19',
 'instances',
 'usuario_dev',
 'model_bkp_17072019',
 'model_new',
 'phrases_blacklist_dev',
 'model',
 'usuario_bkp_03_07',
 'checkpoints_dev',
 'checkpoints',
 'phrases_blacklist',
 'model_results_dev',
 'model_bkp_03072019',
 'model_results']

In [11]:
df = pd.read_csv("/home/bruno/BRUNO/Projetos/BIA/instances.csv", sep="\t", header=None)

In [13]:
df['status'] = "waiting"

In [15]:
df.columns = ["instance_id","instance_ip", "instance_name", "status"]

In [16]:
df.head()

Unnamed: 0,instance_id,instance_ip,instance_name,status
0,i-00186fdd28d4e2d91,ip-172-31-88-147.ec2.internal,BiaTrainingTestC51,waiting
1,i-085b1b5307a2b2f9f,ip-172-31-82-149.ec2.internal,BiaTrainingTestC526,waiting
2,i-0de6058e3df19abd6,ip-172-31-87-148.ec2.internal,BiaTrainingTestC546,waiting
3,i-0f42f6720992fafdc,ip-172-31-89-157.ec2.internal,BiaTrainingTestC548,waiting
4,i-06595cf63cf642e16,ip-172-31-85-28.ec2.internal,BiaTrainingTestC523,waiting


In [17]:
dict_ = df.to_dict()

In [22]:
for i, row in df.iterrows():
    
    DB.instances.insert_one(row.to_dict())

In [35]:
a = DB.instances.find_one({"instance_id": 'i-00186fdd28d4e2d91'})
         

In [66]:
DB.instances.update({"instance_id": 'i-085b1b5307a2b2f9f'},
                               {"$set": {"status": "waiting"}})

  


{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}

In [6]:
for x in DB.trainning.find():
    print(x)

{'_id': ObjectId('5e71fe64e739d3001f629f70'), 'narrativas': {'periodo': {'vb1': ['O <indice> começou em <valorinicio> e finalizou em <valorfim> no período de <inicio> a <fim>.']}}, 'grupos': [2], 'id': 13579, 'porta': '8090', 'status': 'done', 'instance_id': 'i-085b1b5307a2b2f9f', 'instance_ip': 'ip-172-31-82-149.ec2.internal'}
{'_id': ObjectId('5e71fe78e739d3001f629f71'), 'narrativas': {'periodo': {'vb1': ['O <indice> começou em <valorinicio> e finalizou em <valorfim> no período de <inicio> a <fim>.']}}, 'grupos': [2], 'id': 24680, 'porta': '8090', 'status': 'done', 'instance_id': 'i-0de6058e3df19abd6', 'instance_ip': 'ip-172-31-87-148.ec2.internal'}


In [186]:
operation = DB.instances.find_one(
    {'status': 'trainning', 'instance_ip': 'ip-172-31-82-149.ec2.internal'})

In [80]:
str(operation)

"{'_id': ObjectId('5e6fa720014540001b791b51'), 'narrativas': {'periodo': {'vb1': ['O <indice> começou em <valorinicio> e finalizou em <valorfim> no período de <inicio> a <fim>.']}}, 'grupos': [2], 'id': 1234500, 'porta': '8090', 'status': 'trainning', 'instance_id': 'i-085b1b5307a2b2f9f', 'instance_ip': 'ip-172-31-82-149.ec2.internal'}"

In [69]:
DB.trainning.delete_many({})

<pymongo.results.DeleteResult at 0x7f4a3e2e5140>

In [30]:
DB.model_results

Collection(Database(MongoClient(host=['54.233.200.11:7017'], document_class=dict, tz_aware=False, connect=False, authsource='narrativas'), 'narrativas'), 'model_results')

In [19]:
DB.trainning.update_one(
    {"instance_ip": 'ip-172-31-82-149.ec2.internal', 'status': 'trainning'},
    {"$set": {"status": "done"}})


<pymongo.results.UpdateResult at 0x7fb6f97e8f00>

In [126]:
response_header = {'content-type': 'application/json'}

In [135]:
auth = {"username": "bia_narrativas",
        "password": "bi@_n@rrat1v4s"}

resp = requests.post(url="http://3.92.60.129:7001/auth",
                     headers=response_header, data=json.dumps(auth))

print(json.loads(resp.text))
auth_token = 'Bearer '+json.loads(resp.text)["access_token"]

response_header["authorization"] = auth_token

{'access_token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyX2lkIjpudWxsLCJleHAiOjE1ODQ0MDY1NjN9.iEMB8Wu0V1PTj8fzdXm8QjG9DvXi3Ezw8A9dCpQBlhk'}


In [136]:
response_header

{'content-type': 'application/json',
 'authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyX2lkIjpudWxsLCJleHAiOjE1ODQ0MDY1NjN9.iEMB8Wu0V1PTj8fzdXm8QjG9DvXi3Ezw8A9dCpQBlhk'}

In [57]:
DB.trainning

Collection(Database(MongoClient(host=['54.233.200.11:7017'], document_class=dict, tz_aware=False, connect=False, authsource='narrativas'), 'narrativas'), 'trainning')

In [None]:
operation = DB.trainning.find_one(
    {'status': 'waiting', 'instance_ip': instance_ip})

In [75]:
instance_ip = 'ip-172-31-87-148.ec2.internal'

# buscar no mongo a operação (operation) pendente de treinamento (status waiting) com o ip dessa instancia da aws (instance_ip) - collection trainning
operation = DB.trainning.find_one(
    {'status': 'waiting', 'instance_ip': instance_ip})

# Atualizar o status dessa operação para "trainning" - collection trainning
DB.trainning.update(
    {"instance_ip": instance_ip, 'status': 'waiting'},
    {"$set": {
        "status": "trainning"
    }

    }
)

  # This is added back by InteractiveShellApp.init_path()


{'n': 0, 'nModified': 0, 'ok': 1.0, 'updatedExisting': False}