In [55]:
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report

import string
import re
from bs4 import BeautifulSoup
import string
from unicodedata import normalize as norm
from nltk.corpus import stopwords
import joblib
import pymongo
import socket
from datetime import datetime

In [10]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
# DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
# DATASET_ENCODING = "ISO-8859-1"
# TRAIN_SIZE = 0.8

In [12]:
df = pd.read_csv('../data/portuguese-tweets-for-sentiment-analysis/NoThemeTweets.csv')

In [13]:
df.head()

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
0,1031761728445530112,@Tixaa23 14 para eu ir :),Tue Aug 21 04:35:39 +0000 2018,Positivo,:)
1,1031761040462278656,@drexalvarez O meu like eu já dei na época :),Tue Aug 21 04:32:55 +0000 2018,Positivo,:)
2,1031760962372689920,Eu só queria conseguir comer alguma coisa pra ...,Tue Aug 21 04:32:37 +0000 2018,Positivo,:)
3,1031760948250456066,:D que lindo dia !,Tue Aug 21 04:32:33 +0000 2018,Positivo,:)
4,1031760895985246208,"@Primo_Resmungao Pq da pr jeito!!é uma ""oferta...",Tue Aug 21 04:32:21 +0000 2018,Positivo,:)


In [14]:
def cleaning_text(text):
    
    text = text.lower()
    
    text = re.sub('(\\n)+', ' ', text)
    
    text = ' '.join([word for word in text.split() if word not in stopwords.words('portuguese') and word not in string.punctuation])
    
    text = norm('NFKD', text).encode('ascii', 'ignore').decode()
    
    text = re.sub('\@\S*', '', text)
    
#     text = ''.join([char for char in text if char not in string.punctuation])
    
    return text

In [15]:
Counter(df['sentiment'])
# 0 - Negative
# 4 - Positive

Counter({'Positivo': 263107, 'Negativo': 522707})

In [16]:
df['text_clean'] = df['tweet_text'].apply(cleaning_text)

In [21]:
522707-263107

259600

In [18]:
index_to_remove = df[df['sentiment'] == 'Negativo'].head(259600).index

In [19]:
df_final = df.drop(index_to_remove)

In [22]:
df_final.reset_index(inplace=True, drop=True)

In [23]:
df_final.head()

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used,text_clean
0,1031761728445530112,@Tixaa23 14 para eu ir :),Tue Aug 21 04:35:39 +0000 2018,Positivo,:),14 ir :)
1,1031761040462278656,@drexalvarez O meu like eu já dei na época :),Tue Aug 21 04:32:55 +0000 2018,Positivo,:),like dei epoca :)
2,1031760962372689920,Eu só queria conseguir comer alguma coisa pra ...,Tue Aug 21 04:32:37 +0000 2018,Positivo,:),queria conseguir comer alguma coisa pra poder ...
3,1031760948250456066,:D que lindo dia !,Tue Aug 21 04:32:33 +0000 2018,Positivo,:),:d lindo dia
4,1031760895985246208,"@Primo_Resmungao Pq da pr jeito!!é uma ""oferta...",Tue Aug 21 04:32:21 +0000 2018,Positivo,:),"pq pr jeito!!e ""oferta"", ha q aproveitar. :p"


In [24]:
Counter(df_final['sentiment'])

Counter({'Positivo': 263107, 'Negativo': 263107})

In [25]:
X_train, X_test, y_train, y_test = train_test_split(df_final['text_clean'], df_final['sentiment'], random_state=42, test_size=0.3)

In [26]:
pipeline= Pipeline([('vectorizer', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', XGBClassifier(n_estimators=500))])

In [27]:
pipeline

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing

In [28]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing

In [29]:
preds = pipeline.predict(X_test)

In [31]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

    Negativo       0.79      0.60      0.68     79074
    Positivo       0.68      0.84      0.75     78791

    accuracy                           0.72    157865
   macro avg       0.73      0.72      0.72    157865
weighted avg       0.73      0.72      0.72    157865



In [33]:
joblib.dump(pipeline, 'sentiment-classifier-v0.pkl')

['sentiment-classifier-v0.pkl']

In [2]:
CONNECTION = pymongo.MongoClient("mongodb://{}:{}/".format("54.233.200.11", "7017"),
                                 username="narrativas",
                                 password="N4rr4Y!v!@#1",
                                 authSource="narrativas",
                                 connect=False)

In [3]:
DB = CONNECTION["narrativas"]

In [6]:
DB.collection_names()

  """Entry point for launching an IPython kernel.


['model_dev',
 'model_old',
 'model_new_validated',
 'usuario',
 'model_backupOLD_29.06.19',
 'usuario_dev',
 'model_bkp_17072019',
 'model_new',
 'phrases_blacklist_dev',
 'model',
 'usuario_bkp_03_07',
 'checkpoints_dev',
 'checkpoints',
 'phrases_blacklist',
 'model_results_dev',
 'model_bkp_03072019',
 'model_results']

In [11]:
df = pd.read_csv("/home/bruno/BRUNO/Projetos/BIA/instances.csv", sep="\t", header=None)

In [13]:
df['status'] = "waiting"

In [15]:
df.columns = ["instance_id","instance_ip", "instance_name", "status"]

In [16]:
df.head()

Unnamed: 0,instance_id,instance_ip,instance_name,status
0,i-00186fdd28d4e2d91,ip-172-31-88-147.ec2.internal,BiaTrainingTestC51,waiting
1,i-085b1b5307a2b2f9f,ip-172-31-82-149.ec2.internal,BiaTrainingTestC526,waiting
2,i-0de6058e3df19abd6,ip-172-31-87-148.ec2.internal,BiaTrainingTestC546,waiting
3,i-0f42f6720992fafdc,ip-172-31-89-157.ec2.internal,BiaTrainingTestC548,waiting
4,i-06595cf63cf642e16,ip-172-31-85-28.ec2.internal,BiaTrainingTestC523,waiting


In [17]:
dict_ = df.to_dict()

In [22]:
for i, row in df.iterrows():
    
    DB.instances.insert_one(row.to_dict())

In [35]:
a = DB.instances.find_one({"instance_id": 'i-00186fdd28d4e2d91'})
         

In [69]:
DB.instances.update({"instance_id": 'i-00186fdd28d4e2d91'},
                               {"$set": {"status": "in use"}})

  


{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}

In [87]:
for x in DB.trainning.find():
    print(x)


{'_id': ObjectId('5e6bd369014540001e791b4f'), 'narrativas': {'periodo': {'vb1': ['O <indice> começou em <valorinicio> e finalizou em <valorfim> no período de <inicio> a <fim>.'], 'vb2': ['No intervalo de <inicio> a <fim>, a cotação do <indice> iniciou em <valorinicio>, operando em <altaouquedain> de <percentualinicio>, e encerrou em <valorfim>, operando em <altaouquedafim> de <percentualfim>.']}, 'dia': {'vb1': ['No dia <data>, a cotação do <indice> era <valor>.'], 'vb2': ['A cotação do <indice>, no dia <data>, estava em <valor>, com variação de <percentual> em relação ao dia anterior.']}}, 'grupos': [2], 'id': 1234567, 'porta': '8090', 'status': 'trainning', 'instance_id': 'i-085b1b5307a2b2f9f', 'instance_ip': 'ip-172-31-82-149.ec2.internal'}


In [50]:
a = DB.model.find_one({'id': '309'})

In [77]:
DB.trainning.delete_many({})

<pymongo.results.DeleteResult at 0x7f945a132730>

In [30]:
DB.model_results

Collection(Database(MongoClient(host=['54.233.200.11:7017'], document_class=dict, tz_aware=False, connect=False, authsource='narrativas'), 'narrativas'), 'model_results')

In [34]:
socket.gethostname()

'bruno-G7-7588'

In [56]:
datetime.now()

datetime.datetime(2020, 3, 13, 8, 54, 32, 140743)

In [57]:
DB.trainning

Collection(Database(MongoClient(host=['54.233.200.11:7017'], document_class=dict, tz_aware=False, connect=False, authsource='narrativas'), 'narrativas'), 'trainning')

In [None]:
operation = DB.trainning.find_one(
    {'status': 'waiting', 'instance_ip': instance_ip})

In [75]:
instance_ip = 'ip-172-31-87-148.ec2.internal'

# buscar no mongo a operação (operation) pendente de treinamento (status waiting) com o ip dessa instancia da aws (instance_ip) - collection trainning
operation = DB.trainning.find_one(
    {'status': 'waiting', 'instance_ip': instance_ip})

# Atualizar o status dessa operação para "trainning" - collection trainning
DB.trainning.update(
    {"instance_ip": instance_ip, 'status': 'waiting'},
    {"$set": {
        "status": "trainning"
    }

    }
)

  # This is added back by InteractiveShellApp.init_path()


{'n': 0, 'nModified': 0, 'ok': 1.0, 'updatedExisting': False}