%pip install -U spacy
%python -m spacy download en_core_web_sm
%pip install --user -U nltk

In [2]:
import numpy as np
import pandas as pd
import spacy
import spacy.cli

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


import nltk
import string
from spacy import displacy
from spacy.tokens import Span


In [58]:
spacy.cli.download('en_core_web_lg')
nltk.download('stopwords')

Collecting en-core-web-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl (400.7 MB)
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dorival/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [5]:
nlp = spacy.load('en_core_web_sm')

texto = df.Sentence[0]

print('--------------')
print('Texto original')
print(texto)

doc = nlp(texto)
# Cada token que estiver dentro do documento você mantem caso não seja uma stopword ou pontuação(punch)  
tokens_filtrado = [token for token in doc if ((not token.is_stop) & (not token.is_punct))]

print('--------------')
print('Remoção de stopwords e pontuação')
print(tokens_filtrado)

lemmas = [token.lemma_.lower().strip() for token in tokens_filtrado]

print('--------------')
print('Lemmatization')
print(lemmas)

--------------
Texto original
The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model .
--------------
Remoção de stopwords e pontuação
[GeoSolutions, technology, leverage, Benefon, GPS, solutions, providing, Location, Based, Search, Technology, Communities, Platform, location, relevant, multimedia, content, new, powerful, commercial, model]
--------------
Lemmatization
['geosolutions', 'technology', 'leverage', 'benefon', 'gps', 'solution', 'provide', 'location', 'based', 'search', 'technology', 'communities', 'platform', 'location', 'relevant', 'multimedia', 'content', 'new', 'powerful', 'commercial', 'model']


----


### 1.a Construa as funções e a pipeline, separe os dados em treino e teste, execute a pipeline para classificar em positivo, negativo e neutro.

In [6]:
# Criamos uma classe para gerenciar X e y
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Esta funcao remove espacos em branco no inicio e
# no fim do texto e converte todo o texto em letras
# minusculas
def clean_text(texto):     
    return texto.strip().lower()

# Esta funcao remove todas as stopwords e pontuacoes
def tokenizer(texto):
    doc = nlp(texto)
    tokens = [token for token in doc if ((not token.is_stop) & (not token.is_punct))]
    tokens = [token.lemma_.lower().strip() for token in tokens]
    return tokens 
#criamos um objeto CountVectorizer para vetorizar cada texto
vectorizer = CountVectorizer(tokenizer = tokenizer, ngram_range=(1,1))

#criamos um modelo SVM
classifier = SVC()


In [7]:

# Separando em X e y
X = df.Sentence
y = df.Sentiment

# Separando em teste e treino
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Construindo uma pipeline
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])


pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)



1.b Quais foram os valores de acurácia, precisão e sensitividade deste modelo? (3.0 pontos)

In [8]:

print(round(accuracy_score(y_test, y_pred), 3))
print(round(precision_score(y_test, y_pred, average='macro'), 3))
print(round(recall_score(y_test, y_pred, average='macro'), 3))

0.681
0.596
0.526


----

##### 2 Use o seu modelo para classificar os seguintes textos extraídos do site Financial Times. Faça uma tabela com o valor esperado e o valor obtido, e responda: houve divergência entre o esperado e o obtido? O que poderia ser feito para corrigir? (1.0 ponto)

In [17]:
new_data_dict = {
                  "Sentence": ["Central banks’ rate rises, geopolitical risk and slowing growth trigger investors’ stampede for safety.",
                          "China opens up bond market in bid to woo foreign investors.",
                          "HM Revenue & Customs says residents had £850bn in accounts overseas but it does not estimate if tax paid on this.",
                          "Japan’s horrifying crop of data falsification is also encouraging. The scandals have emerged from a distinct new phase in the evolution of the country’s shareholder capitalism.",
                          "Despite internal problems, the group continues to exert a tight grip on the US’s gun control debate."],
                  "Sentiment": ["negative", "neutral", "negative", "negative", "neutral"]        
                          }
                          
new_data = pd.DataFrame(new_data_dict)

new_data.head()

Unnamed: 0,Sentence,Sentiment
0,"Central banks’ rate rises, geopolitical risk a...",negative
1,China opens up bond market in bid to woo forei...,neutral
2,HM Revenue & Customs says residents had £850bn...,negative
3,Japan’s horrifying crop of data falsification ...,negative
4,"Despite internal problems, the group continues...",neutral


In [None]:
y_pred2 = pipe.predict(new_data['Sentence'])
print(round(accuracy_score(new_data['Sentiment'], y_pred), 3))
print(round(precision_score(new_data['Sentiment'], y_pred, average='macro'), 3))
print(round(recall_score(new_data['Sentiment'], y_pred, average='macro'), 3))

In [51]:
sentences = new_data_dict['Sentence']
sentiments_or = new_data_dict['Sentiment']
sentiments_pred = y_pred2

values_dict = {"Sentence": sentences, "Original Sentiment": sentiments_or, "Predicted Sentiment": sentiments_pred}

values_df = pd.DataFrame(values_dict)

values_df['Divergence'] = values_df["Predicted Sentiment"] != values_df["Original Sentiment"]

values_df

Unnamed: 0,Sentence,Original Sentiment,Predicted Sentiment,Divergence
0,"Central banks’ rate rises, geopolitical risk a...",negative,positive,True
1,China opens up bond market in bid to woo forei...,neutral,neutral,False
2,HM Revenue & Customs says residents had £850bn...,negative,neutral,True
3,Japan’s horrifying crop of data falsification ...,negative,neutral,True
4,"Despite internal problems, the group continues...",neutral,neutral,False


In [52]:
values_df['Divergence'].value_counts()

True     3
False    2
Name: Divergence, dtype: int64

----

#### 3 Faça uma análise exploratória, onde identifique as três empresas mais citadas e quantifique os níveis de positividade, negatividade e neutralidade dos textos sobre estas empresas. (3.0 pontos)


a. Extraia de todos os textos as entidades, há quantas entidades? (0.6 pontos) 


In [59]:
entities = 0
for row in range(len(df['Sentence'])):
  doc = nlp(df['Sentence'][row])
  for ent in doc.ents:
    entities += 1

           
print(entities)

14467


b. Quantas entidades são empresas? (0.6 pontos)


In [57]:
orgs = {'original_index': [], 'entity': [], 'sentiment': []}
for row in range(len(df['Sentence'])):
  doc = nlp(df['Sentence'][row])
  for ent in doc.ents:
    if ent.label_ == 'ORG':
      orgs['original_index'].append(row)
      orgs['entity'].append(ent.text)
      orgs['sentiment'].append(df['Sentence'][row])

ents = pd.DataFrame(orgs)
ents.head()

Unnamed: 0,original_index,entity,sentiment
0,0,GeoSolutions,The GeoSolutions technology will leverage Bene...
1,0,Location Based Search Technology,The GeoSolutions technology will leverage Bene...
2,1,ESI,"$ESI on lows, down $1.50 to $2.50 BK a real po..."
3,1,BK,"$ESI on lows, down $1.50 to $2.50 BK a real po..."
4,5,SPY,$SPY wouldn't be surprised to see a green close


 c. Quais são as três empresas mais citadas? (0.6 pontos) 
 

d. Faça uma tabela onde demonstre as três empresas mais citadas e o total de textos positivos, negativos e neutros de cada uma. (1.2 pontos)

In [9]:
#pegar uma lista com palavras que se repetem e deixar só a original

lista = ['1', '2', '3', '1']
lista = list(set(lista))
print(lista)

['2', '3', '1']


In [10]:
doc2 = nlp('Eu tenho usado o serviço de armazenamento na nuvem da Google, é a opção mais barata no Brasil, pago somente R$ 9.99 por mês.')

for ent in doc2.ents:
    print(ent.text, ent.label_)

Eu ORG
mais barata PERSON
Brasil PERSON
9.99 MONEY


----