In [None]:
!pip install -U sentence-transformers

### Libraries 

In [2]:
import os
import re, unicodedata
import warnings
from collections import Counter
from itertools import chain
from pprint import pprint

import dask.dataframe as ddf
import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import spacy

from nltk.stem.porter import *
from dask import compute
from dask import delayed
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from networkx.algorithms import community
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import TweetTokenizer
from prettytable import PrettyTable
from rpy2.robjects.packages import importr
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from wordcloud import WordCloud

# Ignore any warning whose message starts with 'SelectableGroups dict interface'.
warnings.filterwarnings('ignore', 'SelectableGroups dict interface')



In [None]:
# Change the working directory; relative file names used afterwards (such as
# dataset_name) are resolved against it. 'input_path' looks like a placeholder
# to be replaced with the real folder -- confirm.
os.chdir('input_path')

### Default values

In [None]:
# Notebook-wide settings. The values look like placeholders to be replaced
# before running -- confirm.
dataset_name = 'file_name.format'  # file read with pd.read_excel in "Load Dataset"
col_name = 'col_name'              # column handed to clean_text as `variable`
project_name = 'project_name_'     # NOTE(review): not used in the cells shown here

### Load Dataset

In [None]:
# Read the Excel file named by dataset_name using the openpyxl engine.
df = pd.read_excel(dataset_name, engine='openpyxl')

# The dimensions are taken before re-indexing, so 'Response ID' is still
# counted as a column in the figures printed below.
nRow = len(df.index)
nCol = len(df.columns)

df = df.set_index('Response ID')
print(f'There are {nRow} rows and {nCol} columns')

### Preprocessing
#### Stopword list

In [None]:
# Spanish stopwords from the NLTK corpus, copied into a list; remove_stopwords
# reads this module-level name.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded
# already -- confirm.
stop_words = list(stopwords.words("spanish"))

#### Preprocessing pipeline

In [None]:
# Tokenizer used by tokenizador: strips @handles and shortens runs of repeated
# characters (strip_handles / reduce_len).
tknzr = TweetTokenizer(reduce_len=True, strip_handles=True)

# spaCy pipeline used by lemmatization for POS tags and lemmas.
nlp = spacy.load('es_core_news_lg')

def tokenizador(text):
    """Tokenize ``text`` with the module-level ``tknzr`` and re-join the tokens.

    Args:
        text: Raw response. It is coerced with ``str()`` first, matching what
            ``sent_to_words`` and ``remove_stopwords`` do, so a non-string cell
            (for example a number read from the spreadsheet) does not make the
            tokenizer raise.

    Returns:
        str: The tokens returned by ``tknzr.tokenize`` separated by single spaces.
    """
    return ' '.join(tknzr.tokenize(str(text)))

def sent_to_words(sentence):
    """Split ``sentence`` into tokens with gensim's ``simple_preprocess``.

    The input is converted with ``str()`` and processed with ``deacc=True``
    (accent marks removed) and ``min_len=4`` (shorter tokens discarded).

    Returns:
        list: The tokens produced by ``simple_preprocess``.
    """
    as_text = str(sentence)
    tokens = gensim.utils.simple_preprocess(as_text, deacc=True, min_len=4)
    return tokens

def remove_stopwords(doc):
    """Drop stopwords from ``doc`` and return the remaining words as one string.

    Words are compared against the module-level ``stop_words`` and against the
    accent-stripped form of each stopword. ``clean_text`` calls this after
    ``sent_to_words``, which has already removed accents, so without the second
    comparison stopwords such as "también" or "están" would arrive as
    "tambien" / "estan" and never match.

    Args:
        doc: Either a string or a list/tuple of tokens. Anything else is
            converted with ``str()``.

    Returns:
        str: The surviving words joined by single spaces.
    """
    # Join token sequences instead of tokenizing their repr, so brackets, quotes
    # and escape sequences from ``str(list)`` never reach the tokenizer.
    if isinstance(doc, (list, tuple)):
        text = ' '.join(str(item) for item in doc)
    else:
        text = str(doc)

    # Built on every call so later changes to ``stop_words`` are honored; a set
    # also replaces the per-word scan of the list.
    blocked = set(stop_words)
    blocked.update(gensim.utils.deaccent(word) for word in stop_words)

    # Tokens from simple_preprocess contain no whitespace, so no strip() is needed.
    return ' '.join(word for word in simple_preprocess(text) if word not in blocked)

def lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','INTJ']): 
    """Return the lemmas of the tokens in ``text`` whose POS tag is allowed.

    Args:
        text: String handed to the module-level spaCy pipeline ``nlp``.
        allowed_postags: Values of ``token.pos_`` to keep. The default list is
            only read, never modified.

    Returns:
        list: ``token.lemma_`` for every kept token, in document order.
    """
    lemmas = []
    for token in nlp(text):
        if token.pos_ in allowed_postags:
            lemmas.append(token.lemma_)
    return lemmas

def clean_text(df, variable = 'ABIERTA CON ORTOGRAFÍA'):
    """Run the preprocessing chain over one column of ``df``.

    Non-missing values of ``df[variable]`` go through ``tokenizador``,
    ``sent_to_words``, ``remove_stopwords`` and ``lemmatization``, in that
    order. Rows whose final lemma list is empty are dropped.

    Args:
        df: DataFrame holding the text column.
        variable: Name of the column to clean.

    Returns:
        The mapped column restricted to rows with at least one lemma.
    """
    column = df[variable]
    data_lemmatized = column[~column.isna()]

    # Apply the steps one after another; each consumes the previous step's output.
    for step in (tokenizador, sent_to_words, remove_stopwords, lemmatization):
        data_lemmatized = data_lemmatized.map(step)

    has_lemmas = data_lemmatized.map(len) > 0
    return data_lemmatized[has_lemmas]

In [None]:
%%time

# Run the preprocessing chain on the column configured in col_name; %%time
# reports how long the cell takes.
data_lemmatized = clean_text(df,variable = col_name)

### EDA

In [None]:
# Flatten the per-document lemma lists into a single list and count each lemma.
words = list(chain.from_iterable(data_lemmatized))
c = Counter(words)

# Table with the ten most frequent lemmas.
pt = PrettyTable(field_names=['Palabra', 'Conteo'])
for palabra_conteo in c.most_common(10):
    pt.add_row(palabra_conteo)

pt.align['Palabra'] = 'l'
pt.align['Conteo'] = 'r'
pt._max_width = {'Palabra':60, 'Conteo':10}
print(pt)

In [None]:
# Build one string out of every lemma and generate the word cloud from it.
str_words = ' '.join(words)
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
).generate(str_words)

# Show the cloud on a square figure with the axes hidden and no padding.
fig, ax = plt.subplots(figsize=(10, 10), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)

plt.show()

### Sentiment Analysis

In [None]:
# One space-joined string per document, rebuilt from its list of lemmas.
documents = [' '.join(lemmas) for lemmas in data_lemmatized]

#### 5 stars review format

In [None]:
# Star-rating sentiment classifier.
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The text-classification pipeline needs a sequence-classification model.
# AutoModelForCausalLM attaches a language-modeling head instead, so its output
# cannot be turned into class labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_clf = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
sentiment_clf('el servicio es increiblemente malo')

Downloading:   0%|          | 0.00/334 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/838k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/415M [00:00<?, ?B/s]

In [None]:
# Removed: this cell used to run `sentiment_clf = generator`, but no cell in the
# notebook defines `generator`, so it raised NameError on Restart & Run All.
# `sentiment_clf` keeps pointing at the pipeline built in the previous cell.

#### Sentiment classification

In [None]:
# Sentiment classifier from pysentimiento.
model_name = 'pysentimiento/robertuito-sentiment-analysis'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The text-classification pipeline needs a sequence-classification model.
# AutoModelForCausalLM attaches a language-modeling head instead, so its output
# cannot be turned into class labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_clf = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
sentiment_clf('el servicio es increiblemente malo')

If the score obtained here predicts a negative sentiment, we could then apply a hate-speech classifier with the following classes:

* HS: is it hate speech?
* TR: is it targeted to a specific individual?
* AG: is it aggressive?

In [None]:
# Hate-speech classifier from pysentimiento.
# NOTE(review): `sentiment_clf` is reused for this classifier, replacing the
# sentiment pipeline built earlier -- confirm that is intended.
model_name = 'pysentimiento/robertuito-hate-speech'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The text-classification pipeline needs a sequence-classification model.
# AutoModelForCausalLM attaches a language-modeling head instead, so its output
# cannot be turned into class labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_clf = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
sentiment_clf('el servicio es increiblemente malo')

#### Emotion classification

In [None]:
# Emotion classifier from pysentimiento.
# NOTE(review): `sentiment_clf` is reused for this classifier, replacing the
# pipeline built in the previous cell -- confirm that is intended.
model_name = 'pysentimiento/robertuito-emotion-analysis'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The text-classification pipeline needs a sequence-classification model.
# AutoModelForCausalLM attaches a language-modeling head instead, so its output
# cannot be turned into class labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_clf = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
sentiment_clf('el servicio es increiblemente malo')