In [None]:
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the survey export; the file is read as semicolon-delimited.
# NOTE(review): absolute '/content/...' path — presumably the CSV is uploaded
# to a Colab session first; confirm before running elsewhere.
dfcomentario = pd.read_csv('/content/Encuesta_Comunidades.csv' , sep=';')
# Bare last expression: the notebook renders the whole frame as cell output.
dfcomentario

In [None]:
#TO DO
# 1) Limpieza del texto
from unicodedata import normalize
import re
# Define una funcion para limpiar el texto y devolverlo en minusculas
def clean(text):
    """Reduce a comment to lowercase letters separated by single spaces.

    Accents are stripped (``é`` -> ``e``) but the Spanish letter ``ñ`` is
    preserved; every run of other characters (digits, punctuation,
    whitespace) collapses to one space.

    Parameters
    ----------
    text : str or None or float NaN
        Raw comment. ``None``/NaN (how pandas represents a missing cell) are
        treated as an empty comment; any other value is converted with ``str``.

    Returns
    -------
    str
        The cleaned, lowercased text. Leading/trailing spaces produced by
        stripped punctuation are kept.
    """
    # Missing answers reach .apply() as None/NaN; unicodedata.normalize would
    # raise TypeError on them.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Decompose accented characters (NFD) and drop every combining mark except
    # the tilde that turns "n" into "ñ".
    text = re.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
        r"\1",
        normalize("NFD", str(text)),
        flags=re.I,
    )
    # Recompose so "n" + combining tilde is the single letter "ñ" again.
    # Without this step the filter below turned the tilde into a space
    # ("año" -> "an o").
    text = normalize("NFC", text)
    # Keep only ASCII letters plus ñ/Ñ; everything else becomes one space.
    text = re.sub(r"[^A-Za-z\u00f1\u00d1]+", " ", text)
    return text.lower()

# Limpia el texto en la columna comentario
# Store the cleaned version of every comment in a new column; the raw
# 'Comentario' column is left untouched.
dfcomentario['Comentario_Limpio'] = dfcomentario['Comentario'].apply(clean)
# Preview the first rows to check the new column.
dfcomentario.head()

In [None]:
# 2-4) Tokenization, POS tagging, stopword removal

import nltk
# Each nltk.download call fetches the resource named in its argument; the
# calls are interleaved with the imports that use those resources.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')

# POS tagger dictionary: keyed by the first letter of a tag as returned by
# pos_tag, valued with the matching WordNet POS constant.
# Tags starting with any other letter are not mapped.
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

def token_stop_pos(text):
    """Tokenize ``text``, drop Spanish stopwords and pair each token with a WordNet POS.

    Parameters
    ----------
    text : str
        Cleaned comment text.

    Returns
    -------
    list of tuple
        ``(word, pos)`` pairs in original order. ``pos`` is the ``pos_dict``
        entry for the first letter of the token's tag, or ``None`` when that
        letter is not in ``pos_dict``.
    """
    # Build the stopword set once per call; the original rebuilt it for every
    # single token inside the loop.
    spanish_stopwords = set(stopwords.words('spanish'))

    # NOTE(review): pos_tag is called without a language argument, so it
    # presumably applies NLTK's default (English) tagger to Spanish text —
    # confirm the tags are meaningful.
    tags = pos_tag(word_tokenize(text))

    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in tags
        if word.lower() not in spanish_stopwords
    ]

In [None]:
# Tokenize, filter and tag every cleaned comment; each cell of the new column
# holds the list of (word, pos) tuples returned by token_stop_pos.
dfcomentario['POS_tagged'] = dfcomentario['Comentario_Limpio'].apply(token_stop_pos)
dfcomentario.head()

In [None]:
# 5) Obtaining the root words – lemmatization

from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance shared by the lemmatize() function below.
# NOTE(review): WordNet lemmatization targets English; presumably it leaves
# most Spanish words unchanged — confirm it is the intended tool.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize(pos_data):
  """Join the lemma of every ``(word, pos)`` pair into one string.

  Parameters
  ----------
  pos_data : iterable of (str, str or None)
      Pairs as produced by ``token_stop_pos``. Words whose ``pos`` is falsy
      are kept verbatim; the rest go through ``wordnet_lemmatizer``.

  Returns
  -------
  str
      A leading space followed by ``" " + lemma`` for each pair, the layout
      the original accumulator produced. Empty input yields ``" "``.
  """
  lemmas = [
      wordnet_lemmatizer.lemmatize(word, pos=pos) if pos else word
      for word, pos in pos_data
  ]
  # The original had `return` indented inside the loop, so it returned after
  # the first word (and returned None for empty input). Build the full string
  # only after every pair has been processed.
  return " " + "".join(" " + lemma for lemma in lemmas)

# Build one lemma string per comment from its POS-tagged token list.
dfcomentario['Lemma'] = dfcomentario['POS_tagged'].apply(lemmatize)
dfcomentario.head()

In [None]:
# Show the cleaned text next to its lemmatized form for visual comparison.
dfcomentario[['Comentario_Limpio', 'Lemma']]

In [None]:
!pip install pysentimiento

In [None]:
from pysentimiento import create_analyzer
import transformers

# Only show transformers log messages at ERROR level.
transformers.logging.set_verbosity(transformers.logging.ERROR)

# Spanish sentiment analyzer used by the helper functions in the next cell.
analyzer = create_analyzer(task="sentiment", lang="es")

In [None]:
def obtener_sentimiento_completo(comentario):
  """Return the predicted label and every class probability as one Series.

  The Series has a ``'sentiment'`` entry holding the predicted label,
  followed by one entry per key of the prediction's ``probas`` mapping.
  """
  prediccion = analyzer.predict(comentario)
  campos = {'sentiment': prediccion.output}
  campos.update(prediccion.probas)
  return pd.Series(campos)

def obtener_sentimiento(comentario):
  """Return only the sentiment label predicted for ``comentario``."""
  return analyzer.predict(comentario).output

def obtener_polaridad(comentario):
  """Return the probability of the most likely sentiment class.

  Parameters
  ----------
  comentario : str
      Text passed to ``analyzer.predict``.

  Returns
  -------
  float
      The largest of the ``"POS"``, ``"NEG"`` and ``"NEU"`` probabilities.
      This is the confidence in the winning class, not a signed polarity.
  """
  res = analyzer.predict(comentario)
  # The original chain of strict comparisons matched no branch when the top
  # two classes tied, leaving the score at 0; max() returns the tied value.
  return max(res.probas["POS"], res.probas["NEG"], res.probas["NEU"])


In [None]:
# Run the transformer once per comment and derive both columns from the same
# prediction. The original applied obtener_sentimiento and obtener_polaridad
# separately, so every comment went through the model twice.
predicciones = [analyzer.predict(comentario) for comentario in dfcomentario['Comentario_Limpio']]

# Predicted label for each comment.
dfcomentario['Analisis'] = [res.output for res in predicciones]
# Probability of the winning class for each comment.
dfcomentario['Polaridad'] = [
    max(res.probas["POS"], res.probas["NEG"], res.probas["NEU"])
    for res in predicciones
]

In [None]:
# Display the frame, now including the 'Analisis' and 'Polaridad' columns.
dfcomentario

In [None]:
#TO DO
#Term Frequency - Inverse Document Frequency (TF-IDF) Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary capped at 5000 features; ngram_range=(2, 2) means only bigrams
# (two-word sequences) become features, no single words.
tfidf = TfidfVectorizer(max_features = 5000, ngram_range = (2, 2))
# NOTE(review): fitted on the cleaned text, not on the 'Lemma' column, and on
# the whole corpus before any train/test split — confirm both are intended.
X = tfidf.fit_transform(dfcomentario["Comentario_Limpio"])
# (number of comments, number of bigram features)
X.shape

In [None]:
#Feature Engineering
#Encoding Sentiment variable
#LabelEncoder codifica etiquetas asignándoles números

from sklearn.preprocessing import LabelEncoder


Encoder = LabelEncoder()
# Keep the text labels in 'Analisis_Texto' before 'Analisis' is overwritten
# with integer codes. Copy only while 'Analisis' still holds text: on a re-run
# of this cell it already holds integers, and copying them (as the original
# did) would destroy the text labels that later cells filter on.
if not pd.api.types.is_numeric_dtype(dfcomentario["Analisis"]):
    dfcomentario["Analisis_Texto"] = dfcomentario["Analisis"]
# Always encode from the preserved text labels so the cell is idempotent.
dfcomentario["Analisis"] = Encoder.fit_transform(dfcomentario["Analisis_Texto"])
# Number of comments per encoded class.
dfcomentario["Analisis"].value_counts()

In [None]:
# Target vector: the integer-encoded sentiment label of each comment.
y = dfcomentario['Analisis']
y

In [None]:
#Balance the imbalanced dataset
from collections import Counter

# Occurrences of each encoded class before any resampling.
Counter(y)

In [None]:
from imblearn.over_sampling import SMOTE

Balancer = SMOTE(random_state = 42)
X_final, y_final = Balancer.fit_resample(X, y)

In [None]:
Counter(y_final)

In [None]:
#TO DO
#Entrenar un modelo de clasificación adecuado sobre los datos procesados ​​para la clasificación de sentimientos

#Separar los datos en entrenamiento y prueba
#Model Selection
#Split the dataset

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.20, random_state = 42)

In [None]:
# Train the model

from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, fitted on the
# training split.
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

In [None]:
# Compute the model's accuracy score on the test split

from sklearn import metrics
predicted = MNB.predict(X_test)

# Stored as `mnb_accuracy` so the float does not take the name of sklearn's
# accuracy_score function; arguments are in the documented (y_true, y_pred) order.
mnb_accuracy = metrics.accuracy_score(y_test, predicted)
print("Accuracuy Score: ",mnb_accuracy)

In [None]:
#Confusion Matrix

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Called as confusion_matrix(y_true, y_pred): by sklearn's convention rows are
# the true labels and columns the predicted labels.
ConfusionMatrix = confusion_matrix(y_test, predicted )

In [None]:
# Plotting Function for Confusion Matrix

import matplotlib.pyplot as plt
%matplotlib inline
# NOTE(review): `colors` is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
colors = ['#4F6272', '#B7C3F3', '#DD7596']

def plot_cm(cm, classes, title, normalized = False, cmap = plt.cm.BuPu):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix; rows are labelled "Etiqueta real" and columns
        "Etiqueta predicha".
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    title : str
        Figure title.
    normalized : bool, default False
        When True, each row is divided by its total so cells show the
        fraction of that true class.
    cmap : matplotlib colormap, default ``plt.cm.BuPu``
        Colormap for the heat map.

    Returns
    -------
    None
        The plot is drawn through the pyplot state machine.
    """
    import numpy as np

    cm = np.asarray(cm)
    if normalized:
        # `[:, np.newaxis]` turns the row totals into a column vector so each
        # row is divided by its own total. The original wrote
        # `[: np.newaxis]` (a plain slice), which divided column j by the
        # total of row j instead.
        cm = cm.astype('float') / cm.sum(axis = 1)[:, np.newaxis]
        print("Matriz de Confusion Normalizada")
    else:
        print("Matriz de Confusion No-Normalizada")

    # Draw after normalizing so the colours match the annotated values.
    plt.imshow(cm, interpolation = "nearest", cmap = cmap)
    plt.title(title, pad = 20)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # White text on dark cells, black on light ones.
    threshold = cm.max() / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # Two decimals for fractions; raw counts are shown as they are.
            label = format(cm[i, j], ".2f") if normalized else cm[i, j]
            plt.text(j, i, label, horizontalalignment = "center", color = "white" if cm[i, j] > threshold else "black")

    plt.tight_layout()
    plt.xlabel("Etiqueta predicha", labelpad = 20)
    plt.ylabel("Etiqueta real", labelpad = 20)

In [None]:
# The confusion matrix rows/columns follow the sorted encoded labels, and
# LabelEncoder assigns codes in the sorted order of the original labels
# (exposed as Encoder.classes_: NEG, NEU, POS). Derive the tick labels from
# that order; the original hard-coded Positivo/Neutral/Negativo, which swapped
# the positive and negative classes on the plot.
etiquetas = {"NEG": "Negativo", "NEU": "Neutral", "POS": "Positivo"}
clases = [etiquetas.get(clase, str(clase)) for clase in Encoder.classes_]

plot_cm(ConfusionMatrix, classes = clases, title = "Matriz de Confusion del Analisis de Sentimiento")
plt.tight_layout()
# The original name 'matriz_confusion-png' had a hyphen where the extension
# dot belongs, which produced a file called 'matriz_confusion-png.png'.
plt.savefig('matriz_confusion.png', dpi=300)

In [None]:
# Text report of per-class metrics on the test split; printed because the
# report is a pre-formatted multi-line string.
print(classification_report(y_test, predicted))

In [None]:
#Model building

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score


# Candidate classifiers, compared with 10-fold cross-validation.
# Randomized estimators get a fixed seed so the scores are reproducible.
dt = DecisionTreeClassifier(random_state = 42)
# Lower-case name: the original `SVC = SVC()` replaced the imported class with
# an instance of itself.
svc = SVC()
rf = RandomForestClassifier(random_state = 42)
Bayes = BernoulliNB()
KNN = KNeighborsClassifier()

# LogisticRegression is imported above but was left out of the comparison by
# the original author.
Models = [dt, svc, rf, Bayes, KNN]

# Display name for each position in Models.
Models_Dict = {0: "Decision Tree", 1: "SVC", 2: "Random Forest", 3: "Naive Bayes", 4: "K-Neighbors"}

# Scores are computed on the unbalanced X, y. The printed label says "Test
# Accuracy" but the value is the mean accuracy across the 10 folds.
for i, model in enumerate(Models):
  print("{} Test Accuracy: {}".format(Models_Dict[i], cross_val_score(model, X, y, cv = 10, scoring = "accuracy").mean()))

In [None]:
# Persist the enriched frame to the working directory. With pandas' default
# settings the row index is written as the first column.
dfcomentario.to_csv('dfcomentario.csv')

In [None]:
# Preview the first rows of the final frame.
dfcomentario.head()

In [None]:
# Negative comments per locality: filter on the text label, then count the
# non-null values of every column within each 'Localidad' group.
solo_negativos = dfcomentario.loc[dfcomentario['Analisis_Texto'] == 'NEG']
df1_neg = solo_negativos.groupby(['Localidad']).agg(['count'])
# The 'NombreCompleto' counts serve as the per-locality total.
df1_neg['NombreCompleto']

In [None]:
# Positive comments per locality: filter on the text label, then count the
# non-null values of every column within each 'Localidad' group.
solo_positivos = dfcomentario.loc[dfcomentario['Analisis_Texto'] == 'POS']
df1_pos = solo_positivos.groupby(['Localidad']).agg(['count'])
# The 'NombreCompleto' counts serve as the per-locality total.
df1_pos['NombreCompleto']

In [None]:
# Neutral comments per locality: filter on the text label, then count the
# non-null values of every column within each 'Localidad' group.
solo_neutrales = dfcomentario.loc[dfcomentario['Analisis_Texto'] == 'NEU']
df1_neu = solo_neutrales.groupby(['Localidad']).agg(['count'])
# The 'NombreCompleto' counts serve as the per-locality total.
df1_neu['NombreCompleto']