<a href="https://colab.research.google.com/github/AraRG2809/Natural_Language_Processing/blob/main/ARA_Proyecto_An%C3%A1lisis_de_Sentimientos_con_SpaCy_y_Sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Análisis de sentimientos con SpaCy y Sklearn
+ Análisis de sentimientos
+ + DataSet: http://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

##### El objetivo es clasificar las reviews en positiva o negativa


In [14]:
import pandas as pd

In [16]:
# Cargamos dataset
df_yelp = pd.read_table('yelp_labelled.txt')
df_imdb = pd.read_table('imdb_labelled.txt')
df_amz = pd.read_table('amazon_cells_labelled.txt')


In [17]:
# Unimos los datasets
frames = [df_yelp,df_imdb,df_amz]

In [18]:
# Renombramos las columnas de la cabecera
for colname in frames:
    colname.columns = ["Message","Target"]

In [19]:
# Nombre de las columnas
for colname in frames:
    print(colname.columns)

Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')


In [20]:
# Asignamos claves para hacerlo más fácil
keys = ['Yelp','IMDB','Amazon']

In [21]:
# Concatenamos las claves y los dataframes
df = pd.concat(frames,keys=keys)

In [22]:
df.head()

Unnamed: 0,Unnamed: 1,Message,Target
Yelp,0,Crust is not good.,0
Yelp,1,Not tasty and the texture was just nasty.,0
Yelp,2,Stopped by during the late May bank holiday of...,1
Yelp,3,The selection on the menu was great and so wer...,1
Yelp,4,Now I am getting angry and I want my damn pho.,0



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



###  Trabajando con SpaCy

+ Instalar SpaCy, descargar modelo e importar

In [23]:
# instalamos librería spacy
!pip install -U pip setuptools wheel
!pip install -U spacy

# descargamos modelo español
!python -m spacy download en_core_web_sm
# !python -m spacy download es_core_news_sm

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Collecting setuptools
  Downloading setuptools-74.0.0-py3-none-any.whl.metadata (6.7 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Downloading setuptools-74.0.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 71.0.4
    Uninstalling setuptools-71.0.4:
      Successfully uninstalled setuptools-71.0.4
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [24]:
import spacy
import re

nlp = spacy.load("en_core_web_sm")

+ Limpiar texto

In [25]:
def clean_text(texto):
  # Eliminar caracteres especiales y símbolos
  texto_limpio = re.sub(r'[^\w\s]', '', texto)

  # Eliminar números
  texto_limpio = re.sub(r'\d+', '', texto_limpio)

  # Eliminar hashtags
  texto_limpio = re.sub(r'#\w+', '', texto_limpio)

  return texto_limpio

+ Normalizar texto

In [26]:
def normalize_text(texto):
  text_norm = texto.lower()

  return text_norm

+ Eliminar Stopwords

In [27]:
def remove_stopwords(texto):
    doc = nlp(texto)
    tokens_sin_stopwords = [token.text for token in doc if not token.is_stop]
    texto_limpio = " ".join(tokens_sin_stopwords)
    return texto_limpio

+ Obtener Lemma

In [28]:
def lemmatization(texto):
  doc = nlp(texto)

  # Paso 3: Eliminar stopwords, lematizar y obtener una lista de tokens
  tokens = [token.lemma_ for token in doc]

##### Procesamos el texto completo

In [29]:
def spacy_tokenizer(texto):
  # Paso 0: Limpiar texto
  texto = clean_text(texto)

  # Paso 1: Convertir el texto a minúsculas (normalización)
  texto = normalize_text(texto)

  # Paso 2: Procesar el texto con spaCy
  doc = nlp(texto)

  # Paso 3: Eliminar stopwords, lematizar y obtener una lista de tokens
  tokens = [token.lemma_ for token in doc if not token.is_stop]

  return tokens

#### Vectorizamos

In [30]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [31]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [32]:
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

#### Machine Learning With SKlearn

In [33]:
# paquetes ML
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [34]:
# Transformación Custom con Spacy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

def clean_text(text):
    return text.strip().lower()

In [35]:
classifier = LinearSVC()

In [36]:
# Dividimos Data Set
from sklearn.model_selection import train_test_split

In [37]:
# Features y Labels
X = df['Message']
ylabels = df['Target']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.25, random_state=42)

In [39]:
# Creamos el pipelien en sklearn para limpiar, tokenizar, vectorizar y clasificar
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [40]:
# Fit our data
pipe.fit(X_train,y_train)



In [41]:
# Accuracy
print("Accuracy: ",pipe.score(X_train,y_train))


Accuracy:  0.9868804664723032


In [42]:
# Another random review
pipe.predict(["I recommend this movie to watch, it's great"])

array([1])

In [43]:
example = ["I love this product so much",
 "What an inferior item! I will purchase a new one",
 "I feel happy when using your product!",
 "In my experience, the product is quite useless. After 2 weeks of use it broke."
           ]


In [44]:
pipe.predict(example)

array([1, 0, 1, 0])