In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
# Load the tab-separated reviews. quoting=3 is csv.QUOTE_NONE, so the parser
# treats double quotes as ordinary characters instead of field delimiters.
dataset = pd.read_csv(
    './data/Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


### Limpieza del texto

In [13]:
# Text-cleaning dependencies: regular expressions plus NLTK's stopword corpus
# and Porter stemmer.
import re
import nltk
# Fetch the 'stopwords' package used by the `stopwords` reader imported below.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Walk through the cleaning pipeline on the first review, printing the text
# after every stage.

# Build the stopword set once up front; constructing set(stopwords.words(...))
# inside a comprehension would repeat that work for every single word.
# NOTE(review): the NLTK English list is expected to include negations such as
# "not"; dropping them can invert a review's sentiment - confirm and consider
# excluding them from this set.
english_stopwords = set(stopwords.words('english'))

# replace every character that is not an ASCII letter (? : ... !) with a space
review = re.sub('[^a-zA-Z]', ' ', dataset.Review[0])
print(dataset.Review[0])
print(review)
# lowercase everything
review = review.lower()
print(review)
# split into a list of words
review = review.split()
print(review)
# drop the stopwords (words that carry no useful signal)
review = [word for word in review if word not in english_stopwords]
print(review)
# reduce each remaining word to its stem (stemming); the list is already
# stopword-free, so no second filter is needed here
ps = PorterStemmer()
review = [ps.stem(word) for word in review]
print(review)
# join the stems back into a single string
review = ' '.join(review)
print(review)


Wow... Loved this place.
Wow    Loved this place 
wow    loved this place 
['wow', 'loved', 'this', 'place']
['wow', 'loved', 'place']
['wow', 'love', 'place']
wow love place


### Generalizamos el código individual

In [15]:
def _clean_review(text, stemmer, stop_words):
    """Return `text` reduced to its lowercase, stemmed, non-stopword tokens.

    Args:
        text: Raw review string.
        stemmer: Object exposing ``stem(word)``; a PorterStemmer in this notebook.
        stop_words: Container of words to drop (checked before stemming).

    Returns:
        The surviving stems joined by single spaces.
    """
    # Anything that is not an ASCII letter becomes a space, so punctuation and
    # digits act as token separators.
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)


# Build the stemmer and the stopword set once: neither depends on the review
# being processed, and set membership tests are cheap.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

# One cleaned string per review, in the same order as the rows of `dataset`.
corpus = [_clean_review(text, ps, english_stopwords) for text in dataset.Review]

### Creando el Bag of Words

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

'''
Nota: 
- También se puede hacer el prepocesado con los parametros 
  de la función CountVectorizer

- eliminamos palabras no relevantes (como los nombres de las personas) 
  para reducir la dimensionalidad del problema (max_features)
'''
# Bag of words: every cleaned review becomes one row of token counts.
# max_features caps the vocabulary at 1200 terms, which drops rare words and
# keeps the number of columns manageable.
cv = CountVectorizer(max_features=1200)
# fit_transform yields a sparse matrix; convert it to a dense array for the
# classifier trained later in the notebook.
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()
X.shape

(1000, 1200)

Ahora que ya tenemos nuestra bolsa de palabras vectorizadas, solo nos queda entrenar con algún algoritmo de clasificación.

In [17]:
# Target vector: the `Liked` column (1 = positive review, 0 = negative),
# selected by name so it does not depend on the column order of the file.
y = dataset['Liked'].values

### Ajustamos el modelo de clasificación a usar

En este caso usaremos el de Naive Bayes

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [21]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes model on the training portion of the bag of words.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

In [22]:
from sklearn.metrics import confusion_matrix

# Predict the held-out reviews, then tabulate true labels (rows) against
# predicted labels (columns).
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[55, 42],
       [12, 91]])

In [25]:
# Accuracy: correctly classified test reviews (the diagonal of the confusion
# matrix) divided by the total number of test reviews.
np.trace(cm) / cm.sum()

0.73