In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

documents = ["This little kitty came to play when I was eating at a restaurant.",
             "Merley has the best squooshy kitten belly.",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

print("\n")
print("Prediction")

Y = vectorizer.transform(["chrome browser to open."])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)

Top terms per cluster:
Cluster 0:
 google
 cat
 best
 climbing
 ninja
 incredible
 app
 translate
 impressed
 map
Cluster 1:
 100
 open
 tab
 smiley
 face
 google
 feedback
 extension
 eating
 climbing


Prediction
[0]
[0]


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ahuangfeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:

if not sys.warnoptions:
    warnings.simplefilter("ignore")

def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

def removeStopWords(sentence):
    global stopwordsSpanish
    # Check characters to see if they are in punctuation
    nopunc = [char for char in sentence if char not in string.punctuation]
    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    splited = [word for word in nopunc.split() if word.lower() not in stopwordsSpanish]
    return ' '.join(splited)

def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence


In [22]:
#yelp = pd.read_csv('reviews_spain_total.csv')
yelp = pd.read_csv('data_world_bbva.csv')
yelp.head()


Unnamed: 0,id_review,caption,relative_date,retrieval_date,rating,username,n_review_user,n_photo_user,url_user,city,country
0,ChdDSUhNMG9nS0VJQ0FnSURDdWYtLWpRRRAB,Muy buena atencion,Hace una semana,2020-10-17 16:18:07.836052,5.0,luis gabriel pereyra diaz,0,0,https://www.google.com/maps/contrib/1105795026...,Ciudad de Buenos Aires Argentina,Argentina
1,ChdDSUhNMG9nS0VJQ0FnSURzb2NLTV9RRRAB,,Hace 7 meses,2020-10-17 16:18:07.836052,3.0,Maximiliano Freire,0,0,https://www.google.com/maps/contrib/1067034616...,Ciudad de Buenos Aires Argentina,Argentina
2,ChdDSUhNMG9nS0VJQ0FnSURNa0tuUy1BRRAB,Este Banco (todas las sucursales) tiene los re...,Hace 10 meses,2020-10-17 16:18:07.837051,1.0,i Lix,5,0,https://www.google.com/maps/contrib/1124264758...,Ciudad de Buenos Aires Argentina,Argentina
3,ChZDSUhNMG9nS0VJQ0FnSURNalBqUFdBEAE,Nunca me atendio tan mal el personal de recepc...,Hace 11 meses,2020-10-17 16:18:07.838052,1.0,Gonzalo Pereira,6,0,https://www.google.com/maps/contrib/1109467455...,Ciudad de Buenos Aires Argentina,Argentina
4,ChdDSUhNMG9nS0VJQ0FnSUMwejh6ZWlRRRAB,"TE OFRECEN MILES DE ""BENEFICIOS"" PARA ABRIR UN...",Hace un año,2020-10-17 16:18:07.838052,1.0,Leo Rodriguez,1,0,https://www.google.com/maps/contrib/1105119748...,Ciudad de Buenos Aires Argentina,Argentina


In [23]:
treated = yelp.drop(['id_review','relative_date','retrieval_date','username','n_review_user','n_photo_user','url_user','city','country'], axis=1, inplace=False)

In [24]:
treated.head()

Unnamed: 0,caption,rating
0,Muy buena atencion,5.0
1,,3.0
2,Este Banco (todas las sucursales) tiene los re...,1.0
3,Nunca me atendio tan mal el personal de recepc...,1.0
4,"TE OFRECEN MILES DE ""BENEFICIOS"" PARA ABRIR UN...",1.0


In [25]:
treated['caption'] = treated['caption'].astype(str)
treated['caption_cleaned'] = ''

for it in range(0,len(treated['caption'])):
    if '(Traducido por Google)' in treated['caption'][it]:
        treated['caption_cleaned'][it] = treated['caption'][it].split('(Original)')[0].split('(Traducido por Google)')[1]
    elif 'nan' == treated['caption'][it]:
        treated['caption_cleaned'][it] = ''
    else:
        treated['caption_cleaned'][it] = treated['caption'][it]


In [26]:
treated.head()

Unnamed: 0,caption,rating,caption_cleaned
0,Muy buena atencion,5.0,Muy buena atencion
1,,3.0,
2,Este Banco (todas las sucursales) tiene los re...,1.0,Este Banco (todas las sucursales) tiene los re...
3,Nunca me atendio tan mal el personal de recepc...,1.0,Nunca me atendio tan mal el personal de recepc...
4,"TE OFRECEN MILES DE ""BENEFICIOS"" PARA ABRIR UN...",1.0,"TE OFRECEN MILES DE ""BENEFICIOS"" PARA ABRIR UN..."


In [27]:
#Removing punctuation, html crap, empty comments, stopwords
treated = treated[treated.caption_cleaned != '']
treated['caption_cleaned'] = treated['caption_cleaned'].str.lower()
treated['caption_cleaned'] = treated['caption_cleaned'].apply(cleanHtml)
treated['caption_cleaned'] = treated['caption_cleaned'].apply(cleanPunc)
stopwordsSpanish = set(stopwords.words('spanish'))
treated['caption_cleaned'] = treated['caption_cleaned'].apply(removeStopWords)


In [28]:

#Stemming (transform words into semantics)
stemmer = SnowballStemmer("spanish")
treated['caption_cleaned'] = treated['caption_cleaned'].apply(stemming)

treated.head()

Unnamed: 0,caption,rating,caption_cleaned
0,Muy buena atencion,5.0,buen atencion
2,Este Banco (todas las sucursales) tiene los re...,1.0,banc tod sucursal recurs funcional reput confi...
3,Nunca me atendio tan mal el personal de recepc...,1.0,nunc atendi tan mal personal recepcion banc da...
4,"TE OFRECEN MILES DE ""BENEFICIOS"" PARA ABRIR UN...",1.0,ofrec mil benefici abrir cuent cumpl usurer
5,te ofrecen el oro y el moro para que abras una...,1.0,ofrec oro mor abras cuent mill latam etc obvi ...


In [16]:
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
fit = vectorizer.fit(treated['caption_cleaned'])
fit

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
X = vectorizer.fit_transform(treated['caption_cleaned'])
X

<1708x52301 sparse matrix of type '<class 'numpy.float64'>'
	with 83293 stored elements in Compressed Sparse Row format>

In [21]:

true_k = 8
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

print("\n")
print("Prediction")

Y = vectorizer.transform(["chrome browser to open."])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)

Top terms per cluster:
Cluster 0:
 cajer
 banc
 funcion
 automat
 cerr
 cajer automat
 mal
 servici
 lent
 diner
Cluster 1:
 telefon
 cog
 cog telefon
 atencion
 nunc
 pesim
 llam
 nunc cog
 nunc cog telefon
 mal atencion
Cluster 2:
 oficin
 client
 esper
 trat
 cuent
 hac
 atencion
 bbva
 bien
 si
Cluster 3:
 buen
 buen atencion
 atencion
 buen trat
 buen banc
 buen atencion client
 trat
 buen servici
 atencion client
 banc
Cluster 4:
 amabl
 personal amabl
 personal
 trat amabl
 trat
 emple amabl
 rap
 director
 servicial
 personal amabl siempr


Prediction
[2]
[2]
