In [62]:
#Utils
import numpy as np
import pandas as pd

#NLP
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import  mark_negation
from nltk.stem import PorterStemmer

In [63]:
#Download resource
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("vader_lexicon")
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baspe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\baspe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\baspe\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\baspe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\baspe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\baspe\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!

True

In [64]:
def apply_stopwords(reviews_en):
    tokenize_words_list = []
    for row in reviews_en:
        words = nltk.tokenize.word_tokenize(row) 
        tokenize_words = [word.lower() for word in words if word not in nltk.corpus.stopwords.words("english")]
        tokenize_words_list.append(tokenize_words)
    return tokenize_words_list

In [70]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag

# corpus_words = set(nltk.corpus.words.words())

def apply_lemmatize(word_list):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence_list = []
    for words in word_list:
        lemmatized_sentence = []
        for word, tag in pos_tag(words):
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'
            lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
            list_to_str = ' '.join([str(elem) for elem in lemmatized_sentence]) 
#             str_in_corpus = ' '.join(w for w in nltk.wordpunct_tokenize(list_to_str) if w in corpus_words or not w.isalpha())
        lemmatized_sentence_list.append(list_to_str)
    return lemmatized_sentence_list

In [71]:
hospital_list = ["bumrungrad",  "lerdsin", "rajavithi"]

In [72]:
df = pd.read_csv("lerdsin-translated.csv")

In [73]:
df.head()

Unnamed: 0,reviews,ratings,reviews_en
0,จำไม่ได้ว่าปีไหน น่าจะ ช่วงปลายปี 57 หรือ 58 ต...,1,I can't remember which year it was at the end ...
1,เคยใช้บริการตอนขาหัก โดยใช้สิทธิ์ประกันสังคม ไ...,1,I had a broken leg The right to social securit...
2,รอคิวตรวจนานมากค่ะ มาตั้งแต่ 6 โมงเช้า กว่าจะไ...,1,"Make me wait so long since 6 am to 3 pm, they ..."
3,มาครั้งแรก ประทับใจมากคับ บริการเหมือน รพ เอกช...,1,The first impression is very good services lik...
4,บริการดีครับ แต่คนเยอะมากๆเลย ที่ใช้บริการคือม...,1,"Good service, but a lot of people yet. The ser..."


In [74]:
df["words"] = apply_stopwords(df["reviews_en"])
df.head()

Unnamed: 0,reviews,ratings,reviews_en,words
0,จำไม่ได้ว่าปีไหน น่าจะ ช่วงปลายปี 57 หรือ 58 ต...,1,I can't remember which year it was at the end ...,"[i, ca, n't, remember, year, end, year, 57, 58..."
1,เคยใช้บริการตอนขาหัก โดยใช้สิทธิ์ประกันสังคม ไ...,1,I had a broken leg The right to social securit...,"[i, broken, leg, the, right, social, security,..."
2,รอคิวตรวจนานมากค่ะ มาตั้งแต่ 6 โมงเช้า กว่าจะไ...,1,"Make me wait so long since 6 am to 3 pm, they ...","[make, wait, long, since, 6, 3, pm, ,, checked..."
3,มาครั้งแรก ประทับใจมากคับ บริการเหมือน รพ เอกช...,1,The first impression is very good services lik...,"[the, first, impression, good, services, like,..."
4,บริการดีครับ แต่คนเยอะมากๆเลย ที่ใช้บริการคือม...,1,"Good service, but a lot of people yet. The ser...","[good, service, ,, lot, people, yet, ., the, s..."


In [75]:
df["lemma_word_of_sentence"] = apply_lemmatize(df["words"])
df.head()

Unnamed: 0,reviews,ratings,reviews_en,words,lemma_word_of_sentence
0,จำไม่ได้ว่าปีไหน น่าจะ ช่วงปลายปี 57 หรือ 58 ต...,1,I can't remember which year it was at the end ...,"[i, ca, n't, remember, year, end, year, 57, 58...",i ca n't remember year end year 57 58 . at tim...
1,เคยใช้บริการตอนขาหัก โดยใช้สิทธิ์ประกันสังคม ไ...,1,I had a broken leg The right to social securit...,"[i, broken, leg, the, right, social, security,...",i break leg the right social security i pay ca...
2,รอคิวตรวจนานมากค่ะ มาตั้งแต่ 6 โมงเช้า กว่าจะไ...,1,"Make me wait so long since 6 am to 3 pm, they ...","[make, wait, long, since, 6, 3, pm, ,, checked...","make wait long since 6 3 pm , check hospital c..."
3,มาครั้งแรก ประทับใจมากคับ บริการเหมือน รพ เอกช...,1,The first impression is very good services lik...,"[the, first, impression, good, services, like,...",the first impression good service like hospita...
4,บริการดีครับ แต่คนเยอะมากๆเลย ที่ใช้บริการคือม...,1,"Good service, but a lot of people yet. The ser...","[good, service, ,, lot, people, yet, ., the, s...","good service , lot people yet . the service re..."


In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer  
vectorizer = TfidfVectorizer()  

vectors = tfidfconverter.fit_transform(df["lemma_word_of_sentence"]).toarray()

In [78]:
vectors.shape

(399, 386)

In [80]:
# tfidf_df = pd.DataFrame(vectors, column=vectorizer.get_feature_names())
tfidf_df = pd.DataFrame(vectors)
tfidf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,376,377,378,379,380,381,382,383,384,385
0,0.0,0.0,0.0,0.0,0.0,0.052727,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.039641,0.04093,0.0,0.095192,0.0,0.000000,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.138848,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.212526,...,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.153920,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.199838,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.191613,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.611028,...,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.0
395,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.0
396,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.337049,...,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.0
397,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.0
