In [1]:
from IPython.display import display, HTML
from utils import load_data, print_data_stats, subset_data
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Introduction to the dataset
- What is Sentiment Analysis?
    - "I like this movie" --> positive
    - "I hate this movie" --> negative
- [ABSA multilingual SA dataset](http://alt.qcri.org/semeval2016/task5/)
- What kind of preprocessing can you think of from the examples below

In [2]:
LANGS = ["ar","en","es","ru","zh"]
LANGS_MAPPING = {"en":"english","es":"spanish","ru":"russian","ar":"arabic","zh":"chinese"}

data = load_data()

In [3]:
print_data_stats(data, max_len=40)

Unnamed: 0,#train,#test,train-pos%,test-pos%,sample,label
en,1608,555,0.673507,0.718919,I'm saving up for my next visit.,pos
es,1535,650,0.716612,0.678462,Calidad-precio muy bien.,pos
ru,2663,865,0.755914,0.677457,Удачи вам и процветания!!!,pos
ar,4438,1145,0.605453,0.567686,فريق العمل ودود ومستعد لإسعاد الضيوف,pos
zh,1333,529,0.568642,0.586011,但质量，,neg


- re-sample the data to make all languages to have the same number of training data

In [4]:
data_sampled = subset_data(data)
print_data_stats(data_sampled,40)

Unnamed: 0,#train,#test,train-pos%,test-pos%,sample,label
en,1333,555,0.669917,0.718919,Overpriced and not tasty,neg
es,1333,650,0.717179,0.678462,El asado muy bueno.,pos
ru,1333,865,0.757689,0.677457,Общее впечатление приятное.,pos
ar,1333,1145,0.609152,0.567686,أسوأ فندق في باريس,neg
zh,1333,529,0.568642,0.586011,手感也很好。,pos


# Activity: build your own dataset (20 for train, 5 for test)
- try to use similar words as much as possible
- try to make some word overlaps between examples

In [5]:
# NEW_LANG = "?"
# train_pos_sents = ["I like this movie","Ihe nkiri a masịrị m", "Ninapenda sinema hii"]
# train_neg_sents = ["I have this movie"]
# test_pos_sents = ["I enjoyed the movie"]
# test_neg_sents = ["Never watch it"]

# data[NEW_LANG] = {}
# data[NEW_LANG]["train"] = [(sent,"pos") for sent in train_pos_sents] + [(sent,"neg") for sent in train_neg_sents]
# data[NEW_LANG]["test"] = [(sent,"pos") for sent in train_pos_sents] + [(sent,"neg") for sent in train_neg_sents]

# Load stemmers, word_tokenizers, stopword_filters
- **stemming/lemmatization**: reducing inflected (or sometimes derived) words to their word stem
- **word segmentation (tokenization)**: dividing a string of written language into its component words
- **stopwords**: a set of commonly used words

In [6]:
import Stemmer
import stopwordsiso as stopwordsiso
import jieba
from pyarabic import araby 
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import stopwordsiso

class MultiStopword:
    
    
    def __init__(self):
        
        self.stopwords = {}
        for lang in ["en","es","ar","ru"]:
            self.stopwords[lang] = set(stopwords.words(LANGS_MAPPING[lang]))
        for lang in ["zh"]:
            self.stopwords[lang] = stopwordsiso.stopwords(lang) 
        
        # # TODO
        # self.stopwords[NEW_LANG] = set([""])
            
    def is_stopword(self, word,lang):
        
        if lang in self.stopwords:
            return (word in self.stopwords[lang])
        else:
            raise NotImplementedError

class MultiWordSegmenter:
    
    def __init__(self):
        
        self.tokenizer = {}
        self.tokenizer["ru"] = ToktokTokenizer()

    def segment(self, text, lang):
        
        if lang in ["en","es"]:
            return word_tokenize(text, language=LANGS_MAPPING[lang])
        elif lang == "zh":
            return jieba.cut(text)
        elif lang == "ru":
            return self.tokenizer["ru"].tokenize(text)
        elif lang == "ar":
            return araby.tokenize(text)
        
        ## TODO
        # elif lang == NEW_LANG:
        #    return word_tokenize(text)
        
        else:
            raise NotImplementedError

class MultiWordStemmers:
    
    
    def __init__(self):
        
        self.stemmers = {}
        self.stemmers["en"] = Stemmer.Stemmer('english')
        self.stemmers["ar"] = Stemmer.Stemmer('arabic')
        self.stemmers["ru"] = Stemmer.Stemmer('russian')
        self.stemmers["es"] = Stemmer.Stemmer('spanish')

    def stem(self, word, lang):
        
        if lang in self.stemmers:
            return self.stemmers[lang].stemWord(word)
        elif lang == "zh":
            return word
        
        # elif lang == NEW_LANG:
        #    ## TODO
        #    return word
        
        else:
            raise NotImplementedError

stopword_checkers = MultiStopword()
word_segmenters = MultiWordSegmenter()
stemmers = MultiWordStemmers()

- **Examples:**

In [7]:
print(stemmers.stem("friend","en"))
print(stemmers.stem("friends","en"))
print(stemmers.stem("friended","en"))

friend
friend
friend


In [8]:
# russian verbs for MUST
print(stemmers.stem("должен","ru")) # Male
print(stemmers.stem("должна","ru")) # Female
print(stemmers.stem("должно","ru")) # Neutral
print(stemmers.stem("должны","ru")) # Plural

долж
должн
должн
должн


Seems to be not perfect.

In [9]:
def preprocessing_example(sentence, lang):
    
    print("\n".join([str((stemmers.stem(w,lang), stopword_checkers.is_stopword(w,lang))) for w in word_segmenters.segment(sentence,lang)]))
    print(" ".join([w for w in word_segmenters.segment(sentence,lang) if not stopword_checkers.is_stopword(w,lang)]))

ex_sentence = "Mr.Brown measured the cat this morning, and it was 14.5 pounds!"
preprocessing_example(ex_sentence, "en")

('Mr.Brown', False)
('measur', False)
('the', True)
('cat', False)
('this', True)
('morn', False)
(',', False)
('and', True)
('it', True)
('was', True)
('14.5', False)
('pound', False)
('!', False)
Mr.Brown measured cat morning , 14.5 pounds !


# Activity: add stemmers, word_tokenizers, stopwords for your language
- find and edit `## TODO`

In [10]:
# # and test out!
# ex_sentence = "your sentence!"
# preprocessing_example(ex_sentence, NEW_LANG)

# Activity 2: reduce the number of features (# of unigrams)

In [11]:
# show the shap of the dataset for "en"

data_sampled['en']['train'][0]

('Love the scene first off- the place has a character and nice light to it..very fortunate, location wise.',
 'pos')

In [12]:
baseline = {}
vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=False)

for lang in data_sampled.keys():
    sentences_train, y_train = zip(*data_sampled[lang]["train"])
    vectorizer.fit(sentences_train)
    num_unigram = len(vectorizer.get_feature_names_out())
    baseline[lang] = num_unigram
    print(lang, num_unigram)

en 2988
es 3581
ru 4465
ar 7964
zh 1438


In [13]:
def preprocess(sentence_list:list, lang:str) -> list:
    return [preprocess_sentence(sentence, lang) for sentence in sentence_list]


def preprocess_sentence(text:str, lang:str) -> str:
    text = text.lower()
    
    ## TODO
    # words = text.split()
    words = word_segmenters.segment(text, lang)
    words = [w for w in words if not stopword_checkers.is_stopword(w, lang)]
    text = " ".join(words)
    return text



vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=False)
for lang in LANGS:
    sentences_train, y_train = zip(*data_sampled[lang]["train"])
    sentences_train = preprocess(sentences_train, lang)
    vectorizer.fit(sentences_train)
    num_unigram = len(vectorizer.get_feature_names_out())
    print(f"{lang}: {num_unigram:<5}({baseline[lang]-num_unigram}\u2193)")

ar: 7778 (186↓)
en: 2512 (476↓)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ehhho\AppData\Local\Temp\jieba.cache


es: 3085 (496↓)
ru: 3930 (535↓)


Loading model cost 0.998 seconds.
Prefix dict has been built successfully.


zh: 1479 (-41↓)


# Train Naive Bayes models

In [14]:
def preprocess(sentence_list:list, lang:str, bool_lowercase=False, bool_segment=False, bool_stem=False, filter_stopwords=False) -> list:
    return [preprocess_sentence(sentence, lang, bool_lowercase, bool_segment, bool_stem, filter_stopwords) for sentence in sentence_list]

def preprocess_sentence(text:str, lang:str, bool_lowercase, bool_segment, bool_stem, filter_stopwords) -> str:
    if bool_lowercase:
        text = text.lower()

    if bool_segment:
        words = word_segmenters.segment(text, lang)
    else:
        words = text.split()

    if bool_stem:
        words = [stemmers.stem(w, lang) for w in words]
    
    if filter_stopwords:
        words = [w for w in words if not stopword_checkers.is_stopword(w, lang)]

    return " ".join(words)

def train_and_evaluate_nb(data:dict, lang:str, max_feat=100) -> float:
    sentences_train, y_train = zip(*data[lang]["train"])
    sentences_test, y_test = zip(*data[lang]["test"])
    
    sentences_train, sentences_test = preprocess(sentences_train, lang), preprocess(sentences_test, lang)
    vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=max_feat, lowercase=False)
    x_train = vectorizer.fit_transform(sentences_train)
    x_test = vectorizer.transform(sentences_test)    
    
    model = MultinomialNB()
    model.fit(x_train, y_train)
    acc = model.score(x_test, y_test)
    print(f"{lang}: {acc:.2f}")
    return {"model":model, "vectorizer":vectorizer}

def predict(models, lang, sents):
    model, vectorizer = models[lang]["model"], models[lang]["vectorizer"]
    if type(sents) == str:
        sents = [sents]
    sents = preprocess(sents, lang)
    x = vectorizer.transform(sents)
    pred = model.predict(x)
    print(list(zip(sents,pred)))

In [15]:
models = {}
for lang in data.keys():
    models[lang] = train_and_evaluate_nb(data, lang)

en: 0.74
es: 0.76
ru: 0.73
ar: 0.73
zh: 0.59


In [16]:
ex_sents = ["will watch it again","will not watch it again"]
print(models["en"]["vectorizer"].transform(ex_sents))
predict(models,"en",ex_sents)

  (0, 7)	1
  (0, 45)	1
  (0, 94)	1
  (1, 7)	1
  (1, 45)	1
  (1, 55)	1
  (1, 94)	1
[('will watch it again', 'pos'), ('will not watch it again', 'neg')]


# Activity 3: Fill out the following table
- change boolean arguments in `preprocess()`

|                          	| Ar 	| En 	| Es 	| Ru 	| Zh 	|
|--------------------------	|:--:	|:--:	|:--:	|:--:	|:--:	|
| Baseline                 	|  0.73 |  0.74 |  0.76 |  0.73 |  0.59 |
| All                      	|  0.76 |  0.74 |  0.78 |  0.74 |  0.67 |
| All - segmentation       	|  0.75 |  0.75 |  0.78 |  0.74 |  0.59 |
| All - stemmer            	|  0.69 |  0.74 |  0.73 |  0.73 |  0.67 |
| All - stopword_filtering 	|  0.79 |  0.76 |  0.82 |  0.75 |  0.68 |

# Activity 4: Explain your own observations

- observation 1: Stop words are important for sentiement classification in general.
- observation 2: Stemming affects the quality of the results except for Chinese.