In [2]:
import numpy as np
import pandas as pd
import re
import string
import pickle

In [3]:
def remove_punctuations(text):
    """Return `text` with every character listed in `string.punctuation` removed.

    Only the ASCII punctuation characters from `string.punctuation` are
    dropped; all other characters (letters, digits, whitespace, non-ASCII
    symbols) are kept in their original order.
    """
    # A translation table that maps each punctuation character to "delete".
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

In [4]:
# Load the pickled model from disk and bind it to the module-level name `model`
# (used later through `model.predict`). Presumably a Multinomial Naive Bayes
# classifier, judging by the file name — confirm against the training notebook.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
with open('../static/model/modelmnb.pickle', 'rb') as f:
    model = pickle.load(f)

In [5]:
# Load the stopword list: one stopword per line, line terminators removed.
# The encoding is given explicitly so the file is decoded the same way on every
# platform (the default depends on the locale), matching how the vocabulary
# file is opened elsewhere in this notebook.
with open('../static/model/corpora/stopwords/english', 'r', encoding='utf-8') as file:
    sw = file.read().splitlines()

In [6]:
# Read the vocabulary file into `tokens`: one entry per line, with leading and
# trailing whitespace stripped. The position of each entry in this list is the
# feature index used when a sentence is vectorized against it.
with open('../static/model/vocabulary.txt', 'r', encoding='utf-8') as f:
    tokens = [line.strip() for line in f]
# NOTE(review): this prints the entire vocabulary, which floods the cell output;
# consider showing only a short slice (e.g. the first few entries) instead.
print(tokens)  

['materi', 'eka', 'perfect', 'ithin', 'satisfi', 'phone', 'super', 'shiok', 'damn', 'order', 'deliveri', 'fast', 'seller', 'respons', 'sia.', 'repli', 'size', 'cannot', 'qualiti', 'money', 'slow', 'one', 'week', 'custom', 'servic', 'chat', 'ekanam', 'hondai,', 'meka', 'mara', 'lassanai,', 'welawatama', 'una,', 'meke', 'build', 'supiri', 'ekak,', 'time', 'hariyata', 'dunna', 'eke', 'use', 'karanawa,', 'godak', 'practic', 'wenawa.', 'supiri,', 'warranti', 'watinawa.', 'menu', 'athi', 'product', 'maru', 'hithapu', 'nadda', 'ekak', 'karanna', 'sound', 'idin', 'issu', 'kalin', 'care', 'ekata', 'balaporoththu', 'eka.', 'lassanai', 'hondai', 'puluwan', 'price', 'hari', 'hodai', 'wenawa', 'set', 'hariyatama', 'thiyenawa', 'una', 'disappoint', 'ne', 'aduwata', 'aya', 'amarui', 'thibuna', 'karaddi', 'hodata', 'ganna', 'hithuwatath', 'wada', 'ikmanata', 'awa', 'hondata', 'weda', 'karanawa', 'wedak', 'na', 'pata', 'harima', 'oder', 'karapu', 'newei', 'wena', 'wela', 'ekedi', 'damag', 'kedila', 'ke

In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from nltk.stem import PorterStemmer
# Shared stemmer instance; `preprocessing` calls `ps.stem` on every word.
ps = PorterStemmer()

In [8]:
def preprocessing(text):
    """Clean a single raw review string and return it as a one-row pandas Series.

    The cleaning steps run in this order:
      1. lowercase every whitespace-separated word (also collapses whitespace),
      2. blank out words that start with ``http://`` or ``https://``,
      3. remove punctuation via `remove_punctuations`,
      4. remove runs of digits,
      5. drop words found in the stopword list `sw`,
      6. stem each remaining word with the stemmer `ps`.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    pandas.Series
        A Series named 'Review (Singlish)' holding the single cleaned string.
    """
    column = 'Review (Singlish)'

    def lowercase(review):
        return " ".join(word.lower() for word in review.split())

    def strip_urls(review):
        # Applied word by word, so '^' anchors at the start of each word.
        return " ".join(
            re.sub(r'^https?:\/\/.*[\r\n]*', '', word, flags=re.MULTILINE)
            for word in review.split()
        )

    def drop_stopwords(review):
        return " ".join(word for word in review.split() if word not in sw)

    def stem_words(review):
        return " ".join(ps.stem(word) for word in review.split())

    reviews = pd.DataFrame([text], columns=[column])[column]
    reviews = reviews.apply(lowercase)
    reviews = reviews.apply(strip_urls)
    reviews = reviews.apply(remove_punctuations)
    reviews = reviews.str.replace(r'\d+', '', regex=True)
    reviews = reviews.apply(drop_stopwords)
    reviews = reviews.apply(stem_words)
    return reviews


In [9]:
def vectorizer(ds, vocabulary):
    """Turn sentences into binary vocabulary-indicator vectors.

    For each sentence, position ``i`` of its vector is 1 when ``vocabulary[i]``
    appears as a whole whitespace-separated word in the sentence, else 0.

    Parameters
    ----------
    ds : iterable of str
        The sentences to vectorize (e.g. a list or a pandas Series).
    vocabulary : sequence of str
        The vocabulary; its order defines the feature order.

    Returns
    -------
    numpy.ndarray
        A float32 array of shape ``(number of sentences, len(vocabulary))``.
        An empty `ds` yields shape ``(0, len(vocabulary))``.
    """
    sentences = list(ds)
    vectors = np.zeros((len(sentences), len(vocabulary)), dtype=np.float32)

    for row, sentence in enumerate(sentences):
        # Split once per sentence and use a set for constant-time membership
        # checks, instead of re-splitting for every vocabulary entry.
        words = set(sentence.split())
        for col, term in enumerate(vocabulary):
            if term in words:
                vectors[row, col] = 1

    return vectors

In [10]:
def get_prediction(vectorized_text):
    """Classify a vectorized review and return its sentiment label.

    Passes `vectorized_text` to the module-level `model.predict` and maps the
    result to a string: 'negative' when it equals 0, 'positive' otherwise.
    The comparison is evaluated as a single truth value, so this is presumably
    meant to be called with one review at a time — confirm with callers.
    """
    label = model.predict(vectorized_text)
    return 'negative' if label == 0 else 'positive'

In [11]:
# End-to-end check on one review: clean the text, build its indicator vector
# against the loaded vocabulary (`tokens`), then classify it.
txt = "mekanam wedak na ganna epa"
preprocessed_txt = preprocessing(txt)
vectorized_txt = vectorizer(preprocessed_txt, tokens)
prediction = get_prediction(vectorized_txt)
# Bare last expression so the notebook displays the predicted label.
prediction

'negative'

In [17]:
# Second end-to-end check with a different review, following the same
# clean -> vectorize -> classify steps.
# NOTE(review): this repeats the previous cell with a new input; a small helper
# taking the text as a parameter would avoid the copy-paste.
txt = "light eka color full lassanai"
preprocessed_txt = preprocessing(txt)
vectorized_txt = vectorizer(preprocessed_txt, tokens)
prediction = get_prediction(vectorized_txt)
# Bare last expression so the notebook displays the predicted label.
prediction

'positive'