In [3]:
import pandas as pd
import numpy as np

In [4]:
import re

def de_html(target):
    """Replace every HTML-style tag in ``target`` with a single space.

    target - The raw text (str) that may contain markup such as ``<br />``.

    Returns the text with each ``<...>`` span replaced by one space. Text
    located between two tags is preserved.
    """
    # `[^>]*` stops at the first closing bracket, so each tag is matched on
    # its own. A greedy `.*` would run to the LAST `>` on the line and wipe
    # out the words sitting between an opening and a closing tag.
    filtered = re.sub(r'<[^>]*>',' ',target)
    return filtered

def de_special_chars(target):
    """Replace punctuation and symbol characters in ``target`` with spaces.

    The two character classes are applied one after the other. Inside a
    character class ``|`` is a literal, so pipe characters are replaced too.
    Each matched character becomes exactly one space; nothing is collapsed.
    """
    patterns = (
        r'[?|!|\'|"|#|$|@|&|%]',
        r'[.|,|)|(|\\|/]',
    )
    for pattern in patterns:
        target = re.sub(pattern,' ',target)
    return target

In [9]:
from nltk.stem import  SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Shared English Snowball stemmer, used by pipeline() to reduce words to stems.
stemmer = SnowballStemmer('english')
# English stop words from NLTK's stopwords corpus; pipeline() drops any token
# whose lower-cased form appears here.
# NOTE(review): presumably requires the 'stopwords' corpus to be available
# locally (e.g. via nltk.download) - confirm for fresh environments.
stop_words = stopwords.words('english')

def _clean_review(review, stop_set):
    """Return one review as a space-joined string of stemmed, filtered tokens."""
    review = de_special_chars(de_html(review))
    kept = []
    for word in review.split():
        lowered = word.lower()
        if lowered in stop_set:
            continue
        # Keep purely alphabetic tokens longer than two characters.
        if word.isalpha() and len(word) > 2:
            kept.append(stemmer.stem(lowered))
    return ' '.join(kept)


def pipeline(target,pre_trained_model=None):
    """
    Clean the review text and encode it as a bag-of-words matrix.

    target - A dataframe with the text under a column named 'Review'
    pre_trained_model - A pre-trained word vectorizer model if already trained with train data.
                        When given, it is only used to transform the text and is
                        never re-fitted; when omitted, a new binary
                        CountVectorizer over unigrams and bigrams is fitted on
                        the cleaned text.

    Returns a tuple (results, model): the document-term matrix produced by the
    vectorizer and the vectorizer that produced it.
    """
    # Set membership is O(1) per token; the module-level stop_words list would
    # be scanned linearly for every word of every review.
    stop_set = set(stop_words)

    # Missing reviews become empty documents and non-string values are
    # stringified, instead of crashing inside the regex helpers.
    reviews = target['Review'].fillna('').astype(str)
    cleaned_data = [_clean_review(review, stop_set) for review in reviews]

    # Compare against None explicitly: the truthiness of an arbitrary estimator
    # object is not a reliable "was one supplied?" signal.
    if pre_trained_model is not None:
        model = pre_trained_model
        results = model.transform(cleaned_data)
    else:
        # Only build a vectorizer when one actually has to be trained.
        model = CountVectorizer(binary=True,ngram_range=(1,2))
        results = model.fit_transform(cleaned_data)

    return results,model

In [10]:
# Train the sentiment classifier on the labelled restaurant reviews.
from sklearn.naive_bayes import BernoulliNB

# Tab-separated file; the code below reads its 'Review' (text) and 'Liked' (label) columns.
data = pd.read_csv('restaurant_reviews.tsv',sep='\t')
print(f'Original data: {data.shape}')
# No pre-trained vectorizer is passed, so pipeline() fits a new one. It is kept
# as text_model so later input can be encoded with the same vocabulary.
cleaned_data,text_model = pipeline(data)
print(f'cleaned_data : {cleaned_data.shape}')
# alpha is BernoulliNB's additive smoothing parameter.
# NOTE(review): the classifier is fit on every row; no hold-out split is made in this cell.
model = BernoulliNB(alpha=3)
model.fit(cleaned_data,data['Liked'])


Original data: (1000, 2)
cleaned_data : (1000, 5495)


BernoulliNB(alpha=3)

In [20]:
# Read one review from the user and predict its sentiment.
test_review = input('Enter your review: ')
print(f'The input review: {test_review}')
# pipeline() expects a DataFrame with a 'Review' column, so wrap the single
# string in a one-row, one-column frame. This rebinds test_review from str to DataFrame.
test_review = pd.DataFrame(np.array([[test_review]]),columns=['Review'])
# Reuse the vectorizer fitted during training so the features line up with what
# the classifier was fit on. NOTE: this rebinds cleaned_data, replacing the
# training matrix held under the same name.
cleaned_data,_ = pipeline(test_review,pre_trained_model=text_model)
sentiment = model.predict(cleaned_data)
# predict() returns one label per row; there is only one row here.
sentiment = sentiment[0]
# A truthy label is treated as a positive review, a falsy one as negative.
if sentiment:
    print('Thank you for your good feedback! Visit again!')
else:
    print('We apologize for your inconvenience. We\'ll work on getting better')

The input review: Honeslty it didn't taste THAT fresh.)
We apologize for your inconvenience. We'll work on getting better
