# Import Libraries

In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

# Load Data and Model

## Import Data

In [2]:
# Read the English-language review export into memory, then print its
# column / dtype / memory summary as a quick sanity check.
english_df = pd.read_csv(filepath_or_buffer='english_review.csv')
english_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378859 entries, 0 to 378858
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   place_id           378859 non-null  object
 1   review_id_hash     378859 non-null  object
 2   rating             378859 non-null  int64 
 3   published_at_date  378859 non-null  object
 4   english_review     378859 non-null  object
 5   sentiment          378859 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 17.3+ MB


## Import Model

In [3]:
# Restore the saved Keras classifier. The custom_objects mapping tells Keras
# how to rebuild the 'KerasLayer' entries stored in the .h5 file.
classifier_model = tf.keras.models.load_model(
    'FeedbackClassifier.h5',
    custom_objects={'KerasLayer': hub.KerasLayer},
)

# Predicting

In [4]:
# NLTK helpers shared by every review_to_words call; filled on first use so the
# stop-word corpus is read once instead of once per review.
_REVIEW_TEXT_RESOURCES = {}


def _get_review_text_resources():
    """Return the (lemmatizer, stop-word set) pair, building it on first use."""
    if not _REVIEW_TEXT_RESOURCES:
        _REVIEW_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _REVIEW_TEXT_RESOURCES['stops'] = frozenset(stopwords.words("english"))
    return _REVIEW_TEXT_RESOURCES['lemmatizer'], _REVIEW_TEXT_RESOURCES['stops']


def review_to_words(raw_review):
    """Normalize one raw review into a space-separated string of cleaned words.

    Steps: strip HTML markup, tokenize, lower-case, delete every character
    that is not an ASCII letter, lemmatize, and remove English stop words.

    Args:
        raw_review: The review text (may contain HTML markup).

    Returns:
        str: The remaining words joined by single spaces. Returns an empty
        string when nothing survives the cleaning.
    """
    lemmatizer, stops = _get_review_text_resources()

    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()

    # 2. Tokenize and convert to lower case
    words = [word.lower() for word in word_tokenize(review_text)]

    # 3. Remove non-alphabetic characters and numbers
    words = [re.sub("[^a-zA-Z]", "", word) for word in words]

    # 4. Drop tokens that step 3 emptied (pure punctuation / digits); keeping
    #    them would leave leading, trailing and doubled spaces in the result.
    words = [word for word in words if word]

    # 5. Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]

    # 6. Remove stop words
    meaningful_words = [word for word in words if word not in stops]

    # 7. Join the words back into one string separated by space
    return " ".join(meaningful_words)

In [5]:
# Function to predict a single review and return 0 or 1
def predict_review(review, classifier_model, min_words=4, threshold=0.5):
    """Classify one review as 1 or 0 using the supplied model.

    Args:
        review: Raw review text. Values without a ``split`` method (for
            example None or a float NaN from a missing cell) are labelled 0.
        classifier_model: Object whose ``predict(batch, verbose=0)`` returns a
            2-D result; element ``[0][0]`` is read as the probability.
        min_words: Reviews with fewer whitespace-separated words than this are
            labelled 0 without calling the model. Defaults to 4.
        threshold: Probabilities greater than or equal to this yield 1.
            Defaults to 0.5.

    Returns:
        int: 1 if the predicted probability is >= ``threshold``, otherwise 0.
    """
    # Missing values cannot be split into words; treat them like reviews that
    # are too short instead of raising AttributeError mid-run.
    if not hasattr(review, 'split') or len(review.split()) < min_words:
        return 0

    preprocessed_review = review_to_words(review)
    # verbose=0 keeps Keras from printing a progress bar for every single call.
    probability = classifier_model.predict([preprocessed_review], verbose=0)[0][0]
    return 1 if probability >= threshold else 0

# Combined function to add a 'label' column with binary predictions
def add_probability_column(df, classifier_model, chunk_size=1024):
    """Add a binary 'label' column to ``df`` from its 'english_review' column.

    Applies the same rule as ``predict_review``: reviews with fewer than four
    whitespace-separated words get 0 without consulting the model; the rest
    are cleaned with ``review_to_words`` and get 1 when the predicted
    probability is >= 0.5, else 0. Unlike a row-by-row ``apply``, the model is
    called once per chunk of reviews rather than once per review.

    Args:
        df: DataFrame with an 'english_review' column. It is modified in place.
        classifier_model: Object whose ``predict(texts, verbose=0)`` returns
            one row per input text; element ``[0]`` of each row is read as
            the probability.
        chunk_size: Maximum number of reviews passed to a single ``predict``
            call. Must be a positive integer. Defaults to 1024.

    Returns:
        The same ``df`` object, now carrying the 'label' column.
    """
    reviews = df['english_review'].tolist()
    labels = [0] * len(reviews)

    # Positions of the reviews that actually need a model prediction. Values
    # without a split method (None / NaN from missing cells) keep label 0.
    eligible = [
        position
        for position, review in enumerate(reviews)
        if hasattr(review, 'split') and len(review.split()) >= 4
    ]

    # Chunking bounds the size of each input handed to the model while still
    # avoiding one predict call per row.
    for start in range(0, len(eligible), chunk_size):
        positions = eligible[start:start + chunk_size]
        texts = [review_to_words(reviews[position]) for position in positions]
        probabilities = classifier_model.predict(texts, verbose=0)
        for position, row in zip(positions, probabilities):
            labels[position] = 1 if row[0] >= 0.5 else 0

    df['label'] = labels
    return df


In [6]:
# Restrict this run to the first 80,000 reviews, then label a copy of that
# slice so the helper's in-place column assignment does not touch english_df.
english_df = english_df.iloc[:80000]
predicted_df = add_probability_column(english_df.copy(), classifier_model)

  review_text = BeautifulSoup(raw_review, 'lxml').get_text()




In [7]:
# Attach the predicted labels to the sliced frame, then keep only the
# columns needed for the export.
english_df['label'] = predicted_df['label']
final_df = english_df.loc[:, ['place_id', 'english_review', 'label']]

# Export

In [8]:
# Save the labelled reviews without the DataFrame index column.
final_df.to_csv(path_or_buf='CompleteReviews.csv', index=False)