In [20]:
import streamlit as st
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import string
import numpy as np


In [21]:
# ------------- Preprocessing Functions -------------

# Words dropped by clean_stopwords(). Built once at definition time so the
# function does not rebuild the list and set on every call.
_STOPWORDS = frozenset({
    'a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
    'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
    'being', 'below', 'between', 'both', 'by', 'can', 'd', 'did', 'do',
    'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from',
    'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
    'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
    'into', 'is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'on', 'once',
    'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'own', 're',
    's', 'same', 'she', "shes", 'should', "shouldve", 'so', 'some', 'such',
    't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
    'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
    'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
    'why', 'will', 'with', 'won', 'y', 'you', "youd", "youll", "youre",
    "youve", 'your', 'yours', 'yourself', 'yourselves',
})


def clean_stopwords(text):
    """Remove stopwords from ``text`` and return the remaining words.

    The input is converted with ``str()`` and split on whitespace. A word is
    dropped when its lower-cased form is in ``_STOPWORDS``; the surviving
    words keep their original casing and are joined with single spaces, so
    runs of whitespace in the input are collapsed.

    Args:
        text: Value to clean; non-string values are stringified first.

    Returns:
        str: The filtered text (empty string if nothing survives).
    """
    return " ".join(word for word in str(text).split() if word.lower() not in _STOPWORDS)

def clean_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # Mapping a code point to None tells str.translate to delete it.
    drop_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(drop_table)

def clean_repeating_characters(text):
    """Collapse every run of an identical character to a single occurrence.

    Applies to any character matched by ``.`` (letters, digits, spaces,
    punctuation), so ordinary double letters are shortened too
    (e.g. "good" becomes "god").
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1', text)

def clean_URLs(text):
    """Delete URL-like substrings from ``text``.

    A match starts at ``www.`` or ``http`` and extends to the next
    whitespace character. Matching is case-sensitive, and the whitespace
    around a removed URL is left in place.
    """
    url_pattern = re.compile(r"((www\.[^\s]+)|(http\S+))")
    return url_pattern.sub("", text)

def clean_numeric(text):
    """Return ``text`` with all ASCII digits (0-9) removed."""
    # Splitting on digit runs and re-joining the pieces drops the digits.
    non_digit_pieces = re.split('[0-9]+', text)
    return ''.join(non_digit_pieces)


In [22]:
# ------------- Load the Pre-trained Model -------------

@st.cache_resource  # Caches the model to prevent reloading on every interaction
def load_model(model_path='fakerev.h5'):
    """Load the trained Keras model from disk.

    Args:
        model_path: Path of the saved model file handed to
            ``tf.keras.models.load_model``. Defaults to 'fakerev.h5', the
            file this app originally hard-coded.

    Returns:
        The object returned by ``tf.keras.models.load_model`` for that path.
    """
    return tf.keras.models.load_model(model_path)

model = load_model()

In [23]:
# ------------- Reconstruct the Tokenizer -------------

def get_tokenizer():
    """Create a Keras ``Tokenizer`` and attach a hand-written vocabulary.

    The tokenizer is constructed with ``num_words=10000`` and
    ``oov_token="<OOV>"`` and its ``word_index`` is then overwritten with the
    mapping built below (word -> 1-based rank).

    NOTE(review): the vocabulary is a ten-word placeholder; the full training
    vocabulary still has to be supplied for predictions to be meaningful.
    NOTE(review): every word currently listed is also in the list removed by
    clean_stopwords(), so none of them can survive preprocessing.
    NOTE(review): "<OOV>" is not a key of this word_index — confirm how the
    tokenizer treats out-of-vocabulary words in that case.

    Returns:
        Tokenizer: the configured tokenizer.
    """
    # Words in rank order; a word's 1-based position becomes its index.
    ranked_words = (
        'the',
        'and',
        'to',
        'of',
        'a',
        'in',
        'is',
        'it',
        'you',
        'that',
        # ... (Add all words used during training)
    )

    # Same parameters as the ones stated to have been used during training.
    rebuilt = Tokenizer(num_words=10000, oov_token="<OOV>")
    rebuilt.word_index = {word: rank for rank, word in enumerate(ranked_words, start=1)}
    return rebuilt

tokenizer = get_tokenizer()



In [24]:
# ------------- Preprocessing Function -------------

def preprocess_text(text, tokenizer, max_length):
    """Clean one review and encode it as a padded integer sequence.

    Args:
        text: Raw review string, or None.
        tokenizer: Object whose ``texts_to_sequences`` turns the cleaned text
            into a list of integer sequences.
        max_length: Value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` (post-padding, post-truncation) for
        the single cleaned text, or None when ``text`` is None/blank or the
        cleaned text produces an empty sequence.
    """
    if text is None or not text.strip():
        return None

    # Cleaning steps, applied in this order after lower-casing.
    # Punctuation stripping (clean_punctuations) is deliberately not part of
    # the chain; insert it after clean_stopwords to enable it.
    # NOTE(review): clean_repeating_characters runs before clean_URLs, so
    # "www" and "http" have already been collapsed to "w" and "htp" by the
    # time the URL pattern is applied — confirm this matches training.
    cleaning_steps = (
        clean_stopwords,
        clean_repeating_characters,
        clean_URLs,
        clean_numeric,
    )
    cleaned = text.lower()
    for step in cleaning_steps:
        cleaned = step(cleaned)

    encoded = tokenizer.texts_to_sequences([cleaned])
    first_sequence = encoded[0]
    if len(first_sequence) == 0:
        # Nothing in the cleaned text mapped to a token id.
        return None

    return pad_sequences(encoded, maxlen=max_length, padding='post', truncating='post')

In [25]:
# ------------- Streamlit Interface -------------

# Page heading and short description.
st.title("Fake Review Detection")
st.write("""
### Enter a review below, and the model will predict whether it's computer-generated or human-generated.
""")

# Free-text box for the review to classify.
input_text = st.text_area("Enter the review text:", height=200)

# Everything below runs only on the rerun triggered by pressing "Predict".
if st.button("Predict"):
    if not input_text.strip():
        st.warning("Please enter some text to analyze.")
    else:
        # Sequence length handed to preprocess_text; example value that must
        # be replaced with the length actually used during training.
        max_sequence_length = 200
        # Scores strictly above this cutoff are labelled "Computer Generated".
        threshold = 0.76
        # Maps the thresholded score (0 or 1) to the label shown to the user.
        classify = {1: "Computer Generated", 0: "Original Resource"}

        processed_text = preprocess_text(input_text, tokenizer, max_sequence_length)

        if processed_text is not None:
            prediction = model.predict(processed_text)

            predicted_class = (prediction > threshold).astype(int)
            predicted_label = classify[predicted_class[0][0]]

            st.success(f"**Predicted class:** {predicted_label}")
            st.write(f"**Prediction Probability:** {prediction[0][0]:.4f}")
        else:
            # preprocess_text returned None: blank input or no tokens left.
            st.error("The input text is invalid or cannot be processed. Please try again with different text.")

# ------------- Instructions -------------

# Usage notes rendered in the sidebar under an "Instructions" header.
sidebar_instructions = """
1. **Enter the Review Text**: Input the review you want to analyze in the text area.
2. **Predict**: Click the "Predict" button to see the result.
3. **Result**: The app will display whether the review is computer-generated or human-generated along with the prediction probability.
"""
st.sidebar.header("Instructions")
st.sidebar.write(sidebar_instructions)
