In [None]:
import pandas as pd

# Load the preprocessed fake-review dataset from CSV into a DataFrame.
file_path = '/content/Preprocessed Fake Reviews Detection Dataset(1).csv'
df = pd.read_csv(file_path)

# Display the first few rows
print("Dataset Preview:")
print(df.head())

# Check dataset info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
print("\nDataset Information:")
df.info()

# Check for missing values (null count per column)
print("\nMissing Values:")
print(df.isnull().sum())

# Check target distribution (row count per value of the 'label' column)
print("\nTarget Distribution:")
print(df['label'].value_counts())


Dataset Preview:
   Unnamed: 0            category  rating label  \
0           0  Home_and_Kitchen_5     5.0    CG   
1           1  Home_and_Kitchen_5     5.0    CG   
2           2  Home_and_Kitchen_5     5.0    CG   
3           3  Home_and_Kitchen_5     1.0    CG   
4           4  Home_and_Kitchen_5     5.0    CG   

                                          text_  
0     love well made sturdi comfort love pretti  
1  love great upgrad origin 've mine coupl year  
2        pillow save back love look feel pillow  
3           miss inform use great product price  
4           nice set good qualiti set two month  

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  40432 non-null  int64  
 1   category    40432 non-null  object 
 2   rating      40432 non-null  float64
 3   label       40432 non-null  object 


**PREPROCESS THE DATASET**

In [None]:
# Drop the leftover index column written into the CSV.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Drop rows with missing text
df = df.dropna(subset=['text_'])

# Encode target variable (label): replace the string labels with integer codes.
# The fitted encoder is kept in `label_encoder` so the codes can be mapped back.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Check the processed dataset
print("\nProcessed Dataset Preview:")
print(df.head())



Processed Dataset Preview:
             category  rating  label  \
0  Home_and_Kitchen_5     5.0      0   
1  Home_and_Kitchen_5     5.0      0   
2  Home_and_Kitchen_5     5.0      0   
3  Home_and_Kitchen_5     1.0      0   
4  Home_and_Kitchen_5     5.0      0   

                                          text_  
0     love well made sturdi comfort love pretti  
1  love great upgrad origin 've mine coupl year  
2        pillow save back love look feel pillow  
3           miss inform use great product price  
4           nice set good qualiti set two month  


**PREPROCESS THE TEXT COLUMN: TOKENIZING AND PADDING**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Text-vectorisation settings shared by this cell
max_words = 10000  # vocabulary cap passed to the tokenizer as num_words
max_len = 100      # every sequence is padded/truncated to this many ids

# Build the word index from the review text column; "<OOV>" is the
# placeholder token supplied for out-of-vocabulary words.
# NOTE(review): the tokenizer is fitted on all rows, before the train/test
# split that happens in a later cell -- confirm that is intended.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(df['text_'])

# Turn each review into a list of integer word ids
sequences = tokenizer.texts_to_sequences(df['text_'])

# Bring every list to exactly max_len entries, padding and cutting at the end
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_len,
    truncating='post',
    padding='post',
)

# Report the dimensions of the padded id matrix
print("\nShape of padded_sequences:", padded_sequences.shape)



Shape of padded_sequences: (40431, 100)


**SPLIT THE DATA**

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Model inputs are the padded id matrix; targets are the encoded label column
X = np.array(padded_sequences)
y = df['label'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the dimensions of each split
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


Training data shape: (32344, 100)
Testing data shape: (8087, 100)


# **BUILD THE MODEL**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Dropout

# Model parameters
embedding_dim = 16  # Dimension of the embedding layer

# Build the stacked-LSTM binary classifier.
# Vocabulary size and sequence length are taken from the tokenizing cell
# (max_words / max_len) rather than repeated as literals, so the model cannot
# drift out of sync with the padded input. An explicit Input layer replaces
# Embedding's `input_length` argument, which newer Keras releases deprecate;
# declaring the input shape up front also lets model.summary() report
# concrete output shapes.
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=max_words, output_dim=embedding_dim),
    LSTM(64, return_sequences=True),  # emit the full sequence for the next LSTM
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Use 'sigmoid' for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()




**TRAIN THE MODEL**

In [None]:
# Fit `model` on the training split for 10 epochs in batches of 64,
# setting aside 20% of the training rows for per-epoch validation.
# The returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    verbose=1,
)


Epoch 1/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 157ms/step - accuracy: 0.5559 - loss: 0.6743 - val_accuracy: 0.6808 - val_loss: 0.6154
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 147ms/step - accuracy: 0.6591 - loss: 0.6254 - val_accuracy: 0.6375 - val_loss: 0.6521
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 148ms/step - accuracy: 0.6461 - loss: 0.6465 - val_accuracy: 0.7060 - val_loss: 0.5715
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 151ms/step - accuracy: 0.7360 - loss: 0.5415 - val_accuracy: 0.8194 - val_loss: 0.4367
Epoch 5/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 148ms/step - accuracy: 0.8445 - loss: 0.3778 - val_accuracy: 0.8791 - val_loss: 0.2913
Epoch 6/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 151ms/step - accuracy: 0.8981 - loss: 0.2595 - val_accuracy: 0.8902 - val_loss: 0.2616
Epoch 7/10

**EVALUATE THE MODEL**

In [None]:
# Score `model` on the held-out test split; evaluate() yields the loss
# followed by the accuracy metric the model was compiled with.
lstm_test_scores = model.evaluate(x=X_test, y=y_test, verbose=1)
test_loss, test_accuracy = lstm_test_scores
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.9011 - loss: 0.2861
Test Loss: 0.2977585196495056
Test Accuracy: 0.8958823084831238


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Build the 1-D CNN binary classifier.
# Vocabulary size and sequence length are taken from the tokenizing cell
# (max_words / max_len) rather than repeated as literals, so the model stays
# in sync with the padded input. An explicit Input layer replaces Embedding's
# `input_length` argument, which newer Keras releases deprecate; declaring
# the input shape up front also lets model_cnn.summary() report concrete
# output shapes.
model_cnn = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=max_words, output_dim=16),
    Conv1D(128, 5, activation='relu'),  # 128 filters, kernel size 5
    GlobalMaxPooling1D(),               # collapse the sequence axis
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')      # sigmoid output for binary classification
])

# Compile the model
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary
model_cnn.summary()


In [None]:
# Train the CNN, stopping once the validation loss stops improving.
# With a fixed 10-epoch run the training loss keeps falling while val_loss
# bottoms out early and then climbs, so the final-epoch weights are an
# overfit model. EarlyStopping ends the run after two non-improving epochs
# and restores the weights of the best validation epoch, so the test
# evaluation below scores that model rather than the last one.
from tensorflow.keras.callbacks import EarlyStopping

early_stop_cnn = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

history_cnn = model_cnn.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=10,                 # upper bound; early stopping may end sooner
    batch_size=64,
    callbacks=[early_stop_cnn],
    verbose=1
)

# Evaluate the model on the test data
test_loss_cnn, test_accuracy_cnn = model_cnn.evaluate(X_test, y_test, verbose=1)

# Print results
print(f"\nCNN Model Test Loss: {test_loss_cnn}")
print(f"CNN Model Test Accuracy: {test_accuracy_cnn}")




Epoch 1/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 23ms/step - accuracy: 0.7096 - loss: 0.5182 - val_accuracy: 0.8909 - val_loss: 0.2617
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - accuracy: 0.9152 - loss: 0.2123 - val_accuracy: 0.8989 - val_loss: 0.2413
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - accuracy: 0.9432 - loss: 0.1502 - val_accuracy: 0.9034 - val_loss: 0.2402
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.9624 - loss: 0.1003 - val_accuracy: 0.8998 - val_loss: 0.2626
Epoch 5/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 23ms/step - accuracy: 0.9760 - loss: 0.0686 - val_accuracy: 0.9006 - val_loss: 0.3157
Epoch 6/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - accuracy: 0.9821 - loss: 0.0503 - val_accuracy: 0.9000 - val_loss: 0.3543
Epoch 7/10
[1m405/

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Build the combined CNN-LSTM model.
# Vocabulary size and sequence length are taken from the tokenizing cell
# (max_words / max_len) rather than repeated as literals, so the model stays
# in sync with the padded input. An explicit Input layer replaces Embedding's
# `input_length` argument, which newer Keras releases deprecate.
# There is deliberately no GlobalMaxPooling1D after the convolution: pooling
# would collapse the temporal dimension that the LSTM needs.
model_cnn_lstm = Sequential([
    Input(shape=(max_len,)),                                     # Padded id sequences
    Embedding(input_dim=max_words, output_dim=16),               # Embedding layer
    Conv1D(128, kernel_size=5, activation='relu'),               # CNN layer
    LSTM(64, return_sequences=False),                            # LSTM layer
    Dropout(0.2),                                                # Dropout layer
    Dense(32, activation='relu'),                                # Fully connected layer
    Dense(1, activation='sigmoid')                               # Output layer for binary classification
])

# Compile the model
model_cnn_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model_cnn_lstm.summary()

# Train the model, stopping once the validation loss stops improving.
# A fixed 10-epoch run continues after val_loss has started to worsen, so
# EarlyStopping ends the run after two non-improving epochs and restores the
# weights of the best validation epoch before the test evaluation.
early_stop_cnn_lstm = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

history_cnn_lstm = model_cnn_lstm.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=10,                 # upper bound; early stopping may end sooner
    batch_size=64,
    callbacks=[early_stop_cnn_lstm],
    verbose=1
)

# Evaluate the model on the test data
test_loss_cnn_lstm, test_accuracy_cnn_lstm = model_cnn_lstm.evaluate(X_test, y_test, verbose=1)

# Print results
print(f"\nCNN-LSTM Model Test Loss: {test_loss_cnn_lstm}")
print(f"CNN-LSTM Model Test Accuracy: {test_accuracy_cnn_lstm}")



Epoch 1/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 158ms/step - accuracy: 0.6023 - loss: 0.6478 - val_accuracy: 0.7675 - val_loss: 0.5148
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 150ms/step - accuracy: 0.5876 - loss: 0.6546 - val_accuracy: 0.7621 - val_loss: 0.5197
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 151ms/step - accuracy: 0.8073 - loss: 0.4395 - val_accuracy: 0.8847 - val_loss: 0.2724
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 160ms/step - accuracy: 0.9073 - loss: 0.2404 - val_accuracy: 0.8989 - val_loss: 0.2480
Epoch 5/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 171ms/step - accuracy: 0.9360 - loss: 0.1708 - val_accuracy: 0.8674 - val_loss: 0.3214
Epoch 6/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 150ms/step - accuracy: 0.9511 - loss: 0.1303 - val_accuracy: 0.8967 - val_loss: 0.2953
Epoch 7/10