<a href="https://colab.research.google.com/github/Bimindu-aberathna/AI-Travel-app/blob/main/Fake_Review_Detection_with_CNN_%26_Bi_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Import packages
import kagglehub
import pandas as pd
import os
import re
import numpy as np
import sentencepiece as spm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, Conv1D, MaxPooling1D,
    LSTM, Dense, Dropout, GlobalMaxPooling1D,
    Bidirectional
)

In [None]:
from tensorflow.keras.layers import (
    Embedding, Conv1D, MaxPooling1D,
    LSTM, Dense, Dropout, GlobalMaxPooling1D,
    Bidirectional
)

In [None]:
# 2 Download & Load
# The Kaggle dataset handle and the CSV file name inside the downloaded
# folder are kept as named constants so they are easy to spot and change.
DATASET_HANDLE = "mexwell/fake-reviews-dataset"
CSV_FILENAME = "fake reviews dataset.csv"

# dataset_download returns the local directory that holds the dataset files.
path = kagglehub.dataset_download(DATASET_HANDLE)
file_path = os.path.join(path, CSV_FILENAME)
df = pd.read_csv(file_path)

In [None]:
# 3 Quick Inspect
# Show how many rows carry each raw label before any processing.
label_counts = df['label'].value_counts()
print("Original label counts:\n", label_counts, "\n")

# Optional: uncomment the lines below to work on a subset of rows while
# experimenting. Training on the full data is slow, but fewer rows usually
# means lower accuracy.
#df = df.head(2000)
# print("Using top-2000 label counts:\n", df['label'].value_counts(), "\n")

Original label counts:
 label
CG    20216
OR    20216
Name: count, dtype: int64 



In [None]:
# 4. Map text labels to numeric/bool
# 'CG' becomes the positive class (1) and 'OR' the negative class (0).
label_map = dict(CG=1, OR=0)
df['label_num'] = df['label'].map(label_map)

# Count from the new column so the printed series is named 'label_num'.
mapped_counts = df['label_num'].value_counts()
print("Mapped label counts:\n", mapped_counts, "\n")

Mapped label counts:
 label_num
1    20216
0    20216
Name: count, dtype: int64 



In [None]:
# 5. Clean Text
def clean_text(s):
    """Normalise one raw review into lowercase letters separated by single spaces.

    Processing order:
      1. lowercase the text,
      2. remove anything that looks like an HTML tag (``<...>``),
      3. remove URLs starting with ``http`` or ``www``,
      4. drop every character that is not ``a-z`` or whitespace,
      5. collapse whitespace runs to one space and trim both ends.

    Args:
        s: Raw review text. ``None`` and float NaN (what pandas uses for an
            empty CSV cell) are treated as missing text.

    Returns:
        The cleaned string; ``""`` when the input is missing or nothing
        survives the cleaning.
    """
    # Missing values have no .lower(); map them to empty text instead of
    # crashing the whole DataFrame.apply() call. `s != s` is True only for NaN.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = s.lower()
    s = re.sub(r"<.*?>", "", s)
    s = re.sub(r"http\S+|www\S+", "", s)
    s = re.sub(r"[^a-z\s]", "", s)
    return re.sub(r"\s+", " ", s).strip()

# Clean every raw review in 'text_' and store the result in a new 'text' column.
raw_reviews = df['text_']
df['text'] = raw_reviews.apply(clean_text)

In [None]:
# 6. Train/Test Split (stratified on numeric labels)
# 80/20 split; stratifying on label_num keeps the class ratio the same in
# both parts, and the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label_num'],
)

# Target vectors as plain NumPy arrays, in the same row order as the frames.
y_train, y_test = (part['label_num'].values for part in (train_df, test_df))

In [None]:
# 7. Train SentencePiece on the training split only (sub-word / BPE tokenization).
# Fitting the tokenizer on train_df alone keeps test text out of the vocabulary.

# Write the corpus as plain text, one cleaned review per line. Series.to_csv
# applies CSV quoting (an empty review is written as ""), which would put
# quote characters into the tokenizer's training data, so the lines are
# written directly and empty reviews are skipped.
with open('train_corpus.txt', 'w', encoding='utf-8') as corpus_file:
    for review in train_df['text']:
        if review:
            corpus_file.write(review + '\n')

# Reserve id 0 for <pad>. By default SentencePiece gives id 0 to <unk> and
# leaves padding disabled; adding '<pad>' through control_symbols places it
# after the reserved ids, so it never becomes id 0. pad_sequences pads with 0
# and Embedding(mask_zero=True) masks 0, so id 0 has to be the padding piece
# for padding and masking to agree.
spm.SentencePieceTrainer.Train(
    input='train_corpus.txt',
    model_prefix='spm_bpe',
    vocab_size=10000,
    character_coverage=1.0,
    model_type='bpe',
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3
)

# Load the trained model; Load() is left as the last expression so the cell
# still displays its return value.
sp = spm.SentencePieceProcessor()
sp.Load('spm_bpe.model')

True

In [None]:

from sklearn.utils.class_weight import compute_class_weight

In [None]:
# 8. Encode & Pad
# Every review is represented by exactly `maxlen` token ids.
maxlen = 150


def encode_and_pad(texts):
    """Turn an iterable of cleaned review strings into a padded id matrix.

    Each text is encoded to integer piece ids with the SentencePiece
    processor `sp`; the resulting sequences are then padded at the end
    (padding='post') to `maxlen` columns. The pad value and the side from
    which over-long sequences are cut are left at pad_sequences' defaults.

    Args:
        texts: Iterable of strings (e.g. a pandas Series of cleaned reviews).

    Returns:
        The array returned by pad_sequences, one row per input text.
    """
    token_ids = []
    for review in texts:
        token_ids.append(sp.encode(review, out_type=int))
    return pad_sequences(token_ids, maxlen=maxlen, padding='post')


X_train = encode_and_pad(train_df['text'])
X_test = encode_and_pad(test_df['text'])

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LeakyReLU,BatchNormalization

In [None]:
# 9. Build CNN + LSTM Model
# The embedding table needs one row per SentencePiece piece.
vocab_size = sp.get_piece_size()
embedding_dim = 128

model = Sequential()
# Token ids -> dense vectors; id 0 is flagged as padding via mask_zero.
# NOTE(review): the next layer is a Conv1D - confirm the padding mask is
# handled the way you expect once it passes through the convolution.
model.add(Embedding(vocab_size, embedding_dim, mask_zero=True))
# Local n-gram style features: 128 filters over windows of 3 tokens.
model.add(Conv1D(128, 3, activation='relu', padding='same'))
# Bidirectional LSTM reads the feature sequence in both directions and
# returns an output for every time step.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
# Keep the strongest activation of each feature across all time steps.
model.add(GlobalMaxPooling1D())
# Small classification head.
model.add(Dense(64))
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.3))
# Single sigmoid unit: probability of the positive class (label_num == 1).
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
#model.summary()



In [None]:
# 10. Train
# history = model.fit(
#     X_train, y_train,
#     epochs=5,
#     batch_size=32,
#     validation_data=(X_test, y_test)
# )
# Compute one 'balanced' weight per class found in y_train.
class_labels = np.unique(y_train)
cw = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train,
)
# Keras expects {class index: weight}; the weights are paired with their
# position in `cw`.
class_weight = dict(enumerate(cw))

# Tune `epochs` and `batch_size` to look for a better score.
# NOTE(review): validation_data is the same X_test / y_test that the
# evaluation cell scores afterwards, so tuning against val_* metrics also
# tunes against the reported test numbers.
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=13,
    batch_size=32,
    class_weight=class_weight,
)

Epoch 1/13




[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 19ms/step - accuracy: 0.8325 - loss: 0.3421 - val_accuracy: 0.9413 - val_loss: 0.1659
Epoch 2/13
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19ms/step - accuracy: 0.9612 - loss: 0.0992 - val_accuracy: 0.9450 - val_loss: 0.1434
Epoch 3/13
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 18ms/step - accuracy: 0.9832 - loss: 0.0494 - val_accuracy: 0.9487 - val_loss: 0.1761
Epoch 4/13
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 18ms/step - accuracy: 0.9913 - loss: 0.0261 - val_accuracy: 0.9461 - val_loss: 0.1846
Epoch 5/13
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 18ms/step - accuracy: 0.9942 - loss: 0.0172 - val_accuracy: 0.9439 - val_loss: 0.2177
Epoch 6/13
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 18ms/step - accuracy: 0.9963 - loss: 0.0107 - val_accuracy: 0.9470 - val_loss: 0.2442
Epoch 7/13
[1m

<keras.src.callbacks.history.History at 0x7dbbe0bd7a50>

In [None]:
# 11. Evaluate & Confusion Matrix
# Probabilities above this value are counted as the positive class (1).
DECISION_THRESHOLD = 0.5

y_pred_prob = model.predict(X_test)
# Flatten the model output to 1-D, then threshold into 0/1 integer labels.
y_pred = (y_pred_prob.flatten() > DECISION_THRESHOLD).astype(int)

conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)

print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(report)

[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step

Confusion Matrix:
[[3921  123]
 [ 348 3695]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9185    0.9696    0.9433      4044
           1     0.9678    0.9139    0.9401      4043

    accuracy                         0.9418      8087
   macro avg     0.9431    0.9418    0.9417      8087
weighted avg     0.9431    0.9418    0.9417      8087

