In [2]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load your dataset
df = pd.read_csv("/content/clean_df")

# Features and labels
X = df['clean_text']
y = df['fraudulent']  # This should be binary: 0 (real), 1 (fake)

# Tokenization
max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)


In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [4]:
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)


Epoch 1/5
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 75ms/step - accuracy: 0.9428 - loss: 0.2672 - val_accuracy: 0.9629 - val_loss: 0.1357
Epoch 2/5
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 77ms/step - accuracy: 0.9733 - loss: 0.1001 - val_accuracy: 0.9682 - val_loss: 0.1244
Epoch 3/5
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 73ms/step - accuracy: 0.9832 - loss: 0.0620 - val_accuracy: 0.9689 - val_loss: 0.1389
Epoch 4/5
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 76ms/step - accuracy: 0.9888 - loss: 0.0441 - val_accuracy: 0.9626 - val_loss: 0.1591
Epoch 5/5
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 73ms/step - accuracy: 0.9952 - loss: 0.0191 - val_accuracy: 0.9710 - val_loss: 0.1339


In [5]:
from sklearn.metrics import roc_auc_score, classification_report

loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.4f}")

# ROC-AUC
y_probs = model.predict(X_test).ravel()
auc = roc_auc_score(y_test, y_probs)
print(f"ROC-AUC: {auc:.4f}")

# Thresholding to get final predictions
y_pred = (y_probs > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.9760 - loss: 0.1105
Test Accuracy: 0.9748
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step
ROC-AUC: 0.9090
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3395
           1       0.85      0.61      0.71       181

    accuracy                           0.97      3576
   macro avg       0.91      0.80      0.85      3576
weighted avg       0.97      0.97      0.97      3576

