In [5]:
# Import necessary libraries
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Location of the tab-separated dataset (one "<label>\t<text>" record per line)
file_path = "C:\\Users\\adity\\Downloads\\kaggle1.txt"

# Read the file as a two-column table. The file has no header row, so the
# column names are supplied here; lines that cannot be parsed are skipped.
data = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['label', 'text'],
    on_bad_lines='skip',
)

# Preview the first rows to confirm the file was parsed as expected
print(data.head())

# Preprocess data
texts = data['text']  # Conversation texts (model input)
labels = data['label']  # Raw string labels: 'fraud' or 'normal'

# Encode labels (fraud -> 1, normal -> 0).
# LabelEncoder numbers classes in sorted order, which would yield fraud -> 0 and
# normal -> 1 and make the network's sigmoid output mean P(normal). The targets
# are therefore built explicitly so that an output above 0.5 means "fraud".
label_encoder = LabelEncoder()
# The encoder is still fitted so `label_encoder.classes_` lists the label values
# found in the data; its own integer ids are NOT the training targets.
label_encoder.fit(labels)

# A binary classifier cannot represent anything other than the two known
# classes, so fail loudly instead of silently mislabelling rows.
unexpected_labels = set(label_encoder.classes_) - {'fraud', 'normal'}
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values in dataset: {unexpected_labels!r}; "
        "expected only 'fraud' and 'normal'."
    )

encoded_labels = (labels == 'fraud').astype(int).to_numpy()

# Hold out 20% of the samples for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features over unigrams and bigrams, capped at 5000 terms, with
# English stop words removed.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them. Both are densified for Keras.
train_matrix = tfidf_vectorizer.fit_transform(X_train)
X_train_tfidf = train_matrix.toarray()
test_matrix = tfidf_vectorizer.transform(X_test)
X_test_tfidf = test_matrix.toarray()

# Build neural network model: two hidden ReLU layers, each followed by dropout,
# and a single sigmoid unit for binary classification.
# The input width (number of TF-IDF features) is declared with an explicit
# Input object as the first entry rather than an `input_shape` argument on the
# first Dense layer, which Keras warns against for Sequential models.
model = Sequential([
    Input(shape=(X_train_tfidf.shape[1],)),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Watch validation loss with a patience of 3 epochs and restore the
# best-scoring weights when training stops early (guards against overfitting).
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train for at most 20 epochs in batches of 32, holding out 20% of the
# training data for validation.
history = model.fit(
    X_train_tfidf,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

# Measure loss and accuracy on the held-out test split and report accuracy
test_metrics = model.evaluate(X_test_tfidf, y_test)
loss, accuracy = test_metrics
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Persist the trained network to disk
model.save('fraud_detection_model.h5')

# Persist the fitted TF-IDF vectorizer alongside it, so new text can be
# transformed with the same vocabulary that the model was trained on
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

# Predict on new data
def predict_fraud(conversation):
    """Classify a single conversation string as "Fraud" or "Real".

    The text is vectorized with the module-level ``tfidf_vectorizer`` and
    scored by the module-level ``model``. A score above 0.5 is reported as
    "Fraud", anything else as "Real".

    NOTE(review): this treats the model output as the probability of fraud,
    i.e. it assumes fraud was encoded as 1 in the training targets; verify
    against the label encoding used for training.

    Args:
        conversation: The message text to classify.

    Returns:
        The string "Fraud" or "Real".
    """
    # The vectorizer expects an iterable of documents, hence the one-item list
    features = tfidf_vectorizer.transform([conversation]).toarray()
    scores = model.predict(features)
    if scores[0] > 0.5:
        return "Fraud"
    return "Real"

# Smoke-test the classifier on a sample message
sample_conversation = "Your account has been compromised. Provide your PIN to secure it."
print(predict_fraud(sample_conversation))


    label                                               text
0   fraud  hello, i m bank manager of SBI, ur debit card ...
1   fraud  Todays Vodafone numbers ending with 4882 are s...
2  normal               Please don't say like that. Hi hi hi
3  normal                                         Thank you!
4  normal  Oh that was a forwarded message. I thought you...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.8772 - loss: 0.3813 - val_accuracy: 0.9167 - val_loss: 0.1377
Epoch 2/20
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9558 - loss: 0.1077 - val_accuracy: 0.9800 - val_loss: 0.0795
Epoch 3/20
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9913 - loss: 0.0306 - val_accuracy: 0.9789 - val_loss: 0.0866
Epoch 4/20
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9947 - loss: 0.0126 - val_accuracy: 0.9800 - val_loss: 0.0882
Epoch 5/20
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9991 - loss: 0.0045 - val_accuracy: 0.9778 - val_loss: 0.0927
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9728 - loss: 0.0828




Model Accuracy: 98.23%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
Fraud
