<a href="https://colab.research.google.com/github/AbeerProg/RRDS/blob/main/LSTM_AugData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install --upgrade nltk



In [3]:
import pandas as pd
import numpy as np
import random
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# Ensure necessary NLTK resources are downloaded.
# NLTK >= 3.9 reads the sentence tokenizer from the 'punkt_tab' package rather
# than the pickled 'punkt' one. Because the install cell upgrades NLTK to the
# newest release, fetch both so tokenization works on old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# ------------------------------
# 1. Load and Preprocess the Dataset
# ------------------------------

# Load your dataset
df = pd.read_excel("AuDS.xlsx")

# Clean column names. Cast to str first: Excel header cells holding numbers or
# dates are not strings, and the .str accessor would either raise on them or
# replace them with NaN.
df.columns = df.columns.astype(str).str.strip()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Define column names (adjust if necessary)
text_col = "text"
label_col = "label"

# Numeric columns are the ones between text and label. Anchor the slice on the
# two column names rather than on fixed positions, so an extra leading or
# trailing column in the sheet cannot slip the text/label into the features.
# (list.index raises ValueError right here if either column is missing.)
all_cols = df.columns.tolist()
numeric_cols = all_cols[all_cols.index(text_col) + 1:all_cols.index(label_col)]

# Preprocess text: Tokenization & Padding
max_words = 5000    # vocabulary size
max_len = 200       # maximum sequence length

# Cast the text column to str once and reuse it for fitting and encoding.
texts = df[text_col].astype(str)

# NOTE(review): the vocabulary is fitted on every row, including rows that the
# later train/test split assigns to the test set - consider fitting on the
# training text only if a strict hold-out is required.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# Encode each text as a list of word indices, then pad/truncate to max_len.
X_text = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)

In [5]:

# Preprocess numeric features: coerce to numbers, treating unparsable or
# missing cells as 0.
X_numeric = df[numeric_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

# Get labels
y = df[label_col]

# Split data into training and testing sets BEFORE scaling, so the scaler
# never sees test rows (fitting it on the full dataset leaks test-set mean and
# variance into the training features).
X_train_text, X_test_text, X_train_num_raw, X_test_num_raw, y_train, y_test = train_test_split(
    X_text, X_numeric, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num_raw)
X_test_num = scaler.transform(X_test_num_raw)

# Kept for any later cell that expects the fully scaled matrix; it now uses
# the statistics learned from the training rows.
X_numeric_scaled = scaler.transform(X_numeric)


In [6]:
# ------------------------------
# 2. Build the LSTM Model
# ------------------------------

# Text branch: Input, Embedding, and LSTM
text_input = Input(shape=(max_len,), name="text_input")
# `input_length` is omitted on purpose: the sequence length is already fixed by
# `text_input`, and Keras 3 deprecates the argument (it only emits a warning).
embedding_layer = Embedding(input_dim=max_words, output_dim=100)(text_input)
# NOTE(review): per the Keras LSTM docs, a non-zero recurrent_dropout keeps the
# layer off the fused cuDNN kernel on GPU; set it to 0 if training speed matters.
lstm_out = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# Numeric branch: Input and Dense layer
num_features = X_train_num.shape[1]
numeric_input = Input(shape=(num_features,), name="numeric_input")
numeric_dense = Dense(32, activation="relu")(numeric_input)

# Concatenate both branches
combined = Concatenate()([lstm_out, numeric_dense])
combined_dense = Dense(32, activation="relu")(combined)
combined_dropout = Dropout(0.3)(combined_dense)
output = Dense(1, activation="sigmoid")(combined_dropout)  # For binary classification

# Build and compile the model
model = Model(inputs=[text_input, numeric_input], outputs=output)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()





In [7]:
# ------------------------------
# 3. Train the Model
# ------------------------------

# Stop once validation loss has failed to improve for 3 epochs and roll the
# model back to the best weights observed.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

# Both input branches are fed together; 10% of the training rows are held out
# for validation.
history = model.fit(
    x=[X_train_text, X_train_num],
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    callbacks=[early_stopping],
    validation_split=0.1,
)

# ------------------------------
# 4. Evaluate the Model
# ------------------------------

# evaluate() yields the loss followed by the compiled accuracy metric.
loss, accuracy = model.evaluate(x=[X_test_text, X_test_num], y=y_test)
print(f"Test Accuracy: {accuracy:.2f}")


Epoch 1/10
2417/2417 ━━━━━━━━━━━━━━━━━━━━ 648s 266ms/step - accuracy: 0.8512 - loss: 0.3184 - val_accuracy: 0.9469 - val_loss: 0.1410
Epoch 2/10
2417/2417 ━━━━━━━━━━━━━━━━━━━━ 705s 275ms/step - accuracy: 0.9520 - loss: 0.1225 - val_accuracy: 0.9645 - val_loss: 0.0944
Epoch 3/10
2417/2417 ━━━━━━━━━━━━━━━━━━━━ 675s 273ms/step - accuracy: 0.9735 - loss: 0.0724 - val_accuracy: 0.9778 - val_loss: 0.0651
Epoch 4/10
2417/2417 ━━━━━━━━━━━━━━━━━━━━ 686s 274ms/step - accuracy: 0.9833 - loss: 0.0470 - val_accuracy: 0.9817 - val_loss: 0.0541
Epoch 5/10
2417/2417 ━━━━━━━━━━━━━━━━━━━━ 691s 278ms/step - accuracy: 0.9881 - loss: 0.0354 - val_accuracy: 0.9829 - val_loss: 0.0501
Epoch 6/10
[1m2417/2417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m675s[0m 275ms/step - accuracy: 0.9908 - loss: 0.0255 - val_accuracy: 0.9881 - val_loss: