In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score

# Custom Hindi stopwords, kept as whitespace-separated text and split into a
# list. Order and the repeated 'कौन' entry are preserved exactly.
stopwords_hindi = """
    अत अपना अपनी अपने अभी अंदर आदि आप इत्यादि इन इनका इन्हीं इन्हें इन्हों
    इस इसका इसकी इसके इसमें इसी इसे उन उनका उनकी उनके उनको उन्हीं उन्हें
    उन्हों उस उसके उसी उसे एक एवं एस ऐसे और कई कर करता करते करना
    करने करें कहते कहा का काफ़ी कि कितना किन्हें किन्हों किया किर किस
    किसी किसे की कुछ कुल के को कोई कौन कौन बही बहुत बाद बाला बिलकुल
    भी भीतर मगर मानो मे में यदि यह यहाँ यही या यिह ये रखें रहा रहे
    ऱ्वासा लिए लिये लेकिन व वग़ैरह वर्ग वह वहाँ वहीं वाले वुह वे सकता
    सकते सबसे सभी साथ साबुत साभ सारा से सो संग ही हुआ हुई हुए है
    हैं हो होता होती होते होना होने
""".split()

# Read the CSV and immediately discard every row containing a missing value
data = pd.read_csv('/content/randomized_combined_dataset.csv').dropna()

# 'title' is the model input, 'label' is the prediction target
x, y = data['title'], data['label']

# Preprocessing
# Remove stopwords from every title and re-join the remaining tokens with a
# single space. Membership is tested against a set so each lookup is O(1)
# instead of a linear scan of the stopword list for every token.
stopword_set = set(stopwords_hindi)
corpus = [
    # str() keeps a stray non-string cell (e.g. a purely numeric title) from
    # crashing on .split()
    ' '.join(token for token in str(title).split() if token not in stopword_set)
    for title in x
]

# Tokenization and Padding
# one_hot() hashes words with Python's built-in hash(), which is salted per
# interpreter process, so the same word gets a different index after every
# kernel restart. hashing_trick() with md5 uses the same text filtering and
# lower-casing defaults but produces stable indices in [1, voc_size - 1].
from tensorflow.keras.preprocessing.text import hashing_trick

voc_size = 5000
one_hot_repr = [hashing_trick(words, voc_size, hash_function='md5') for words in corpus]
# Bring every sequence to exactly 20 ids; shorter ones get zeros appended
padded = pad_sequences(one_hot_repr, padding='post', maxlen=20)

# Model Building
# Embedding -> bidirectional LSTM -> two dropout-regularised dense layers ->
# single sigmoid unit for binary classification.
embed_dim = 40
model = Sequential()
model.add(Embedding(voc_size, embed_dim, input_length=20))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train-Test Split: 70 % train / 30 % test with a fixed seed for the shuffle
x, y = np.array(padded), np.array(y)
trainX, testX, trainY, testY = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

# Training: 10 epochs, mini-batches of 64, test split monitored as validation
history = model.fit(
    trainX,
    trainY,
    batch_size=64,
    epochs=10,
    validation_data=(testX, testY),
)

# Predictions and Evaluation
# Run inference once and reuse the probabilities for every metric below.
pred = model.predict(testX)
probs = pred[:, 0]  # predicted probabilities as a 1-D array
# Threshold at 0.5 to obtain hard 0/1 class labels
binary_predictions = (probs >= 0.5).astype(int).tolist()

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall are silently swapped.
print('Accuracy on testing set:', accuracy_score(testY, binary_predictions))
print('Precision on testing set:', precision_score(testY, binary_predictions))
print('Recall on testing set:', recall_score(testY, binary_predictions))

# Confusion Matrix and AUC
cm = confusion_matrix(testY, binary_predictions)
# AUC is computed on the raw probabilities, not the thresholded labels
auc = roc_auc_score(testY, probs)

print('Confusion Matrix:\n', cm)
print('AUC:', auc)



Epoch 1/10




[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 60ms/step - accuracy: 0.8118 - loss: 0.3786 - val_accuracy: 0.9065 - val_loss: 0.2232
Epoch 2/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 59ms/step - accuracy: 0.9275 - loss: 0.1814 - val_accuracy: 0.9040 - val_loss: 0.2267
Epoch 3/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 60ms/step - accuracy: 0.9435 - loss: 0.1446 - val_accuracy: 0.9090 - val_loss: 0.2269
Epoch 4/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 59ms/step - accuracy: 0.9595 - loss: 0.1072 - val_accuracy: 0.9026 - val_loss: 0.2889
Epoch 5/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 60ms/step - accuracy: 0.9727 - loss: 0.0722 - val_accuracy: 0.8995 - val_loss: 0.3881
Epoch 6/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 60ms/step - accuracy: 0.9804 - loss: 0.0502 - val_accuracy: 0.9022 - val_loss: 0.4099
Epoch 7/10
[1m679/679[0m 

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score

# Custom Hindi stopwords, kept as whitespace-separated text and split into a
# list. Order and the repeated 'कौन' entry are preserved exactly.
stopwords_hindi = """
    अत अपना अपनी अपने अभी अंदर आदि आप इत्यादि इन इनका इन्हीं इन्हें इन्हों
    इस इसका इसकी इसके इसमें इसी इसे उन उनका उनकी उनके उनको उन्हीं उन्हें
    उन्हों उस उसके उसी उसे एक एवं एस ऐसे और कई कर करता करते करना
    करने करें कहते कहा का काफ़ी कि कितना किन्हें किन्हों किया किर किस
    किसी किसे की कुछ कुल के को कोई कौन कौन बही बहुत बाद बाला बिलकुल
    भी भीतर मगर मानो मे में यदि यह यहाँ यही या यिह ये रखें रहा रहे
    ऱ्वासा लिए लिये लेकिन व वग़ैरह वर्ग वह वहाँ वहीं वाले वुह वे सकता
    सकते सबसे सभी साथ साबुत साभ सारा से सो संग ही हुआ हुई हुए है
    हैं हो होता होती होते होना होने
""".split()

# Read the CSV and immediately discard every row containing a missing value
data = pd.read_csv('/content/randomized_combined_dataset.csv').dropna()

# 'title' is the model input, 'label' is the prediction target
x, y = data['title'], data['label']

# Preprocessing
# Remove stopwords from every title and re-join the remaining tokens with a
# single space. Membership is tested against a set so each lookup is O(1)
# instead of a linear scan of the stopword list for every token.
stopword_set = set(stopwords_hindi)
corpus = [
    # str() keeps a stray non-string cell (e.g. a purely numeric title) from
    # crashing on .split()
    ' '.join(token for token in str(title).split() if token not in stopword_set)
    for title in x
]

# Tokenization and Padding
# one_hot() hashes words with Python's built-in hash(), which is salted per
# interpreter process, so the same word gets a different index after every
# kernel restart. hashing_trick() with md5 uses the same text filtering and
# lower-casing defaults but produces stable indices in [1, voc_size - 1].
from tensorflow.keras.preprocessing.text import hashing_trick

voc_size = 5000
one_hot_repr = [hashing_trick(words, voc_size, hash_function='md5') for words in corpus]
# Bring every sequence to exactly 20 ids; shorter ones get zeros appended
padded = pad_sequences(one_hot_repr, padding='post', maxlen=20)

# Model Building
# The layer stack is listed first, then handed to Sequential in one go.
embed_dim = 40
layer_stack = [
    Embedding(voc_size, embed_dim, input_length=20),  # token ids -> dense vectors
    Bidirectional(LSTM(100)),                         # reads the sequence both ways
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),                   # probability of the positive class
]
model = Sequential(layer_stack)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train-Test Split: 70 % train / 30 % test with a fixed seed for the shuffle
x, y = np.array(padded), np.array(y)
trainX, testX, trainY, testY = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

# Training: 10 epochs, mini-batches of 64, test split monitored as validation
history = model.fit(
    trainX,
    trainY,
    batch_size=64,
    epochs=10,
    validation_data=(testX, testY),
)

# Predictions and Evaluation
pred = model.predict(testX)
probs = pred.flatten()  # predicted probabilities as a 1-D array
# Vectorised 0.5 threshold, converted back to a plain list of 0/1 ints
binary_predictions = (probs >= 0.5).astype(int).tolist()

# Report the label-based metrics; each is called as metric(y_true, y_pred)
for metric_name, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
):
    print(f'{metric_name} on testing set:', metric_fn(testY, binary_predictions))

# Confusion Matrix and AUC
cm = confusion_matrix(testY, binary_predictions)
auc = roc_auc_score(testY, probs)  # ranking metric, so it uses raw probabilities

print('Confusion Matrix:\n', cm)
print('AUC:', auc)


Epoch 1/10




[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 59ms/step - accuracy: 0.8111 - loss: 0.3791 - val_accuracy: 0.9043 - val_loss: 0.2219
Epoch 2/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 59ms/step - accuracy: 0.9291 - loss: 0.1796 - val_accuracy: 0.9034 - val_loss: 0.2453
Epoch 3/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 58ms/step - accuracy: 0.9455 - loss: 0.1423 - val_accuracy: 0.9042 - val_loss: 0.2475
Epoch 4/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 59ms/step - accuracy: 0.9619 - loss: 0.1026 - val_accuracy: 0.9010 - val_loss: 0.2732
Epoch 5/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 60ms/step - accuracy: 0.9706 - loss: 0.0793 - val_accuracy: 0.8999 - val_loss: 0.3780
Epoch 6/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 60ms/step - accuracy: 0.9794 - loss: 0.0588 - val_accuracy: 0.8964 - val_loss: 0.3739
Epoch 7/10
[1m679/679[0m 

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Optimized Model
# Lighter variant of the earlier network: 50 LSTM units instead of 100 and a
# single hidden Dense layer. Vocabulary size and embedding width are reused
# unchanged (voc_size, embed_dim). The sequence length is read from trainX so
# the declared input length always matches the padded data being fed in. The
# previous hard-coded 15 disagreed with the 20-token padding applied upstream.
model = Sequential([
    Embedding(voc_size, embed_dim, input_length=trainX.shape[1]),
    Bidirectional(LSTM(50)),  # Reduced LSTM units
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early Stopping: halt once val_loss has not improved for 2 consecutive epochs
# and roll the model back to the best-scoring weights.
# NOTE(review): validation_data below is the test split, so the restored
# weights are selected on test data and later "test" metrics will be
# optimistic. Consider holding out part of trainX for validation instead.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Training
history = model.fit(trainX, trainY, epochs=5, validation_data=(testX, testY), batch_size=32, callbacks=[early_stopping])


Epoch 1/5




[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 24ms/step - accuracy: 0.8380 - loss: 0.3481 - val_accuracy: 0.9021 - val_loss: 0.2316
Epoch 2/5
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 24ms/step - accuracy: 0.9291 - loss: 0.1744 - val_accuracy: 0.9035 - val_loss: 0.2275
Epoch 3/5
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 24ms/step - accuracy: 0.9470 - loss: 0.1264 - val_accuracy: 0.9042 - val_loss: 0.2353
Epoch 4/5
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 23ms/step - accuracy: 0.9647 - loss: 0.0927 - val_accuracy: 0.8985 - val_loss: 0.2765
