In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MultiLabelBinarizer
import joblib


In [2]:
# Load datasets
train_dataset = pd.read_excel('E://PROJECTMINIS/DATASETS//hindi dataset//constraint_Hindi_Train.xlsx',engine='openpyxl')
val_dataset = pd.read_excel('E://PROJECTMINIS//DATASETS//hindi dataset//constraint_Hindi_Valid.xlsx',engine='openpyxl')
test_dataset = pd.read_excel('E://PROJECTMINIS//DATASETS//hindi dataset//Test Set Complete.xlsx',engine='openpyxl')
val_dataset.rename(columns={'Labels Set': 'labels'}, inplace=True)
train_dataset.rename(columns={'Labels Set': 'labels'}, inplace=True)
test_dataset.rename(columns={'Labels Set': 'labels'}, inplace=True)
train_dataset.rename(columns={'Post': 'text'}, inplace=True)
val_dataset.rename(columns={'Post': 'text'}, inplace=True)
test_dataset.rename(columns={'Post': 'text'}, inplace=True)
# Assuming 'labels' column contains comma-separated labels, e.g., "hostile,fake"
train_dataset['labels'] = train_dataset['labels'].apply(lambda x: x.split(','))
val_dataset['labels'] = val_dataset['labels'].apply(lambda x: x.split(','))
test_dataset['labels'] = test_dataset['labels'].apply(lambda x: x.split(','))

# MultiLabelBinarizer to convert string labels into binary matrix (0 or 1 for each label)
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_dataset['labels'])
y_val = mlb.transform(val_dataset['labels'])
y_test = mlb.transform(test_dataset['labels'])

# Tokenization and preprocessing
max_words = 10000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_dataset['text'])

# Convert texts to sequences
X_train = tokenizer.texts_to_sequences(train_dataset['text'])
X_val = tokenizer.texts_to_sequences(val_dataset['text'])
X_test = tokenizer.texts_to_sequences(test_dataset['text'])

# Pad sequences to ensure uniform input size
X_train_pad = pad_sequences(X_train, maxlen=max_sequence_length)
X_val_pad = pad_sequences(X_val, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test, maxlen=max_sequence_length)

# Define the model architecture
model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=max_words, output_dim=100, input_length=max_sequence_length))

# LSTM layer for sequence learning
model.add(LSTM(100, activation='tanh', return_sequences=True))
model.add(GlobalMaxPooling1D())

# Dense layer
model.add(Dense(128, activation='relu'))

# Output layer: one output for each label, with sigmoid activation
model.add(Dense(len(mlb.classes_), activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
model.fit(X_train_pad, y_train, validation_data=(X_val_pad, y_val), epochs=10, batch_size=64, callbacks=[early_stopping])

# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Generate predictions for the test data
test_predictions = model.predict(X_test_pad)

# Convert the probabilities to binary values (using a 0.5 threshold)
test_predictions_binary = (test_predictions > 0.5).astype(int)

# Print classification report
print(classification_report(y_test, test_predictions_binary, target_names=mlb.classes_))

# Optionally, print predicted probabilities for the first test instance (percentage confidence for each label)
print("Predicted percentages for the first test instance:")
for label, prob in zip(mlb.classes_, test_predictions[0]):
    print(f"{label}: {prob*100:.2f}%")




Epoch 1/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 212ms/step - accuracy: 0.5223 - loss: 0.5045 - val_accuracy: 0.6634 - val_loss: 0.3155
Epoch 2/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 218ms/step - accuracy: 0.6666 - loss: 0.3107 - val_accuracy: 0.6806 - val_loss: 0.2848
Epoch 3/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 225ms/step - accuracy: 0.6966 - loss: 0.2600 - val_accuracy: 0.6893 - val_loss: 0.2582
Epoch 4/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 229ms/step - accuracy: 0.7302 - loss: 0.2150 - val_accuracy: 0.7115 - val_loss: 0.2473
Epoch 5/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 230ms/step - accuracy: 0.7765 - loss: 0.1735 - val_accuracy: 0.7238 - val_loss: 0.2649
Epoch 6/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 220ms/step - accuracy: 0.7999 - loss: 0.1570 - val_accuracy: 0.7250 - val_loss: 0.2752
Epoch 7/10
[1m90/90[

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
import numpy as np

# Define a function to preprocess custom inputs
def preprocess_input(texts, tokenizer, max_sequence_length=100):
    # Tokenize the custom input texts
    sequences = tokenizer.texts_to_sequences(texts)
    # Pad the sequences to ensure the same input shape as training data
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)
    return padded_sequences

# Example custom inputs (these can be any sentences you want to classify)
custom_texts = [
 "ये दलित लोग समाज का कचरा हैं, इन्हें गाँव से भगा देना चाहिए।",
    "मुसलमानों को मार डालो, ये सब आतंकवादी हैं जो हिंदुस्तान को तबाह कर देंगे!",
    'पंजाबी लोग सिर्फ पैसा कमाने के लिए फर्जी दवाइयाँ बेचते हैं, ये लालची कुत्ते सबको लूट रहे हैं।',
    ': "हरियाणा के लोग बहुत मेहनती हैं, उनके खेतों से देश को इतना अनाज मिलता है'
]

# Preprocess the custom input texts
custom_input_data = preprocess_input(custom_texts, tokenizer)

# Get predictions for the custom inputs
predictions = model.predict(custom_input_data)

# Display the predicted probabilities for each label
for i, text in enumerate(custom_texts):
    print(f"Text: {text}")
    print("Predicted Categories and Percentages:")
    for label, prob in zip(mlb.classes_, predictions[i]):
        print(f"{label}: {prob * 100:.2f}%")
    print("\n" + "-" * 50 + "\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
Text: ये दलित लोग समाज का कचरा हैं, इन्हें गाँव से भगा देना चाहिए।
Predicted Categories and Percentages:
defamation: 3.36%
fake: 73.85%
hate: 3.94%
non-hostile: 17.17%
offensive: 1.43%

--------------------------------------------------

Text: मुसलमानों को मार डालो, ये सब आतंकवादी हैं जो हिंदुस्तान को तबाह कर देंगे!
Predicted Categories and Percentages:
defamation: 30.61%
fake: 13.73%
hate: 47.82%
non-hostile: 0.35%
offensive: 51.73%

--------------------------------------------------

Text: पंजाबी लोग सिर्फ पैसा कमाने के लिए फर्जी दवाइयाँ बेचते हैं, ये लालची कुत्ते सबको लूट रहे हैं।
Predicted Categories and Percentages:
defamation: 27.82%
fake: 20.05%
hate: 39.51%
non-hostile: 0.45%
offensive: 42.07%

--------------------------------------------------

Text: : "हरियाणा के लोग बहुत मेहनती हैं, उनके खेतों से देश को इतना अनाज मिलता है
Predicted Categories and Percentages:
defamation: 20.33%
fake: 39.66%
hate: 27.25%

In [4]:
joblib.dump(model, 'hindi_model.pkl')

['hindi_model.pkl']

In [5]:
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer



# Save the tokenizer as a .pkl file
with open('tokenizer_hindi.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("Tokenizer saved to tokenizer.pkl")


Tokenizer saved to tokenizer.pkl
