In [1]:
import joblib
mlb = joblib.load("mlb.pkl")
print("MultiLabelBinarizer loaded!")

MultiLabelBinarizer loaded!


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
import whisper

# Load the Whisper model

# # in gpu
# whisper_model = whisper.load_model("small").to("cuda")
# # Check if the model is on GPU
# print(next(whisper_model.parameters()).device)  # Should print "cuda:0"

whisper_model = whisper.load_model("small").to("cpu")

print("Whisper model loaded!")

Whisper model loaded!


In [7]:

# Transcribe Hindi audio
#31
#33
audio_path = "C:/Users/deepa/OneDrive/Desktop/Spechaudio/vishnu_31.mp3"
result = whisper_model.transcribe(audio_path, language="hi")

# Extract transcribed text
transcribed_text = result["text"]
print("Transcribed Text:", transcribed_text)




Transcribed Text:  अद्रस्पोर्ट् कोन, हैश भेशर्मी और हैश भेईमानी की इंतिहांतो देकिए ज़रा, ये आद्मेश जो दोहाँसाद के जुदुसो में इस तरे नाच्ता ता, आज खुद को चत्रपती शिवाजी महाराच का विरासब्ट का चेकेडार बताता है.


In [8]:
import re

def clean_text(text):
    # Remove URLs and punctuation, then extra spaces
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


cleaned_text = clean_text(transcribed_text)
print("Cleaned Text:", cleaned_text)


Cleaned Text: अदरसपरट कन हश भशरम और हश भईमन क इतहत दकए जर य आदमश ज दहसद क जदस म इस तर नचत त आज खद क चतरपत शवज महरच क वरसबट क चकडर बतत ह


In [9]:
from transformers import AutoTokenizer
import torch

# Load tokenizer for IndicBERT
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

# Convert cleaned text into token IDs
tokens = tokenizer(cleaned_text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")['input_ids']
tokens = tokens.to("cuda" if torch.cuda.is_available() else "cpu")


In [10]:
import torch.nn as nn

# Define Hybrid Model class
class HybridModel(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(HybridModel, self).__init__()
        self.bert = bert_model
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.fc = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids):
        with torch.no_grad():
            bert_output = self.bert(input_ids).last_hidden_state
        conv1_out = self.relu(self.conv1(bert_output.permute(0, 2, 1)))
        conv2_out = self.relu(self.conv2(conv1_out))
        pooled = torch.mean(conv2_out, dim=2)
        output = self.sigmoid(self.fc(pooled))
        return output

# Load BERT model
from transformers import AutoModel

indic_bert = AutoModel.from_pretrained("ai4bharat/indic-bert")

# Define and load the trained model
num_classes = 5  # Change this based on your number of sentence types
device = "cuda" if torch.cuda.is_available() else "cpu"

model = HybridModel(indic_bert, num_classes)
model.load_state_dict(torch.load(r"./nlp_model/best_model.pth", map_location=device))
model.to(device)
model.eval()

# Predict the sentence type
with torch.no_grad():
    output = model(tokens)

# Convert prediction to labels
predicted_labels = (output.cpu().numpy() > 0.5).astype(int)  # Binary thresholding
print("Predicted Sentence Type:", mlb.inverse_transform(predicted_labels))


Predicted Sentence Type: [('fake', 'hate')]
