In [None]:
# Importing necessary libraries
import io
import re
import string

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
import csv
from google.colab import files

In [None]:
# Ask the user to upload the dataset through Colab's file-upload helper and
# keep whatever `files.upload()` returns in `lyrics_file`.
# NOTE(review): the recorded run below saved the upload under a renamed copy
# ("book1 (1).csv") because a "book1.csv" already existed -- confirm that later
# cells read the file that was just uploaded rather than the older copy.
lyrics_file = files.upload()

Saving book1.csv to book1 (1).csv


In [None]:
# Reading the data
# Colab renames an upload when a file with the same name already exists (the
# recorded run saved "book1.csv" as "book1 (1).csv"), so reading the fixed
# path from disk can silently load a stale earlier copy. Prefer the bytes that
# were just uploaded and only fall back to the on-disk file when no usable
# upload is available.
DATA_FILENAME = 'book1.csv'

if DATA_FILENAME in lyrics_file:
    uploaded_bytes = lyrics_file[DATA_FILENAME]
elif len(lyrics_file) == 1:
    # A single upload under a different key is taken to be the dataset.
    uploaded_bytes = next(iter(lyrics_file.values()))
else:
    uploaded_bytes = None

if uploaded_bytes is not None:
    data = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    data = pd.read_csv(DATA_FILENAME)

In [None]:
# Removing unwanted characters and digits from the text
# Pass 1 keeps only ASCII letters and the Devanagari block (U+0900-U+097F);
# every other run of characters collapses to a single space.
_NON_LYRIC_CHARS = re.compile('[^a-zA-Z\u0900-\u097F]+')
# Pass 2 removes digits that survive pass 1: the Devanagari block contains its
# own digits, and for str patterns the digit class matches any Unicode decimal
# digit. Raw string avoids the invalid-escape warning the plain literal raised.
_DIGIT_RUNS = re.compile(r'\d+')


def _clean_lyric(text):
    """Return `text` with non-lyric characters and digits replaced by spaces.

    Missing values (None / NaN) become an empty string instead of the literal
    word "nan", which would otherwise end up as a token in the vocabulary.
    """
    if pd.isna(text):
        return ''
    without_symbols = _NON_LYRIC_CHARS.sub(' ', str(text))
    return _DIGIT_RUNS.sub(' ', without_symbols)


data['lyrics'] = data['lyrics'].apply(_clean_lyric)

In [None]:
# Tokenizing the data
# Build the vocabulary (capped at the 5000 most frequent words) from the
# cleaned lyrics, then turn every lyric into an integer sequence padded to a
# common length. Labels come straight from the 'status' column.
lyric_texts = data['lyrics'].values

tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(lyric_texts)

X = pad_sequences(tokenizer.texts_to_sequences(lyric_texts))
y = data['status'].values

In [None]:
# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Replace missing values with a default value
default_value = 0


def _labels_to_int(labels, fill_value):
    """Fill missing labels with `fill_value` and cast the result to int."""
    filled = pd.Series(labels).fillna(fill_value).values
    # Convert boolean values to integers
    return filled.astype(int)


y_train = _labels_to_int(y_train, default_value)
y_test = _labels_to_int(y_test, default_value)

In [None]:
# Creating the model
# Embedding -> single LSTM layer -> sigmoid unit that outputs one probability
# per lyric. The embedding's vocabulary size (5000) matches the tokenizer's
# `num_words`, and its input length is the padded sequence width of X.
model = Sequential([
    Embedding(5000, 128, input_length=X.shape[1]),
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Decision threshold applied to the predicted probabilities in later cells
threshold = 0.2

# Fit the LSTM on the training split: 15 passes over the data in batches of 32
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=15,
)



Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fdc6d183520>

In [None]:
# Score the held-out lyrics and binarise the probabilities with the custom
# threshold (a probability equal to the threshold counts as positive)
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob >= threshold).astype(int)

# Compute accuracy, F1, precision and recall, in that order
accuracy, f1_score_val, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

# Report each metric on its own line
for label, value in (
    ("Accuracy: ", accuracy),
    ("F1 Score: ", f1_score_val),
    ("Precision: ", precision),
    ("Recall: ", recall),
):
    print(label, value)

Accuracy:  0.8104575163398693
F1 Score:  0.4912280701754386
Precision:  0.45161290322580644
Recall:  0.5384615384615384


In [None]:
# Function to predict the explicitness of user input lyric
def predict_explicitness(lyric, threshold):
    """Classify a single lyric as 'Explicit' or 'Not explicit'.

    The lyric goes through the same character and digit stripping applied to
    the training text, is lower-cased, tokenized with the notebook-level
    `tokenizer`, padded to the training sequence width (`X.shape[1]`) and
    scored by the notebook-level `model`.

    Parameters
    ----------
    lyric : str
        Raw lyric text. None is treated as an empty lyric; any other
        non-string value is converted with `str()`, as the training cleanup
        does.
    threshold : float
        Decision threshold on the predicted probability.

    Returns
    -------
    str
        'Explicit' when the predicted probability is greater than or equal to
        `threshold`, otherwise 'Not explicit'.
    """
    # Preprocess the input lyric
    text = '' if lyric is None else str(lyric)
    text = re.sub('[^a-zA-Z\u0900-\u097F]+', ' ', text)
    # Raw string: the plain literal contained an invalid escape sequence.
    text = re.sub(r'\d+', ' ', text)
    text = text.lower()

    # Tokenize and pad the input lyric
    lyric_X = tokenizer.texts_to_sequences([text])
    lyric_X = pad_sequences(lyric_X, maxlen=X.shape[1])

    # Predict the explicitness of the input lyric
    prediction = model.predict(lyric_X)[0][0]
    # Use >= so a probability exactly at the threshold is labelled the same
    # way as in the test-set evaluation, which binarises with >=.
    if prediction >= threshold:
        return 'Explicit'
    return 'Not explicit'


In [None]:
# Try the classifier on a lyric typed in by the user, using the same 0.2
# decision threshold as the evaluation above
threshold = 0.2
lyric = input('Enter a Hindi music lyric to check its explicitness: ')
verdict = predict_explicitness(lyric, threshold)
print(verdict)

Enter a Hindi music lyric to check its explicitness: कौन बोला मुझसे ना हो पायेगा कौन बोला
Not explicit
