In [1]:
import pandas as pd

# Load the labelled training set from the notebook's working directory.
train_df = pd.read_csv('train.csv')

# Plain-text preview of the first rows.
print(train_df.head())

# Parallel arrays: raw comment text and the class label of each row.
comments, labels = (train_df[column].values for column in ('comment', 'label'))


                                             comment       label
0  last proof me  kaha gaya little confused about it       doubt
1                       sir g ki value positive hogi       doubt
2                             like the video guyssss  irrelevant
3                        ummm sir i want see ur wife  irrelevant
4  physics wallah punabi mundamundi mein thusde d...  irrelevant


In [2]:
# Stopwords that preprocess_text strips from comments. Several entries are
# repeated; the list is kept exactly as authored so anything else reading it
# sees the same contents.
hinglish_stopwords = [
    'aap', 'ab', 'hai', 'haina', 'hoga', 'hoti', 'hoon', 'hu', 'ka', 'kya',
    'kar', 'kare', 'karo', 'to', 'tum', 'main', 'ke', 'se', 'par', 'mein',
    'bhi', 'ya', 'aur', 'yaar', 'kahi', 'sab', 'log', 'ye', 'wo', 'waise',
    'jab', 'waqt', 'agar', 'khud', 'nahi', 'karta', 'karti', 'karna',
    'karega', 'se', 'kar', 'jo', 'thoda', 'zyada', 'kisi', 'toh',
    'kaise', 'aise', 'na', 'toh', 'abhi', 'poora', 'waise', 'ek', 'sabse',
    'jaisa', 'kya', 'hoga', 'hota', 'ho', 'lekin', 'phir', 'kyunki',
    'chalo', 'suno', 'dekhna', 'dekh', 'bata', 'samajh', 'karega', 'kya',
    'hai', 'thik', 'sab', 'hoti', 'hota', 'hote', 'bhi', 'sabhi',
    'koi', 'sari', 'naya', 'pura', 'bada', 'chota', 'bahut', 'acha',
    'sahi', 'suno', 'kuch', 'aate', 'jata', 'jaate', 'hona', 'ho'
]

import re

# Set snapshot of the list above: constant-time membership tests instead of a
# linear scan of the (duplicate-laden) list for every word of every comment.
# NOTE: taken once at definition time; re-run this cell if the list is edited.
_HINGLISH_STOPWORD_SET = frozenset(hinglish_stopwords)

# Any character that is neither a word character nor whitespace.
_NON_WORD_CHARS = re.compile(r'[^\w\s]')


def preprocess_text(text):
    """Normalise one comment for vectorisation.

    The text is lower-cased, every character matching ``[^\\w\\s]`` is deleted
    (not replaced by a space, so ``"a,b"`` becomes ``"ab"``), and words listed
    in ``hinglish_stopwords`` are dropped.

    Args:
        text: The raw comment. ``None`` and float NaN (how pandas represents an
            empty CSV field) are treated as an empty comment; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The remaining words joined by single spaces; ``''`` when nothing is left.
    """
    # `text != text` is true only for NaN; guard so a missing value does not
    # raise AttributeError on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = _NON_WORD_CHARS.sub('', text)
    return ' '.join(word for word in text.split() if word not in _HINGLISH_STOPWORD_SET)


# Run every raw training comment through preprocess_text, preserving order.
cleaned_comments = list(map(preprocess_text, comments))


In [3]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
# Text-to-token-id layer: vocabulary capped at 20000 entries, every output
# sequence fixed at length 500.
vectorizer = TextVectorization(
    max_tokens=20000,
    output_sequence_length=500,
)

# Build the vocabulary from the cleaned training comments, then encode them.
vectorizer.adapt(cleaned_comments)
vectorized_comments = vectorizer(cleaned_comments)


In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Learn the label -> integer mapping, then encode every training label with it.
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)

# One-hot rows, the target format for a categorical cross-entropy loss.
categorical_labels = to_categorical(encoded_labels)


In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
# scikit-learn splits NumPy arrays, so materialise the vectorised tensor first.
vectorized_comments_np = vectorized_comments.numpy()

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    vectorized_comments_np,
    categorical_labels,
    test_size=0.2,
    random_state=42,
)

print(f"Training Data Shape: {X_train.shape}")
print(f"Validation Data Shape: {X_val.shape}")



Training Data Shape: (170997, 500)
Validation Data Shape: (42750, 500)


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras import mixed_precision
from sklearn.metrics import f1_score

# Enable mixed precision globally (float16 compute, float32 variables per the
# TensorFlow mixed-precision guide).
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Defined before training so fit, evaluate and predict all share one value.
batch_size = 256

# Embedding -> bidirectional LSTM -> dropout -> 3-way softmax classifier.
model = Sequential()
# NOTE(review): `input_length` is reported as deprecated by newer Keras
# releases; confirm against the installed version before removing it.
model.add(Embedding(input_dim=20000 + 1, output_dim=64, input_length=500))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
# Under mixed_float16 the output layer is pinned to float32 so the softmax
# probabilities and the loss are not computed in half precision.
model.add(Dense(3, activation='softmax', dtype='float32'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=2,
                    batch_size=batch_size)

# Unshuffled batches: predictions come back in the same row order as y_val.
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

val_loss, val_accuracy = model.evaluate(val_dataset)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Class index with the highest predicted probability for each validation row.
y_pred_probs = model.predict(val_dataset)
y_pred = np.argmax(y_pred_probs, axis=1)

# Recover integer class ids from the one-hot validation targets.
y_true = np.argmax(y_val, axis=1)

# Macro average: unweighted mean of the per-class F1 scores.
f1 = f1_score(y_true, y_pred, average='macro')
print(f"F1 Score: {f1:.4f}")

Epoch 1/2
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 54ms/step - accuracy: 0.5945 - loss: 0.8602 - val_accuracy: 0.6907 - val_loss: 0.7024
Epoch 2/2
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 53ms/step - accuracy: 0.6969 - loss: 0.7007 - val_accuracy: 0.6934 - val_loss: 0.6951
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.6944 - loss: 0.6939
Validation Loss: 0.6951, Validation Accuracy: 0.6934
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step
F1 Score: 0.6887


In [14]:

# Read the test set and pull out its raw comment strings.
test_df = pd.read_csv('test.csv')
test_comments = test_df['comment'].values

# Same cleaning function and fitted vectorizer that were used on the training comments.
cleaned_test_comments = list(map(preprocess_text, test_comments))
vectorized_test_comments = vectorizer(cleaned_test_comments)
vectorized_test_comments_np = vectorized_test_comments.numpy()

# Highest-probability class index per comment, mapped back to its original label.
predictions_probs = model.predict(vectorized_test_comments_np)
predictions = np.argmax(predictions_probs, axis=1)
label = label_encoder.inverse_transform(predictions)

# One row per test id with its predicted label, written without the index column.
results_df = pd.DataFrame({'id': test_df['id'], 'label': label})
results_df.to_csv('predictions.csv', index=False)

print("Predictions saved to predictions.csv")



[1m2863/2863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 15ms/step
Predictions saved to predictions.csv
