<a href="https://colab.research.google.com/github/Debadrita96/DL_Projects_IITK/blob/main/DL!.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer


In [None]:
questions = pd.read_csv(r'/content/Questions.csv', encoding='ISO-8859-1', on_bad_lines='skip', engine='python')
#answers = pd.read_csv(r'/content/Answers.csv', encoding='ISO-8859-1')
tags = pd.read_csv(r'/content/Tags.csv', encoding='ISO-8859-1')

In [None]:
top_tags = tags['Tag'].value_counts().head(10).index.tolist()
tags_filtered = tags[tags['Tag'].isin(top_tags)]

In [None]:
tags_grouped = tags_filtered.groupby('Id')['Tag'].apply(list).reset_index()

In [None]:
data = pd.merge(questions, tags_grouped, on='Id')
data = data.dropna(subset=['Body'])  # Remove missing Body

In [None]:
data['text'] = data['Title'].fillna('') + ' ' + data['Body']


In [None]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['Tag'])

In [None]:
max_words = 20000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(data['text'])
X_seq = tokenizer.texts_to_sequences(data['text'])

In [None]:
max_len = 300
X_pad = pad_sequences(X_seq, maxlen=max_len, padding='post', truncating='post')

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_pad, y, test_size=0.2, random_state=42)


In [None]:
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(len(top_tags), activation='sigmoid')  # multilabel output
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()



In [None]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5,  # You can increase for better performance
    batch_size=128
)

Epoch 1/5
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 29ms/step - accuracy: 0.1643 - loss: 0.3634 - val_accuracy: 0.2532 - val_loss: 0.3202
Epoch 2/5
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 28ms/step - accuracy: 0.2162 - loss: 0.3364 - val_accuracy: 0.5806 - val_loss: 0.1983
Epoch 3/5
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 28ms/step - accuracy: 0.6451 - loss: 0.1737 - val_accuracy: 0.7870 - val_loss: 0.1083
Epoch 4/5
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 31ms/step - accuracy: 0.8120 - loss: 0.1021 - val_accuracy: 0.8286 - val_loss: 0.0901
Epoch 5/5
[1m 980/1018[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 26ms/step - accuracy: 0.8519 - loss: 0.0814

In [15]:
# Predict probabilities
y_pred = model.predict(X_val)

# compute ROC-AUC for multilabel 
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_val, y_pred, average='macro')
print(f"Validation ROC-AUC: {roc_auc:.4f}")

[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step
Validation ROC-AUC: 0.9797
