In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read data
train_df = pd.read_csv("/Dataset/train.csv")
test_df = pd.read_csv("/Dataset/test.csv")

# Extract features and labels.
# The model input is the title and abstract joined by a single space.
# A missing TITLE or ABSTRACT would turn the whole concatenation into NaN
# (a float), which the tokenizer cannot process, so missing text is treated
# as an empty string and every value is coerced to str.
X_train_text = (
    train_df['TITLE'].fillna('').astype(str)
    + ' '
    + train_df['ABSTRACT'].fillna('').astype(str)
)
# Every column from the fourth one onward is used as a label column.
# NOTE(review): this assumes the first three columns are non-label columns
# (presumably ID, TITLE, ABSTRACT) -- confirm against the CSV header.
y_train = train_df.iloc[:, 3:]

X_test_text = (
    test_df['TITLE'].fillna('').astype(str)
    + ' '
    + test_df['ABSTRACT'].fillna('').astype(str)
)

# Tokenize text data
# The vocabulary is fitted on the training text only; the same tokenizer is
# then applied to both splits so word indices are consistent between them.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_text)

X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_test_seq = tokenizer.texts_to_sequences(X_test_text)

# Force every sequence to exactly max_len tokens.
# pad_sequences truncates from the FRONT by default, which would discard the
# title (the first tokens of each text) for any document longer than max_len.
# truncating='post' keeps the title and the start of the abstract instead.
max_len = 128
X_train = pad_sequences(X_train_seq, maxlen=max_len, truncating='post')
X_test = pad_sequences(X_test_seq, maxlen=max_len, truncating='post')

# Build model
# Bag-of-embeddings classifier: token embeddings are averaged over the
# sequence and passed through two dense layers. The output layer has one
# independent sigmoid unit per label column (multi-label classification).
model = keras.Sequential([
    # Explicit input spec instead of the deprecated Embedding(input_length=...).
    keras.Input(shape=(max_len,)),
    # +1 because Tokenizer word indices start at 1; index 0 is the padding value.
    keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(y_train.shape[1], activation='sigmoid')
])

# The bare string 'accuracy' is resolved by Keras from the output shape; with
# more than one output unit it becomes categorical (argmax) accuracy, which is
# not meaningful for multi-label targets. BinaryAccuracy scores each label
# independently at a 0.5 threshold. It keeps the name 'accuracy' so the
# reported metric key is unchanged.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy')],
)

# Train model: 20 passes over the training set in mini-batches of 32,
# with per-epoch progress output.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    verbose=1,
)

# Evaluate model on train data
# evaluate() yields the loss followed by the single compiled metric.
# This is measured on the same data the model was fitted on, so it says
# nothing about performance on unseen data.
train_loss, train_accuracy = model.evaluate(x=X_train, y=y_train)
print("Train Accuracy:", train_accuracy)

# Predict on test data
predictions = model.predict(X_test)

# Convert probabilities to binary predictions: a label is assigned when its
# sigmoid output is strictly above 0.5.
binary_predictions = (predictions > 0.5).astype(int)

# Show the predicted label vectors for the first few test samples.
# Only predictions are printed; no actual labels are read from the test file here.
# Slicing (rather than indexing 0..4) avoids an IndexError when the test set
# has fewer than 5 rows.
print("\nPredicted (First 5 samples):")
for sample_number, predicted_row in enumerate(binary_predictions[:5], start=1):
    print("Sample", sample_number)
    print("Predicted:", predicted_row)
    print()

# Write predictions to CSV
# Despite its name, this frame holds only the predicted labels plus the ID
# column; the name is kept so any later cells that reference it keep working.
actual_vs_predicted = pd.DataFrame(binary_predictions, columns=y_train.columns)
# Insert IDs positionally (as a plain array) so the pairing with prediction
# rows does not depend on test_df's index matching the new frame's index.
actual_vs_predicted.insert(0, 'ID', test_df['ID'].to_numpy())
actual_vs_predicted.to_csv("sample_submission.csv", index=False)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train Accuracy: 0.8681575655937195

Predicted (First 5 samples):
Sample 1
Predicted: [0 0 0 1 0 0]

Sample 2
Predicted: [0 1 0 0 0 0]

Sample 3
Predicted: [1 0 0 1 0 0]

Sample 4
Predicted: [0 1 0 0 0 0]

Sample 5
Predicted: [1 0 1 0 0 0]

