In [None]:
from google.colab import files

# Prompt for the kaggle.json API token through Colab's upload widget.
# files.upload() returns a {filename: file bytes} dict. Binding it to a name
# stops the cell from echoing the token's contents into the saved notebook
# output, which a bare trailing expression would do.
uploaded = files.upload()
# Show only the names of the uploaded files, never their contents.
print(f"Uploaded: {sorted(uploaded)}")

In [None]:
# Copy the uploaded API token into ~/.kaggle/ (created if missing).
# NOTE(review): assumes kaggle.json sits in the current working directory — confirm.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Fetch the competition archive with the Kaggle CLI.
# NOTE(review): --force presumably re-downloads even when a copy already exists — confirm.
# NOTE(review): no output path is given here, while the unzip cell further down reads the
# archive from Google Drive — confirm the zip actually ends up at that Drive path.
!kaggle competitions download -c copy-of-artificial-neural-networks-competition --force

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
drive.mount('/content/drive')


In [None]:
# Extract the competition archive stored on Drive into /content/dataset (-d sets the target directory).
!unzip '/content/drive/My Drive/Colab Notebooks/copy-of-artificial-neural-networks-competition.zip' -d '/content/dataset'

In [None]:
import pandas as pd

# Path of the training CSV inside the extracted archive; adjust if the archive layout differs.
file_path = '/content/dataset/train_mpst.csv'  # training split
train_df = pd.read_csv(file_path)
# Trailing expression: renders the first rows as the cell output for a quick sanity check.
train_df.head()


In [None]:
# Step 2: Data Cleaning and Preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer

# Download the NLTK corpora behind the stopwords / WordNetLemmatizer imports above
# (you may skip this if they are already present).
# NOTE(review): nothing in the visible cells uses these corpora — stop-word removal
# below is handled by TfidfVectorizer(stop_words='english') — confirm they are still needed.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Function to clean text data
def clean_text(text):
    """Normalize a raw plot synopsis into lowercase, letters-only text.

    Args:
        text: Raw synopsis. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell, or ``None``) is treated
            as an empty document.

    Returns:
        str: The text with HTML tags, digits and punctuation replaced by
        spaces, lowercased, and every whitespace run collapsed to a single
        space. Leading/trailing single spaces are kept. Returns ``''`` for
        non-string input.
    """
    # A missing synopsis arrives as NaN/None through Series.apply; re.sub
    # would raise TypeError on it, so map it to an empty document instead.
    if not isinstance(text, str):
        return ''
    # Replace HTML tags with a space (not ''), so that words separated only
    # by a tag such as <br> are not fused into one token.
    text = re.sub(r'<[^>]*>', ' ', text)
    # Replace everything that is not an ASCII letter (punctuation, digits, ...) with a space
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    return text

# Clean every synopsis with clean_text and keep the result in a new 'cleaned_plot' column
train_df['cleaned_plot'] = train_df['plot_synopsis'].apply(clean_text)

# Step 3: Text Vectorization (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer limited to 10,000 features, using scikit-learn's built-in English stop-word list
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')

# Learn the vocabulary / IDF weights from the cleaned training text and vectorize it in one pass
X_tfidf = tfidf_vectorizer.fit_transform(train_df['cleaned_plot'])

# Multi-label targets: turn the per-label indicator columns into one list of
# label names per row, then binarize those lists with MultiLabelBinarizer.

# NOTE(review): assumes the label columns sit at positions 4..74 and hold 0/1 flags — confirm.
# The slice is taken after 'cleaned_plot' was added above; verify it does not reach that column.
label_columns = train_df.columns[4:75]  # Adjust indices as necessary
# For each row, collect the names of the columns whose value equals 1
labels_list = train_df[label_columns].apply(lambda row: row.index[row == 1].tolist(), axis=1)
# NOTE(review): `labels` is not used again in this cell.
labels = train_df[label_columns]
mlb = MultiLabelBinarizer()
# NOTE(review): the column order of `y` follows mlb.classes_, which may differ from the order
# of label_columns — confirm before mapping predictions back to label names by position.
y = mlb.fit_transform(labels_list)

print(y.shape)  # (rows, labels) — the row count should equal that of X_tfidf
print( X_tfidf.shape)  # (rows, TF-IDF features)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_tfidf,
    y,
    random_state=42,
    test_size=0.2,
)

# Shapes of the training features and of the validation labels, one per line.
print(X_train.shape, y_val.shape, sep='\n')



In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from scipy.sparse import csr_matrix

def _densify(matrix):
    """Return a dense array for CSR input; hand any other object back unchanged."""
    return matrix.toarray() if isinstance(matrix, csr_matrix) else matrix


# The fully connected layers used below are fed dense tensors, so the sparse
# TF-IDF matrices are expanded to dense NumPy arrays first.
X_train_dense, X_val_dense = _densify(X_train), _densify(X_val)

# Features and targets as float32 tensors (train features, train labels,
# validation features, validation labels — converted in that order).
X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_train_dense, y_train, X_val_dense, y_val)
)

# Pair each feature row with its label row.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

# Mini-batch iterators: only the training data is shuffled.
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Shapes of the training and validation feature tensors, one per line.
print(X_train_tensor.shape, X_val_tensor.shape, sep='\n')


In [None]:
from torch import nn

class MultiLabelNN(nn.Module):
    """Feed-forward multi-label classifier with two hidden layers.

    Architecture: Linear(num_features, 512) -> ReLU -> Dropout(0.5) ->
    Linear(512, 256) -> ReLU -> Dropout(0.5) -> Linear(256, num_labels) ->
    Sigmoid. The same ReLU and Dropout modules are reused after both hidden
    layers.

    Args:
        num_features: Width of each input row.
        num_labels: Number of independent outputs, one per label.
    """

    def __init__(self, num_features, num_labels):
        super().__init__()
        self.layer1 = nn.Linear(in_features=num_features, out_features=512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.layer2 = nn.Linear(in_features=512, out_features=256)
        self.output_layer = nn.Linear(in_features=256, out_features=num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature rows to per-label values in (0, 1)."""
        hidden = self.dropout(self.relu(self.layer1(x)))
        hidden = self.dropout(self.relu(self.layer2(hidden)))
        logits = self.output_layer(hidden)
        # Element-wise sigmoid: every label gets its own independent score.
        return self.sigmoid(logits)

# Build the network with its input width taken from the column count of X_train
# and its output width from the column count of y_train.
num_features, num_labels = X_train.shape[1], y_train.shape[1]
model = MultiLabelNN(num_features=num_features, num_labels=num_labels)


In [None]:
# Adam over all model parameters with a 1e-3 learning rate. BCELoss is applied to the
# sigmoid outputs the model already produces, together with the float label targets.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # training mode, so the dropout layers are active
    train_loss = 0.0  # running sum of the per-batch losses of this epoch
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()  # clear the gradients left by the previous batch
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Report the mean batch loss of the epoch
    print(f'Epoch {epoch+1}, Loss: {train_loss / len(train_loader)}')


In [None]:
import time

# NOTE: this cell keeps training the existing `model` object (it is not
# re-instantiated here), so these epochs come on top of any earlier training.
# Only the optimizer is new, which means Adam's running statistics start fresh.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

num_epochs = 20

# Start time of the training
start_time = time.time()

for epoch in range(num_epochs):
    model.train()  # training mode, so the dropout layers are active
    train_loss = 0.0  # running sum of the per-batch losses of this epoch

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    time_elapsed = time.time() - start_time
    # Estimate the remaining time from the mean duration of all epochs finished
    # so far, multiplied by the number of epochs still to run. Extrapolating from
    # the last epoch alone is noisy and can produce a negative value, or a
    # non-zero one after the final epoch; this form is never negative and is
    # exactly 0 once the last epoch is done.
    epochs_done = epoch + 1
    time_remaining = time_elapsed / epochs_done * (num_epochs - epochs_done)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss / len(train_loader)}, Time elapsed: {time_elapsed:.2f}s, Estimated time remaining: {time_remaining:.2f}s')


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, hamming_loss
import numpy as np

# Switch to inference mode so the dropout layers are disabled while scoring.
model.eval()

# Per-batch results; concatenated once the loop has finished.
all_preds, all_true_labels = [], []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        outputs = model(X_batch)
        # Binarize the model outputs at a 0.5 threshold.
        predicted = (outputs > 0.5).int()
        all_preds.append(predicted)
        all_true_labels.append(y_batch.int())

# Stack the batches along the row axis and hand NumPy arrays to scikit-learn.
all_preds = torch.cat(all_preds).cpu().numpy()
all_true_labels = torch.cat(all_true_labels).cpu().numpy()

# Micro-averaged scores plus the Hamming loss.
precision, recall, f1 = (
    metric(all_true_labels, all_preds, average='micro')
    for metric in (precision_score, recall_score, f1_score)
)
hammingloss = hamming_loss(all_true_labels, all_preds)

print(
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1 Score: {f1:.4f}',
    f'Hamming Loss: {hammingloss:.4f}',
    sep='\n',
)


In [None]:
# Relies on the `clean_text` function and the fitted `tfidf_vectorizer` from the preprocessing cell.

test_file_path = '/content/dataset/test.csv'

test_df = pd.read_csv(test_file_path)
# Apply the same cleaning that was applied to the training synopses
test_df['processed_plot'] = test_df['plot_synopsis'].apply(clean_text)

# Vectorize the processed text using the same TF-IDF vectorizer you used for training
# IMPORTANT: Use transform() NOT fit_transform(), as you want to use the same vocabulary as your training set
X_test_tfidf = tfidf_vectorizer.transform(test_df['processed_plot'])


In [None]:
# Densify the TF-IDF test matrix and wrap it in an unshuffled, label-free DataLoader.
X_test_tensor = torch.tensor(X_test_tfidf.toarray(), dtype=torch.float32)  # dense float32 tensor
test_dataset = TensorDataset(X_test_tensor)  # features only, so each item is a 1-tuple
test_loader = DataLoader(test_dataset, batch_size=64)  # no shuffle argument is passed


In [None]:
model.eval()  # Set the model to evaluation mode
test_preds = []  # one int tensor of 0/1 predictions per batch

with torch.no_grad():
    # The trailing comma unpacks the single-element batch (the dataset holds features only)
    for X_batch, in test_loader:
        outputs = model(X_batch)
        predicted = (outputs > 0.5).int()  # Apply threshold to get binary predictions
        print(predicted.shape)  # Debug output: shape of the predictions for each batch
        test_preds.append(predicted)


In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the submission CSV from the per-batch binary predictions in `test_preds`.

# Concatenate all batch predictions into one (test rows, trained labels) tensor
test_preds_concatenated = torch.cat(tuple(test_preds), dim=0)

print(test_preds_concatenated.shape)

# Reload the test set to read the identifier column
test_df = pd.read_csv(test_file_path)

# If the test file uses a different column name for IDs, adjust 'imdb_id' accordingly
ids = test_df['imdb_id'].values

# The sample submission defines the expected header: ID column first, then one column per label
sample_submission_path = '/content/dataset/sample_submission.csv'  # Adjust this path if necessary
sample_submission_df = pd.read_csv(sample_submission_path)
id_column = sample_submission_df.columns[0]
label_columns = sample_submission_df.columns[1:]  # Exclude the ID column

# The model's output columns follow mlb.classes_ (the label order produced by the
# MultiLabelBinarizer fitted on the training labels), which is not necessarily the
# order of the sample submission and omits any label never seen in training.
# Labelling the predictions positionally with the sample header could therefore
# assign predictions to the wrong labels, so align them by label name instead.
predicted_labels = list(mlb.classes_)
# .cpu() is a no-op for CPU tensors and keeps .numpy() valid if inference ran on another device
test_preds_array = test_preds_concatenated.cpu().numpy()

if set(predicted_labels) <= set(label_columns):
    # Reorder to the sample's column order; labels the model never learned are filled with 0
    predictions_df = (
        pd.DataFrame(test_preds_array, columns=predicted_labels)
        .reindex(columns=label_columns, fill_value=0)
    )
else:
    # The trained label names do not all appear in the sample header, so alignment by
    # name is impossible; fall back to the positional mapping and say so.
    print('Warning: trained label names do not match the sample submission columns; '
          'assigning prediction columns by position.')
    predictions_df = pd.DataFrame(test_preds_array, columns=label_columns)

# Insert the ID column at the beginning, named exactly as in the sample submission
predictions_df.insert(0, id_column, ids)

# Optional, depending on the submission platform: convert the predictions to float
# predictions_df = predictions_df.astype(float)

submission_file_path = '/content/dataset/final_submission.csv'
# Save the DataFrame to a CSV file
predictions_df.to_csv(submission_file_path, index=False)

# Output the path to the saved submission file
submission_file_path
