- A collate function is used by the DataLoader to combine individual samples into a batch dynamically. This is important when the inputs are variable in length, e.g. raw text sequences, which must be brought to a common length before they can be stacked.

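- Here, since we are using a TF-IDF vectorizer with max_features=5000, every document is mapped to a vector of the same fixed length (at most 5000 features). This acts as a substitute for a custom collate function: the DataLoader's default collation can stack the samples directly.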

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from joblib import load
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn.utils import clip_grad_value_
from torch.optim import AdamW
from sklearn.metrics import hamming_loss


In [None]:
# Make Google Drive visible to the Colab runtime so the dataset stored there can be read.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# NOTE: joblib's load is pickle-based, so it should only be pointed at trusted files.
DATASET_PATH = '/content/drive/MyDrive/NLP/df_multilabel_cleaned.joblib'
df = load(DATASET_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Turn each whitespace-separated tag string into a list of tags.
# Values that are not strings (e.g. lists produced by a previous run of this cell)
# are passed through untouched, so re-running the cell no longer fails with
# "'list' object has no attribute 'split'".
df['Tags'] = df['Tags'].apply(lambda tags: tags.split() if isinstance(tags, str) else tags)
df

Unnamed: 0,cleaned_text,Tags,Tag_Number
0,asp query stre dropdown webpage follow control...,"[c#, asp.net]","[0, 9]"
1,run javascript code server java code want run ...,"[java, javascript]","[1, 3]"
2,linq sql throw exception row find change hi li...,"[c#, asp.net]","[0, 9]"
3,run python script php server run nginx web ser...,"[php, python]","[2, 7]"
4,advice write function m try write function res...,"[javascript, jquery]","[3, 5]"
...,...,...,...
47422,take value edittext put decimal point all- wor...,"[java, android]","[1, 4]"
47423,listen phone state application nee liste phone...,"[java, android]","[1, 4]"
47424,android ui thread thread task want access main...,"[java, android]","[1, 4]"
47425,dynamic table row creation html javascript htm...,"[asp.net, javascript]","[9, 3]"


In [None]:
# Splitting Dataset
# Stage 1 holds out 20% of all rows. Stage 2 divides that held-out part again:
# 80% of it becomes the validation set and the remaining 20% the test set.
# NOTE(review): this leaves the test set at roughly 4% of all rows - confirm that a
# second-stage test_size of 0.2 (rather than e.g. 0.5) is what was intended.
SPLIT_SEED = 42
train_df, temp_df = train_test_split(df, random_state=SPLIT_SEED, test_size=0.2)
valid_df, test_df = train_test_split(temp_df, random_state=SPLIT_SEED, test_size=0.2)

In [None]:
# Learn the tag vocabulary from the training split only, then encode the
# remaining splits with that same fitted binarizer.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_df['Tags'])
y_test, y_valid = (mlb.transform(split['Tags']) for split in (test_df, valid_df))

# Print the learned classes to verify that each one is a whole tag
print(mlb.classes_)

['android' 'asp.net' 'c#' 'c++' 'iphone' 'java' 'javascript' 'jquery'
 'php' 'python']


In [None]:
# Build the TF-IDF vectorizer, capping the vocabulary at 5000 features
vectorizer = TfidfVectorizer(max_features=5000)
# The vectorizer is fitted on the training texts only; the test and validation
# texts are merely transformed with it. Every result is densified via toarray().
X_train = vectorizer.fit_transform(train_df['cleaned_text']).toarray()
X_test, X_valid = (
    vectorizer.transform(split['cleaned_text']).toarray()
    for split in (test_df, valid_df)
)

In [None]:
class TextDataset(Dataset):
    """Dataset that pairs each feature row with the label row at the same index.

    Both inputs are converted to float32 tensors once, at construction time.
    """

    def __init__(self, X, Y):
        # Convert up front so that indexing later is a plain tensor lookup.
        self.X, self.Y = (torch.tensor(data, dtype=torch.float32) for data in (X, Y))

    def __len__(self):
        """Return the length of the feature tensor."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, labels)`` pair stored at ``idx``."""
        features = self.X[idx]
        labels = self.Y[idx]
        return features, labels

# Create dataset instances (one per split, in train / valid / test order)
train_dataset, valid_dataset, test_dataset = (
    TextDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)


In [None]:
class TextClassifier(nn.Module):
    """Feed-forward network with two hidden layers that emits one logit per label.

    Each hidden stage is Linear -> ReLU -> Dropout -> BatchNorm1d. The output
    layer returns raw logits; no sigmoid is applied inside the model.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of output logits.
        hidden_dims: Widths of the two hidden layers. Defaults to ``(200, 100)``.
        dropout: Dropout probability used after each hidden activation.
            Defaults to ``0.5``.

    Raises:
        ValueError: If ``hidden_dims`` does not contain exactly two sizes.
    """

    def __init__(self, input_dim, output_dim, hidden_dims=(200, 100), dropout=0.5):
        super().__init__()
        if len(hidden_dims) != 2:
            raise ValueError(
                f"hidden_dims must contain exactly two sizes, got {len(hidden_dims)}"
            )
        first_hidden, second_hidden = hidden_dims

        # Attribute names and creation order are kept stable so that existing
        # state dicts still load and seeded initialisation is unchanged.
        self.fc1 = nn.Linear(input_dim, first_hidden)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, output_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.batchnorm1 = nn.BatchNorm1d(first_hidden)
        self.batchnorm2 = nn.BatchNorm1d(second_hidden)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` float tensor to ``(batch, output_dim)`` logits."""
        # First hidden stage
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = self.batchnorm1(x)
        # Second hidden stage
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.batchnorm2(x)
        # Output layer: raw logits
        return self.fc3(x)


# Hyperparameters
# The input width is read from the data instead of being hard-coded to 5000:
# max_features is only an upper bound on the TF-IDF vocabulary, so a smaller
# vocabulary would otherwise cause a shape mismatch in the first linear layer.
input_dim = X_train.shape[1]
output_dim = len(mlb.classes_)  # Number of unique tags
epochs = 5
batch_size = 128
learning_rate = 0.001
weight_decay = 0.000

# Seed PyTorch's global RNG so that weight initialisation, dropout masks and
# batch shuffling are repeatable from one run of the notebook to the next.
torch.manual_seed(42)

# Initialize model, optimizer, and loss function
model = TextClassifier(input_dim, output_dim)
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# BCEWithLogitsLoss takes raw logits, which is what the model outputs
loss_fn = nn.BCEWithLogitsLoss()

# Only the training data is shuffled; validation batches keep their order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)

# Train for `epochs` passes over the training data, validating after each pass.
# NOTE: train_loss, val_loss and ham_loss stay defined at module level once the
# loop finishes, holding the values from the last epoch.
for epoch in range(epochs):
    # Training mode: dropout is active and batch norm uses per-batch statistics
    model.train()
    train_loss = 0.0  # running sum of per-batch losses for this epoch

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        # Clamp every gradient element to [-10, 10] before the optimizer step
        clip_grad_value_(model.parameters(), clip_value=10.0)
        optimizer.step()
        train_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss = 0.0  # running sum of per-batch validation losses
    all_targets = []  # one array of true labels per validation batch
    all_outputs = []  # one boolean prediction array per validation batch

    # No gradients are needed while evaluating
    with torch.no_grad():
        for inputs, targets in valid_loader:
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
            val_loss += loss.item()
            all_targets.append(targets.cpu().numpy())
            all_outputs.append(torch.sigmoid(outputs).cpu().numpy() > 0.5) # Applying threshold to get binary outputs

    # Calculate Hamming Loss
    # Stack the per-batch arrays into single matrices covering the whole validation set
    all_targets_np = np.vstack(all_targets)
    all_outputs_np = np.vstack(all_outputs)
    ham_loss = hamming_loss(all_targets_np, all_outputs_np)

    # The loss sums are divided by the number of batches, i.e. the mean per-batch loss is reported
    print(f"Epoch {epoch+1}, Train Loss: {train_loss/len(train_loader)}, Val Loss: {val_loss/len(valid_loader)}, Hamming Loss: {ham_loss}")


Epoch 1, Train Loss: 0.3010796257843473, Val Loss: 0.12830333386858303, Hamming Loss: 0.04653400105429626
Epoch 2, Train Loss: 0.12674568575941753, Val Loss: 0.10840483158826827, Hamming Loss: 0.04074855034264628
Epoch 3, Train Loss: 0.10438300129841474, Val Loss: 0.10258422897507748, Hamming Loss: 0.03812598840274117
Epoch 4, Train Loss: 0.09044241979216486, Val Loss: 0.09800245376924674, Hamming Loss: 0.03612282551396943
Epoch 5, Train Loss: 0.08047164015798056, Val Loss: 0.09636750879387061, Hamming Loss: 0.03472588297311544


In [None]:
def compute_hamming_loss(y_true, y_pred):
    """Return ``hamming_loss(y_true, y_pred)`` for the given true and predicted labels."""
    score = hamming_loss(y_true, y_pred)
    return score

# Training and validation loop
# NOTE: `model` and `optimizer` are not re-created here, so this loop continues
# training from whatever state they are already in.
for epoch in range(epochs):
    model.train()
    train_losses = []  # per-batch training losses for this epoch
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        # Clamp every gradient element to [-10, 10] before the optimizer step
        clip_grad_value_(model.parameters(), clip_value=10.0)
        optimizer.step()
        train_losses.append(loss.item())

    # Validation phase
    model.eval()
    val_losses = []  # per-batch validation losses for this epoch
    y_true, y_pred = [], []
    with torch.no_grad():
        for inputs, targets in valid_loader:
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
            val_losses.append(loss.item())
            y_true.append(targets.numpy())
            # Sigmoid turns logits into probabilities; 0.5 is the decision threshold
            y_pred.append(outputs.sigmoid().numpy() > 0.5)

    # Compute Hamming Loss for the validation set
    y_true = np.vstack(y_true)
    y_pred = np.vstack(y_pred)
    hammingLoss = compute_hamming_loss(y_true, y_pred)

    # Report the metrics gathered in THIS loop. The variables train_loss, val_loss
    # and ham_loss are never assigned here, so printing them repeated the same
    # stale numbers on every epoch.
    print(f"Epoch {epoch+1}, Train Loss: {np.mean(train_losses)}, Val Loss: {np.mean(val_losses)}, Hamming Loss: {hammingLoss}")


Epoch 1, Train Loss: 0.08047164015798056, Val Loss: 0.09636750879387061, Hamming Loss: 0.03472588297311544
Epoch 2, Train Loss: 0.08047164015798056, Val Loss: 0.09636750879387061, Hamming Loss: 0.03472588297311544
Epoch 3, Train Loss: 0.08047164015798056, Val Loss: 0.09636750879387061, Hamming Loss: 0.03472588297311544
Epoch 4, Train Loss: 0.08047164015798056, Val Loss: 0.09636750879387061, Hamming Loss: 0.03472588297311544
Epoch 5, Train Loss: 0.08047164015798056, Val Loss: 0.09636750879387061, Hamming Loss: 0.03472588297311544


### Test Function

In [None]:
# Batches of the test split are served in their stored order (no shuffling)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Test phase
model.eval()
test_losses, y_true_test, y_pred_test = [], [], []
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        test_losses.append(loss.item())
        y_true_test.append(targets.numpy())
        # Probabilities above 0.5 count as a predicted label
        y_pred_test.append(torch.sigmoid(outputs).numpy() > 0.5)

# Compute Hamming Loss for the test set
y_true_test, y_pred_test = np.vstack(y_true_test), np.vstack(y_pred_test)
hammingLossTest = compute_hamming_loss(y_true_test, y_pred_test)

print(f"Test Loss: {np.mean(test_losses):.4f}, Hamming Loss: {hammingLossTest:.4f}")



Test Loss: 0.0920, Hamming Loss: 0.0307


### Inference


In [None]:
def predict_tags(text, model, vectorizer, mlb, threshold=0.5):
    """Predict the labels for a single piece of text.

    The text is vectorized exactly as given; no cleaning is applied here, so
    callers should pass text prepared the same way as the data the vectorizer
    was fitted on.

    Args:
        text: The input string.
        model: Callable returning one logit per label for a float32 batch.
            It is switched to evaluation mode via ``model.eval()``.
        vectorizer: Object whose ``transform([text])`` result supports ``toarray()``.
        mlb: Object providing ``inverse_transform`` for a boolean indicator matrix.
        threshold: A label is predicted when its sigmoid probability is strictly
            greater than this value. Defaults to ``0.5``.

    Returns:
        The result of ``mlb.inverse_transform`` on the single-row prediction matrix.
    """
    # Preprocess the text into a one-row float32 batch
    text_vector = vectorizer.transform([text]).toarray()
    text_tensor = torch.tensor(text_vector, dtype=torch.float32)

    # Predict without tracking gradients
    model.eval()
    with torch.no_grad():
        probabilities = torch.sigmoid(model(text_tensor))

    # Threshold the probabilities into a boolean indicator row
    predicted_mask = probabilities.cpu().numpy() > threshold

    # Decode the predicted labels
    return mlb.inverse_transform(predicted_mask)


# Try the predictor on the first cleaned text of the test split
sample_text = test_df['cleaned_text'].iloc[0]
predicted_tags = predict_tags(sample_text, model=model, vectorizer=vectorizer, mlb=mlb)
print("Predicted Tags:", predicted_tags)


Predicted Tags: [('asp.net', 'c#')]


In [None]:
# Example usage: a hand-written sentence instead of a row from the dataset
sample_text = "java classes have inheritance"
predicted_tags = predict_tags(sample_text, model=model, vectorizer=vectorizer, mlb=mlb)
print("Predicted tags:", predicted_tags)

Predicted tags: [('java',)]


In [None]:
# Example usage: a second hand-written sentence
sample_text = "js can be used for frontend applications and works well with different API's"
predicted_tags = predict_tags(sample_text, model=model, vectorizer=vectorizer, mlb=mlb)
print("Predicted tags:", predicted_tags)

Predicted tags: [('javascript', 'jquery')]
