In [None]:
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

### Bert

In [11]:
import os

# Check if running on Kaggle
if not os.path.exists('datasets/data.csv'):
    # If datasets folder doesn't exist, assume running on Kaggle
    !git clone https://github.com/Aries-IITD/INNOV8-2.0 
    !mkdir datasets
    !mv INNOV8-2.0/data.csv INNOV8-2.0/test.csv datasets/
    !rm -rf INNOV8-2.0
else:
    print("Dataset already exists locally")


Cloning into 'INNOV8-2.0'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 30 (delta 10), reused 1 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (30/30), 425.39 KiB | 10.91 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [51]:
from transformers import BertTokenizer, BertModel
import torch

# Define device
device = 'cpu'

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)  # Move model to device




In [54]:
raw_train_data = pd.read_csv("datasets/data.csv")

In [55]:
train_data, test_data = train_test_split(raw_train_data, test_size=0.2, random_state=42, stratify=raw_train_data['species'])


In [61]:
def get_cls_embedding(texts):
    # Tokenize the input
    inputs = tokenizer(texts, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to device
    
    # Get BERT embeddings
    with torch.no_grad():
        outputs = bert_model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # Extract the [CLS] token embedding
        return cls_embedding  # shape is 1x768

def get_all_embeddings(texts):
    # Define batch size
    batch_size = 32  # Adjust based on GPU memory capacity

    # Initialize list to store all embeddings
    all_embeddings = []

    # Process texts in batches
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = get_cls_embedding(batch_texts)
        all_embeddings.append(batch_embeddings)

    # Concatenate all embeddings into a single tensor
    embeddings_tensor = torch.cat(all_embeddings, dim=0)
    return embeddings_tensor


In [62]:

texts = train_data['message'].tolist()
train_embed = get_all_embeddings(texts)


100%|██████████| 13/13 [00:46<00:00,  3.54s/it]


In [63]:

texts = test_data['message'].tolist()
test_embed = get_all_embeddings(texts)


100%|██████████| 4/4 [00:10<00:00,  2.72s/it]


In [64]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [65]:
train_embed = train_embed.to(device)
test_embed = test_embed.to(device)

In [66]:
df

Unnamed: 0,message,tokenized
0,This is the first sentence.,"[input_ids, token_type_ids, attention_mask]"
1,"This is the second one, which is longer.","[input_ids, token_type_ids, attention_mask]"


In [67]:
# df = train_data

In [68]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [69]:
# Define lists of feature names
numerical_feature_names = ['fingers',]
categorical_feature_names = ['tail',]
output_feature_name = 'species'

In [70]:
import torch
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Initialize encoders
one_hot_encoder = OneHotEncoder(sparse=False, drop=None)
label_encoder = LabelEncoder()

def process_dataframe(df, fit_encoders=True):
    # Extract features
    numerical_features = df[numerical_feature_names].values
    categorical_features = df[categorical_feature_names].values
    output_labels = df[output_feature_name].values

    # Convert numerical features to a tensor and move to device
    numerical_tensor = torch.tensor(numerical_features, dtype=torch.float32).to(device)

    # One-hot encode categorical features
    if fit_encoders:
        one_hot_encoded = one_hot_encoder.fit_transform(categorical_features)
    else:
        one_hot_encoded = one_hot_encoder.transform(categorical_features)
    categorical_tensor = torch.tensor(one_hot_encoded, dtype=torch.float32).to(device)

    # Ordinal encode the output feature
    if fit_encoders:
        output_encoded = label_encoder.fit_transform(output_labels)
    else:
        output_encoded = label_encoder.transform(output_labels)
    output_tensor = torch.tensor(output_encoded, dtype=torch.long).to(device)  # Use long type for class indices

    # Create mapping dictionary for output labels
    if fit_encoders:
        label_to_index = {label: idx for idx, label in enumerate(label_encoder.classes_)}
        index_to_label = {idx: label for idx, label in enumerate(label_encoder.classes_)}
    else:
        label_to_index = None
        index_to_label = None

    # Concatenate tensors (assuming embeddings_tensor is available)
    if fit_encoders:
        combined_features = torch.cat((train_embed, numerical_tensor, categorical_tensor), dim=1)
    else:
        combined_features = torch.cat((test_embed, numerical_tensor, categorical_tensor), dim=1)

    inputs = combined_features
    labels = output_tensor
    
    return inputs, labels, label_to_index, index_to_label

# Process training data
train_inputs, train_labels, train_label_to_index, train_index_to_label = process_dataframe(train_data, fit_encoders=True)

# Process test data
test_inputs, test_labels, _, _ = process_dataframe(test_data, fit_encoders=False)

# Now you have processed inputs and labels for both training and test datasets
print("Training inputs shape:", train_inputs.shape)
print("Training labels shape:", train_labels.shape)
print("Test inputs shape:", test_inputs.shape)
print("Test labels shape:", test_labels.shape)


Training inputs shape: torch.Size([400, 771])
Training labels shape: torch.Size([400])
Test inputs shape: torch.Size([100, 771])
Test labels shape: torch.Size([100])




In [71]:
# train_index_to_label

In [72]:
# # Concatenate tensors
# combined_features 


In [73]:
# combined_features

In [74]:
# inputs.shape

In [75]:

# num_classes = len(label_to_index)

In [76]:
# num_classes

In [77]:
# Raise ZeroDivisionError

In [78]:
import torch.nn as nn

# Define a simple MLP for classification
class MLPClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Define the model
input_size = train_inputs.shape[1]  # Adjusted to match your data
hidden_size = 256
num_classes = len(train_index_to_label)  # Number of output classes

model = MLPClassifier(input_size, hidden_size, num_classes).to(device)


In [82]:
def accuracy_fn(y_true, y_pred):
    """Calculates accuracy between truth labels and predictions."""
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / len(y_true)) * 100
    return acc

loss_fn = nn.CrossEntropyLoss()

In [159]:
WRITE_LOG_FLAG = None

In [191]:
import torch as t
from tqdm.auto import tqdm
from timeit import default_timer as timer

def test_step(test_inputs, test_labels, m0, loss_fn, accuracy_fn, batch_size, device):
    m0.eval()
    test_loss, test_acc = 0, 0
    num_batches = (len(test_inputs) + batch_size - 1) // batch_size  # Calculate number of batches

    with t.inference_mode():
        for start in range(0, len(test_inputs), batch_size):
            end = min(start + batch_size, len(test_inputs))
            X_batch = test_inputs[start:end].to(device)
            y_batch = test_labels[start:end].to(device)

            test_pred = m0(X_batch)
            test_loss += loss_fn(test_pred, y_batch).item()
            test_acc += accuracy_fn(y_batch, test_pred.argmax(dim=1))

    test_loss /= num_batches
    test_acc /= num_batches
    return test_loss, test_acc

def train_step(train_inputs, train_labels, test_inputs, test_labels, m0, optimizer, loss_fn, accuracy_fn, batch_size, validate_flag,WRITE_LOG_FLAG, device, epoch):
    start_time = timer()
    train_loss, acc = 0, 0
    m0.train()
    num_batches = (len(train_inputs) + batch_size - 1) // batch_size  # Calculate number of batches

    for start in range(0, len(train_inputs), batch_size):
        end = min(start + batch_size, len(train_inputs))
        X_batch = train_inputs[start:end].to(device)
        y_batch = train_labels[start:end].to(device)

        y_pred = m0(X_batch)
        loss = loss_fn(y_pred, y_batch)
        train_loss += loss.item()
        acc += accuracy_fn(y_batch, y_pred.argmax(dim=1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss /= num_batches
    acc /= num_batches

    if validate_flag:
        test_loss, test_acc = test_step(test_inputs, test_labels, m0, loss_fn, accuracy_fn, batch_size, device)
        end_time = timer()
        content = (f"Epoch: {epoch}  | Train Loss: {train_loss:.3f}, Acc: {acc:.2f}% | Test Loss: {test_loss:.3f}, Acc: {test_acc:.2f}% | Time taken: {(end_time - start_time):.3f} sec")
        print(content)
        if WRITE_LOG_FLAG : 
            # Open the file in write mode ('w')
            if epoch == 1 : 
                filemode = 'w+'
            else:
                filemode = 'a+' 
            with open(f'model/{LOG_FILE_NAME}.txt', filemode) as file:
                file.write(content+'\n')

    else:
        test_loss, test_acc = -1, -1

    

In [161]:
# Define constants
BATCH_SIZE = 32
NUM_EPOCHS = 100
EVAL_INTERVAL = 10
LR  = 0.2
LOG_FILE_NAME = 'trainlog'
WRITE_LOG_FLAG = True

In [164]:
!mkdir model

In [162]:
import torch as t
from tqdm.auto import tqdm



def reset(device):
    """Reset the model and optimizer."""
    t.manual_seed(42)
    
    model = MLPClassifier(input_size, hidden_size, num_classes).to(device)
    optimizer = t.optim.SGD(params=model.parameters(), lr=LR)
    return model, optimizer


def train_model(model, optimizer):
    device = t.device('cuda' if t.cuda.is_available() else 'cpu')

    # Initialize model and optimizer
    model, optimizer = reset(device)

    for epoch in tqdm(range(1, NUM_EPOCHS + 1)):
        # Train the model
        if epoch == 1 or epoch == NUM_EPOCHS or epoch%EVAL_INTERVAL == 0:
            validate_flag = True
        else:
            validate_flag = False

        train_step(
            train_inputs, train_labels, 
            test_inputs, test_labels, 
            model, optimizer, loss_fn, 
            accuracy_fn, BATCH_SIZE, 
            validate_flag=validate_flag, 
            WRITE_LOG_FLAG = WRITE_LOG_FLAG,
            device=device, 
            epoch=epoch
        )
        
    return model , optimizer


In [126]:
hidden_size

256

In [192]:
# Define constants
BATCH_SIZE = 512
NUM_EPOCHS = 2000
EVAL_INTERVAL = 100
LR  = 0.02

t.manual_seed(42)

model = MLPClassifier(input_size, 128, num_classes).to(device)
optimizer = t.optim.Adam(params=model.parameters(), lr=LR)

model, optimizer = train_model(model, optimizer)

  0%|          | 0/2000 [00:00<?, ?it/s]

Epoch: 1  | Train Loss: 2.311, Acc: 11.75% | Test Loss: 2.302, Acc: 11.00% | Time taken: 0.003 sec
Epoch: 100  | Train Loss: 1.883, Acc: 50.00% | Test Loss: 1.910, Acc: 41.00% | Time taken: 0.002 sec
Epoch: 200  | Train Loss: 1.437, Acc: 65.50% | Test Loss: 1.504, Acc: 59.00% | Time taken: 0.001 sec
Epoch: 300  | Train Loss: 1.113, Acc: 74.25% | Test Loss: 1.212, Acc: 67.00% | Time taken: 0.001 sec
Epoch: 400  | Train Loss: 0.891, Acc: 78.75% | Test Loss: 1.013, Acc: 69.00% | Time taken: 0.001 sec
Epoch: 500  | Train Loss: 0.738, Acc: 80.75% | Test Loss: 0.880, Acc: 69.00% | Time taken: 0.001 sec
Epoch: 600  | Train Loss: 0.628, Acc: 83.75% | Test Loss: 0.789, Acc: 73.00% | Time taken: 0.001 sec
Epoch: 700  | Train Loss: 0.546, Acc: 86.00% | Test Loss: 0.726, Acc: 77.00% | Time taken: 0.001 sec
Epoch: 800  | Train Loss: 0.482, Acc: 88.50% | Test Loss: 0.680, Acc: 76.00% | Time taken: 0.001 sec
Epoch: 900  | Train Loss: 0.431, Acc: 89.50% | Test Loss: 0.647, Acc: 76.00% | Time taken: 0.

In [193]:
import torch

hyperparameters = {
    "BATCH_SIZE": BATCH_SIZE,
    "NUM_EPOCHS": NUM_EPOCHS,
    "EVAL_INTERVAL": EVAL_INTERVAL,
    "LR": LR
}
# Save the model's state dictionary
torch.save(model.state_dict(), 'model/model_state.pth')
torch.save(model, 'model/model.pth')

# Define the file path
file_path = 'model/hyperparameters.txt'

# Open the file in write mode ('w')
with open(file_path, 'w') as file:
    for key, value in hyperparameters.items():
        file.write(f"{key}: {value}\n")

print(f"Hyperparameters saved to {file_path}")


Hyperparameters saved to model/hyperparameters.txt


In [194]:
import zipfile
import os
from IPython.display import FileLink, display

# Path to the file or directory to zip
file_to_zip = 'model'
zip_file_name = 'model_zip.zip'

# Function to zip a directory
def zip_dir(directory, zip_file):
    with zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, directory))

# Zip the file or directory
if os.path.isdir(file_to_zip):
    zip_dir(file_to_zip, zip_file_name)
else:
    with zipfile.ZipFile(zip_file_name, 'w') as zipf:
        zipf.write(file_to_zip, os.path.basename(file_to_zip))

# Display the download link
download_link = FileLink(zip_file_name)
display(download_link)


In [89]:
Raise EofError

SyntaxError: invalid syntax (2340767571.py, line 1)