In [2]:

import pandas as pd
import numpy as np
import re
import math
import os
import subprocess

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation and tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc)

# Classifier
from sklearn.naive_bayes import GaussianNB

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import torch
import torch.nn
import importlib

[nltk_data] Downloading package stopwords to /Users/dwika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import simple_mlp
import TextDataset
importlib.reload(simple_mlp)

<module 'simple_mlp' from '/Volumes/Dwika/ISE/ISE-solution/lab1/simple_mlp.py'>

In [4]:
from torch.utils.data import random_split, DataLoader

# Step 1: Create the full dataset
full_dataset = TextDataset.TextDatasetTFIDF('datasets/pytorch.csv')

# Step 2: Define train/val split sizes
val_ratio = 0.2
val_size = int(len(full_dataset) * val_ratio)
train_size = len(full_dataset) - val_size

# Step 3: Split the dataset
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

# Step 4: Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

Preprocessed DataFrame:
                                                   text  sentiment
584  undefined symbol: cudnnSetConvolutionGroupCoun...          0
591  Unable to build PyTorch from source without Nu...          0
486  [docs] Make clear the format of torch.eig eige...          0
77   `index_select` on flat tensor faster than inte...          1
212  test_det_logdet_slogdet_batched (in test_cuda....          0


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
import helper

In [6]:
model = simple_mlp.DeeperMLP(input_dim=1000)  # TF-IDF has 1000 features
criterion = nn.CrossEntropyLoss()  # You can add weights here if needed
optimizer = optim.Adam(model.parameters(), lr=0.01)  # Lowered LR
model

DeeperMLP(
  (fc1): Linear(in_features=1000, out_features=128, bias=True)
  (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (norm3): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  (fc4): Linear(in_features=32, out_features=2, bias=True)
)

In [7]:
epochs=100* len(train_loader)
# ========== Training Loop ==========
for epoch in range(epochs):
    model.train()
    total_train_loss = 0

    for batch_X, batch_y in train_loader:
        batch_X=batch_X.to('cpu')
        batch_y=batch_y.to('cpu')
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    print(f"Epoch {epoch} until {epochs}")

Epoch 0 until 1900
Epoch 1 until 1900
Epoch 2 until 1900
Epoch 3 until 1900
Epoch 4 until 1900
Epoch 5 until 1900
Epoch 6 until 1900
Epoch 7 until 1900
Epoch 8 until 1900
Epoch 9 until 1900
Epoch 10 until 1900
Epoch 11 until 1900
Epoch 12 until 1900
Epoch 13 until 1900
Epoch 14 until 1900
Epoch 15 until 1900
Epoch 16 until 1900
Epoch 17 until 1900
Epoch 18 until 1900
Epoch 19 until 1900
Epoch 20 until 1900
Epoch 21 until 1900
Epoch 22 until 1900
Epoch 23 until 1900
Epoch 24 until 1900
Epoch 25 until 1900
Epoch 26 until 1900
Epoch 27 until 1900
Epoch 28 until 1900
Epoch 29 until 1900
Epoch 30 until 1900
Epoch 31 until 1900
Epoch 32 until 1900
Epoch 33 until 1900
Epoch 34 until 1900
Epoch 35 until 1900
Epoch 36 until 1900
Epoch 37 until 1900
Epoch 38 until 1900
Epoch 39 until 1900
Epoch 40 until 1900
Epoch 41 until 1900
Epoch 42 until 1900
Epoch 43 until 1900
Epoch 44 until 1900
Epoch 45 until 1900
Epoch 46 until 1900
Epoch 47 until 1900
Epoch 48 until 1900
Epoch 49 until 1900
Epoch 50 u

In [8]:
import helper
helper.evaluate_model(model,val_loader,"pytorch","pytorch",0)

{'iteration': 0,
 'name': 'pytorch->pytorch',
 'accuracy': 0.8733333333333333,
 'precision': 0.45,
 'recall': 0.5294117647058824,
 'f1': 0.4864864864864865,
 'auc': 0.7996461742591773}

# Prediction on Different Data Set


In [9]:
inference_dataset=TextDataset.TextDatasetTFIDF("datasets/caffe.csv")
inference_loader = DataLoader(inference_dataset, batch_size=32, shuffle=False)

Preprocessed DataFrame:
                                                   text  sentiment
9    unable to reproduce accuracy of bvlc-alexnet. ...          1
267  osx: abs not defined absval_layer. When compil...          0
143  cafe_intsall.caffe 36 error. I am trying caffe...          0
212   undefined reference to `lzma_index_end@XZ_5.0...          0
227  Dimension mismatch training with my own model ...          0


In [10]:
model.eval()
all_preds = []
all_probs = []
all_targets = []

with torch.no_grad():
    for batch_X, batch_y in inference_loader:
        outputs = model(batch_X)
        probs = torch.softmax(outputs, dim=1)
        preds = torch.argmax(probs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs[:, 1].cpu().numpy())
        all_targets.extend(batch_y.cpu().numpy())


In [46]:
accuracy = accuracy_score(all_targets, all_preds)
precision = precision_score(all_targets, all_preds, average='binary')
recall = recall_score(all_targets, all_preds, average='binary')
f1 = f1_score(all_targets, all_preds, average='binary')
try:
    auc = roc_auc_score(all_targets, all_probs)
except ValueError:
    auc = None

print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1 Score  : {f1:.4f}")
print(f"AUC       : {auc:.4f}" if auc is not None else "AUC: Not computable")

Accuracy  : 0.8531
Precision : 0.0909
Recall    : 0.0303
F1 Score  : 0.0455
AUC       : 0.4260


# Trying Tent on Title Body Caffe

In [12]:
import tent
import TextDataset
import importlib
# import batch_data_loader
importlib.reload(TextDataset)  
importlib.reload(tent)
import copy

In [45]:
model_state = copy.deepcopy(model.state_dict())
model=tent.configureTent(model)
result=tent.Tent(model,inference_loader,"pytorch","caffe",1)
print(result)

Adapting with Tent: 100%|██████████| 9/9 [00:00<00:00, 172.39it/s]


📊 Test-Time Adaptation (Tent) Metrics:
  Accuracy : 84.97%
  F1 Score : 0.0444
  Precision: 0.0833
  Recall   : 0.0303
  ROC AUC  : 0.4174
{'iteration': 1, 'name': 'pytorch->caffe', 'accuracy': 0.8496503496503497, 'precision': 0.08333333333333333, 'recall': 0.030303030303030304, 'f1': 0.044444444444444446, 'auc': 0.41741525931249246}



