In [1]:

import pandas as pd
import numpy as np
import re
import math
import os
import subprocess

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation and tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc)

# Classifier
from sklearn.naive_bayes import GaussianNB

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import torch
import torch.nn

[nltk_data] Downloading package stopwords to /Users/dwika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [300]:
from br_classification import remove_html,remove_emoji,remove_stopwords,clean_str

In [301]:
NLTK_stop_words_list = stopwords.words('english')
custom_stop_words_list = ['...']  # You can customize this list as needed
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list

In [302]:
project = 'pytorch'
path = f'datasets/{project}.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body into a single column; if Body is NaN, use Title only
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

# Keep only necessary columns: id, Number, sentiment, text (merged Title+Body)
pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})
pd_tplusb.to_csv('Title+Body.csv', index=False, columns=["id", "Number", "sentiment", "text"])

In [303]:
project = 'pytorch'
path = f'datasets/{project}.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

In [304]:
########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========

# 1) Data file to read
datafile = 'Title+Body.csv'

# 2) Number of repeated experiments
REPEAT = 10

# 3) Output CSV file name
out_csv_name = f'../{project}_NB.csv'

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)

# ========== Hyperparameter grid ==========
# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Lists to store metrics across repeated runs
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []

In [305]:
from anticf_model import AntiCFTextClassifier

In [306]:


X_train, X_test, y_train, y_test = train_test_split(data[text_col], data['sentiment'], train_size=0.8, random_state=42)
tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)


In [307]:
X_train = torch.tensor(X_train.toarray(), dtype=torch.float32)
X_test = torch.tensor(X_test.toarray(), dtype=torch.float32)


In [308]:
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_test = torch.tensor(y_test.values, dtype=torch.long)

In [309]:
import torch
import torch.nn as nn
import torch.optim as optim

In [310]:
model = AntiCFTextClassifier(input_dim=X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [311]:
epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()
    
    _,outputs = model(X_train)  # Forward pass
    loss = criterion(outputs, y_train)  # Compute loss
    
    # Compute accuracy
    predictions = torch.argmax(outputs, dim=1)  # Get predicted class (0 or 1)
    correct = (predictions == y_train).sum().item()  # Count correct predictions
    accuracy = correct / y_train.size(0)  # Compute accuracy percentage
    
    # Backward pass & optimization
    loss.backward()
    optimizer.step()
    
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy * 100:.2f}%")


Epoch [1/10], Loss: 0.6411, Accuracy: 62.56%
Epoch [2/10], Loss: 0.4115, Accuracy: 85.69%
Epoch [3/10], Loss: 0.1650, Accuracy: 95.01%
Epoch [4/10], Loss: 0.1355, Accuracy: 95.51%
Epoch [5/10], Loss: 0.1545, Accuracy: 93.01%
Epoch [6/10], Loss: 0.1443, Accuracy: 92.68%
Epoch [7/10], Loss: 0.1191, Accuracy: 94.84%
Epoch [8/10], Loss: 0.0912, Accuracy: 97.00%
Epoch [9/10], Loss: 0.0673, Accuracy: 98.00%
Epoch [10/10], Loss: 0.0499, Accuracy: 99.17%


In [312]:
for epoch in range(epochs):
    with torch.no_grad():
        _,outputs = model(X_test)
        predictions = torch.argmax(outputs, dim=1)  # Get predicted class (0 or 1)

    # Compute accuracy
    correct = (predictions == y_test).sum().item()  # Count correct predictions
    accuracy = correct / y_test.size(0) * 100  # Compute accuracy percentage
    print(f"Accuracy {epoch+1}: {accuracy * 100:.2f}%")

Accuracy 1: 8609.27%
Accuracy 2: 8609.27%
Accuracy 3: 8609.27%
Accuracy 4: 8609.27%
Accuracy 5: 8609.27%
Accuracy 6: 8609.27%
Accuracy 7: 8609.27%
Accuracy 8: 8609.27%
Accuracy 9: 8609.27%
Accuracy 10: 8609.27%


# Prediction on Different Data Set


In [313]:
project = 'caffe'
path = f'datasets/{project}.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body into a single column; if Body is NaN, use Title only
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

# Keep only necessary columns: id, Number, sentiment, text (merged Title+Body)
pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})
pd_tplusb.to_csv('Title+BodyKeras.csv', index=False, columns=["id", "Number", "sentiment", "text"])

########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========

# 1) Data file to read
datafile = 'Title+BodyKeras.csv'

# 2) Number of repeated experiments
REPEAT = 10

# 3) Output CSV file name
out_csv_name = f'../{project}_NB.csv'

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)

# ========== Hyperparameter grid ==========
# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Lists to store metrics across repeated runs
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []

In [314]:

# X_train, X_test, y_train, y_test = train_test_split(data[text_col], data['sentiment'], train_size=0.8, random_state=42)
tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
X_new = tfidf.fit_transform(data[text_col])
X_new = torch.tensor(X_new.toarray(), dtype=torch.float32)
y_new= torch.tensor(data['sentiment'])

In [315]:
for epoch in range(epochs):
    with torch.no_grad():
        _,outputs = model(X_new)
        predictions = torch.argmax(outputs, dim=1)  # Get predicted class (0 or 1)

    # Compute accuracy
    correct = (predictions == y_new).sum().item()  # Count correct predictions
    accuracy = correct / y_new.size(0) * 100  # Compute accuracy percentage
    print(f"Accuracy {epoch+1}: {accuracy * 100:.2f}%")

Accuracy 1: 8846.15%
Accuracy 2: 8846.15%
Accuracy 3: 8846.15%
Accuracy 4: 8846.15%
Accuracy 5: 8846.15%
Accuracy 6: 8846.15%
Accuracy 7: 8846.15%
Accuracy 8: 8846.15%
Accuracy 9: 8846.15%
Accuracy 10: 8846.15%


# Trying Tent on TItle Body Caffe

In [316]:
import tent
import TextDataset
import importlib
# import batch_data_loader
importlib.reload(TextDataset)  
importlib.reload(tent)
import copy

In [317]:
dataset=TextDataset.TextDatasetTFIDF("datasets/caffe.csv")

Preprocessed DataFrame:
                                                   text  sentiment
9    unable to reproduce accuracy of bvlc-alexnet. ...          1
267  osx: abs not defined absval_layer. When compil...          0
143  cafe_intsall.caffe 36 error. I am trying caffe...          0
212   undefined reference to `lzma_index_end@XZ_5.0...          0
227  Dimension mismatch training with my own model ...          0


In [318]:
model_state = copy.deepcopy(model.state_dict())

In [319]:
for name, param in model.named_parameters():
        if "adapter" not in name:
            param.requires_grad = False


In [320]:
import anticf

In [276]:
result=anticf.AntiCF(model,dataset)
print(result)
tent.reset(model,model_state)

AntiCF Training: 100%|██████████| 5/5 [00:00<00:00, 26.47it/s]


✅ Final Accuracy: 0.8636
([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0


