In [52]:

import pandas as pd
import numpy as np
import re
import math
import os
import subprocess

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation and tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc)

# Classifier
from sklearn.naive_bayes import GaussianNB

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import torch
import torch.nn

[nltk_data] Downloading package stopwords to /Users/dwika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
from br_classification import remove_html,remove_emoji,remove_stopwords,clean_str

In [54]:
NLTK_stop_words_list = stopwords.words('english')
custom_stop_words_list = ['...']  # You can customize this list as needed
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list

In [55]:
project = 'pytorch'
path = f'datasets/{project}.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body into a single column; if Body is NaN, use Title only
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

# Keep only necessary columns: id, Number, sentiment, text (merged Title+Body)
pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})
pd_tplusb.to_csv('Title+Body.csv', index=False, columns=["id", "Number", "sentiment", "text"])

In [56]:
project = 'pytorch'
path = f'datasets/{project}.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

In [57]:
########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========

# 1) Data file to read
datafile = 'Title+Body.csv'

# 2) Number of repeated experiments
REPEAT = 10

# 3) Output CSV file name
out_csv_name = f'../{project}_NB.csv'

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)

# ========== Hyperparameter grid ==========
# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Lists to store metrics across repeated runs
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []

In [58]:
from simple_mlp import MLPWithLayerNorm

In [59]:


X_train, X_test, y_train, y_test = train_test_split(data[text_col], data['sentiment'], train_size=0.8, random_state=42)
tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)


In [60]:
X_train = torch.tensor(X_train.toarray(), dtype=torch.float32)
X_test = torch.tensor(X_test.toarray(), dtype=torch.float32)


In [61]:
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_test = torch.tensor(y_test.values, dtype=torch.long)

In [62]:
import torch
import torch.nn as nn
import torch.optim as optim

In [63]:
model = MLPWithLayerNorm(input_dim=X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [64]:
epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()
    
    outputs = model(X_train)  # Forward pass
    loss = criterion(outputs, y_train)  # Compute loss
    
    # Compute accuracy
    predictions = torch.argmax(outputs, dim=1)  # Get predicted class (0 or 1)
    correct = (predictions == y_train).sum().item()  # Count correct predictions
    accuracy = correct / y_train.size(0)  # Compute accuracy percentage
    
    # Backward pass & optimization
    loss.backward()
    optimizer.step()
    
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy * 100:.2f}%")


Epoch [1/10], Loss: 0.6926, Accuracy: 51.58%
Epoch [2/10], Loss: 0.4307, Accuracy: 87.69%
Epoch [3/10], Loss: 0.4125, Accuracy: 87.69%
Epoch [4/10], Loss: 0.3863, Accuracy: 87.69%
Epoch [5/10], Loss: 0.3639, Accuracy: 87.69%
Epoch [6/10], Loss: 0.3498, Accuracy: 87.69%
Epoch [7/10], Loss: 0.3426, Accuracy: 87.69%
Epoch [8/10], Loss: 0.3364, Accuracy: 87.69%
Epoch [9/10], Loss: 0.3241, Accuracy: 87.69%
Epoch [10/10], Loss: 0.3053, Accuracy: 87.69%


In [65]:
for epoch in range(epochs):
    with torch.no_grad():
        outputs = model(X_test)
        predictions = torch.argmax(outputs, dim=1)  # Get predicted class (0 or 1)

    # Compute accuracy
    correct = (predictions == y_test).sum().item()  # Count correct predictions
    accuracy = correct / y_test.size(0) * 100  # Compute accuracy percentage
    print(f"Accuracy {epoch+1}: {accuracy * 100:.2f}%")

Accuracy 1: 8609.27%
Accuracy 2: 8609.27%
Accuracy 3: 8609.27%
Accuracy 4: 8609.27%
Accuracy 5: 8609.27%
Accuracy 6: 8609.27%
Accuracy 7: 8609.27%
Accuracy 8: 8609.27%
Accuracy 9: 8609.27%
Accuracy 10: 8609.27%


# Prediction on Different Data Set


In [78]:
project = 'tensorflow'
path = f'datasets/{project}.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body into a single column; if Body is NaN, use Title only
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

# Keep only necessary columns: id, Number, sentiment, text (merged Title+Body)
pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})
pd_tplusb.to_csv('Title+BodyCaffe.csv', index=False, columns=["id", "Number", "sentiment", "text"])

########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========

# 1) Data file to read
datafile = 'Title+BodyCaffe.csv'

# 2) Number of repeated experiments
REPEAT = 10

# 3) Output CSV file name
out_csv_name = f'../{project}_NB.csv'

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)

# ========== Hyperparameter grid ==========
# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Lists to store metrics across repeated runs
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []

In [79]:

# X_train, X_test, y_train, y_test = train_test_split(data[text_col], data['sentiment'], train_size=0.8, random_state=42)
tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
X_new = tfidf.fit_transform(data[text_col])
X_new = torch.tensor(X_new.toarray(), dtype=torch.float32)
y_new= torch.tensor(data['sentiment'])

In [80]:
for epoch in range(epochs):
    with torch.no_grad():
        outputs = model(X_new)
        predictions = torch.argmax(outputs, dim=1)  # Get predicted class (0 or 1)

    # Compute accuracy
    correct = (predictions == y_new).sum().item()  # Count correct predictions
    accuracy = correct / y_new.size(0) * 100  # Compute accuracy percentage
    print(f"Accuracy {epoch+1}: {accuracy * 100:.2f}%")

Accuracy 1: 8127.52%
Accuracy 2: 8127.52%
Accuracy 3: 8127.52%
Accuracy 4: 8127.52%
Accuracy 5: 8127.52%
Accuracy 6: 8127.52%
Accuracy 7: 8127.52%
Accuracy 8: 8127.52%
Accuracy 9: 8127.52%
Accuracy 10: 8127.52%
