In [1]:

import pandas as pd
import numpy as np
import re
import math
import os
import subprocess

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation and tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc)

# Classifier
from sklearn.naive_bayes import GaussianNB

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/dwika/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from br_classification import remove_html,remove_emoji,remove_stopwords,clean_str


[nltk_data] Downloading package stopwords to /Users/dwika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== Naive Bayes + TF-IDF Results ===
Number of repeats:     10
Average Accuracy:      0.6238
Average Precision:     0.6056
Average Recall:        0.7402
Average F1 score:      0.5519
Average AUC:           0.7402

Results have been saved to: ../pytorch_NB.csv


In [3]:
NLTK_stop_words_list = stopwords.words('english')
custom_stop_words_list = ['...']  # You can customize this list as needed
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list


In [4]:
project = 'pytorch'
path = f'datasets/{project}.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body into a single column; if Body is NaN, use Title only
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

# Keep only necessary columns: id, Number, sentiment, text (merged Title+Body)
pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})
pd_tplusb.to_csv('Title+Body.csv', index=False, columns=["id", "Number", "sentiment", "text"])


In [5]:
project = 'pytorch'
path = f'datasets/{project}.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle


In [6]:
########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========

# 1) Data file to read
datafile = 'Title+Body.csv'

# 2) Number of repeated experiments
REPEAT = 10

# 3) Output CSV file name
out_csv_name = f'../{project}_NB.csv'

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)

# ========== Hyperparameter grid ==========
# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Lists to store metrics across repeated runs
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []

In [7]:
data

Unnamed: 0,id,Number,sentiment,text
0,688,17355,0,the python doc global op carried source. bug m...
1,616,13787,0,dataloader segmentation fault using mpi backen...
2,322,18998,0,torch.from pil( ) request ? . feature a simple...
3,241,4302,0,torch.halftensor object attribute mean. result...
4,647,367,0,support dilation conv1d conv3d
...,...,...,...,...
747,225,14864,0,discussion recommend different file extension ...
748,712,24991,0,feature request add support selu activation ca...
749,481,14653,0,jit error reporting imported modules highlight...
750,348,19969,0,libtorch segmentation fault rhel 7 easy reprod...


In [18]:
indices = np.arange(data.shape[0])
train_index, test_index = train_test_split(
        indices, test_size=0.2, random_state=0
    )
train_index

array([495, 251,  97, 520, 473, 401, 618, 181, 586, 261,  17, 666, 252,
       333, 334, 293, 691, 386, 678, 465,  45, 312,  62, 712, 327, 267,
       140, 302, 533, 127, 424, 364, 625, 144, 355,  21, 597, 392,  35,
       456, 240, 299,  77, 746, 241, 272, 109, 242, 417, 101, 306, 258,
       230, 576, 285, 735, 395, 356, 741, 698, 211, 253, 165, 188, 578,
       268,  34, 545, 316, 748, 249, 460, 624, 155, 271, 427, 468, 517,
       200, 247, 729, 319,  12, 161, 413, 453, 493, 692, 518, 159, 529,
       428, 229,  78,  92, 523,  66, 303, 352, 609, 310, 687, 369, 409,
       580, 667,  15, 245, 283,   6, 313, 331, 104, 436, 390, 416,  90,
       725, 389, 452, 218, 570, 530, 205, 738, 190, 477, 367, 194, 467,
       132, 233, 173, 178, 727, 569, 206, 536,  96, 645, 587, 425,  89,
       553, 179,   0,  46, 171, 742, 362, 107, 133, 496, 223, 582, 102,
       108, 213, 740, 315, 246, 602, 407, 125, 437, 583, 399, 608, 224,
        26, 623, 527, 492,   3, 134, 325, 350, 186, 564, 295, 74

In [19]:
test_index

array([647, 142, 415,  79, 214,  40, 397,  27, 263, 318, 673, 317, 422,
        14, 572, 531,  31, 534, 651, 479, 162, 406, 344, 463, 418, 391,
       449, 193, 346, 156, 202, 538, 722, 278, 103, 516,   8, 243, 505,
       122, 747, 613, 568,  75, 204, 404, 157, 222,   1, 294, 434,  55,
       650, 382, 693, 665, 363, 365, 585, 235, 542, 113, 170, 490, 636,
       210, 674, 612, 556, 605, 337, 521, 614, 454, 567, 546, 573, 375,
       250,  85, 236, 487, 187,  18, 172, 688, 595, 702,  50, 744, 519,
       714, 408, 592, 385, 374,  10,  68, 279, 255,   2, 745, 451, 743,
       654, 435, 644, 588, 696, 231, 351, 239, 400, 681, 175, 215, 366,
       360, 591, 526,  71,  49, 266, 402, 354, 649, 502, 474,  37, 694,
        48, 685, 628, 196, 503,  76,  64, 603,  52, 301, 403, 506, 338,
       641, 620, 462, 466, 118, 447, 575, 751])

In [20]:
train_text = data[text_col].iloc[train_index]
train_text

495    getting build error. i building pytorch i gett...
251    document torch.quantize per tensor torch.quant...
97     bug fail throw error computing loss tensors sh...
520    feature request pytorch more flexible optimize...
473    nccl hang pytorch distributed data parallel mi...
                             ...                        
707    mysterious tensor indexing problem. bug indexi...
192    migrate ` multinomial alias setup` th aten (cp...
629    build nccl failed build libnccl debian unstabl...
559    switch cuda svd qr using cusolver. currently u...
684    detach working properly stochastic variables. ...
Name: text, Length: 601, dtype: object

In [21]:
y_train = data['sentiment'].iloc[train_index]
y_test  = data['sentiment'].iloc[test_index]

In [22]:
test_text = data[text_col].iloc[test_index]

In [None]:
tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
X_train = tfidf.fit_transform(train_text) # this one traisn
X_test = tfidf.transform(test_text) 

In [25]:
X_train.shape

(601, 1000)

In [8]:
import matplotlib.pyplot as plt

In [None]:
for repeated_time in range(REPEAT):
    # --- 4.1 Split into train/test ---
    indices = np.arange(data.shape[0])
    train_index, test_index = train_test_split(
        indices, test_size=0.2, random_state=repeated_time
    )

    train_text = data[text_col].iloc[train_index]
    test_text = data[text_col].iloc[test_index]

    y_train = data['sentiment'].iloc[train_index]
    y_test  = data['sentiment'].iloc[test_index]

    # --- 4.2 TF-IDF vectorization ---
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
    X_train = tfidf.fit_transform(train_text)
    X_test = tfidf.transform(test_text)
   
    # --- 4.3 Naive Bayes model & GridSearch ---
    clf = GaussianNB()
    grid = GridSearchCV(
        clf,
        params,
        cv=5,              # 5-fold CV (can be changed)
        scoring='roc_auc'  # Using roc_auc as the metric for selection
    )
    grid.fit(X_train.toarray(), y_train)

    # Retrieve the best model
    best_clf = grid.best_estimator_
    best_clf.fit(X_train.toarray(), y_train)

    # --- 4.4 Make predictions & evaluate ---
    y_pred = best_clf.predict(X_test.toarray())

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    # Precision (macro)
    prec = precision_score(y_test, y_pred, average='macro')
    precisions.append(prec)

    # Recall (macro)
    rec = recall_score(y_test, y_pred, average='macro')
    recalls.append(rec)

    # F1 Score (macro)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(f1)

    # AUC
    # If labels are 0/1 only, this works directly.
    # If labels are something else, adjust pos_label accordingly.
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
    auc_val = auc(fpr, tpr)
    auc_values.append(auc_val)

# --- 4.5 Aggregate results ---
final_accuracy  = np.mean(accuracies)
final_precision = np.mean(precisions)
final_recall    = np.mean(recalls)
final_f1        = np.mean(f1_scores)
final_auc       = np.mean(auc_values)

print("=== Naive Bayes + TF-IDF Results ===")
print(f"Number of repeats:     {REPEAT}")
print(f"Average Accuracy:      {final_accuracy:.4f}")
print(f"Average Precision:     {final_precision:.4f}")
print(f"Average Recall:        {final_recall:.4f}")
print(f"Average F1 score:      {final_f1:.4f}")
print(f"Average AUC:           {final_auc:.4f}")

# Save final results to CSV (append mode)
try:
    # Attempt to check if the file already has a header
    existing_data = pd.read_csv(out_csv_name, nrows=1)
    header_needed = False
except:
    header_needed = True

df_log = pd.DataFrame(
    {
        'repeated_times': [REPEAT],
        'Accuracy': [final_accuracy],
        'Precision': [final_precision],
        'Recall': [final_recall],
        'F1': [final_f1],
        'AUC': [final_auc],
        'CV_list(AUC)': [str(auc_values)]
    }
)

df_log.to_csv(out_csv_name, mode='a', header=header_needed, index=False)

print(f"\nResults have been saved to: {out_csv_name}")

=== Naive Bayes + TF-IDF Results ===
Number of repeats:     10
Average Accuracy:      0.6238
Average Precision:     0.6056
Average Recall:        0.7402
Average F1 score:      0.5519
Average AUC:           0.7402

Results have been saved to: ../pytorch_NB.csv


In [17]:
train_text = data[text_col].iloc[train_index]
test_text = data[text_col].iloc[test_index]

y_train = data['sentiment'].iloc[train_index]
y_test  = data['sentiment'].iloc[test_index]

    # --- 4.2 TF-IDF vectorization ---
tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
X_train = tfidf.fit_transform(train_text)
X_test = tfidf.transform(test_text)

In [19]:
X_train.shape

(601, 1000)

<601x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 26895 stored elements in Compressed Sparse Row format>