In [1]:

import pandas as pd
import numpy as np
import re
import math
import os
import subprocess

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation and tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc)

# Classifier
from sklearn.naive_bayes import GaussianNB

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/dwika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from br_classification import remove_html,remove_emoji,remove_stopwords,clean_str


In [8]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/dwika/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/dwika/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [9]:
datafile = 'Title+Body.csv'

# 2) Number of repeated experiments
REPEAT = 10

# 3) Output CSV file name

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(clean_str)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col]= data[text_col].apply(word_tokenize)

In [10]:
data[text_col][0]

['python',
 'doc',
 'global',
 'op',
 'carried',
 'source',
 '.',
 'bug',
 'missing',
 'many',
 'global',
 'operators',
 '(',
 ',',
 ',',
 ')',
 '.',
 'reproduce',
 'expected',
 'behavior',
 'global',
 'ops',
 'contain',
 'valid',
 's.',
 'environment',
 'pytorch',
 'version',
 '1.0.1.post2',
 'debug',
 'build',
 'cuda',
 'used',
 'build',
 'pytorch',
 '9.0.176',
 'os',
 'ubuntu',
 '16.04.5',
 'lts',
 'gcc',
 'version',
 '(',
 'ubuntu',
 '5.4.0',
 '6ubuntu1',
 '16.04.11',
 ')',
 '5.4.0',
 '20160609',
 'cmake',
 'version',
 'version',
 '3.9.4',
 'python',
 'version',
 '3.6',
 'cuda',
 'available',
 'yes',
 'cuda',
 'runtime',
 'version',
 'could',
 'collect',
 'gpu',
 'models',
 'configuration',
 'gpu',
 '0',
 'geforce',
 'gtx',
 'titan',
 'black',
 'gpu',
 '1',
 'geforce',
 'gtx',
 'titan',
 'black',
 'gpu',
 '2',
 'geforce',
 'gtx',
 'titan',
 'black',
 'gpu',
 '3',
 'geforce',
 'gtx',
 'titan',
 'black',
 'nvidia',
 'driver',
 'version',
 '390.30',
 'cudnn',
 'version',
 'usr',
 'lib

In [13]:
import string

In [11]:
def remove_punkt(tokens):
    punctuation_set = set(string.punctuation)
    return [word for word in tokens if not all(char in punctuation_set for char in word)]

In [14]:
data[text_col]= data[text_col].apply(remove_punkt)

# Stemming

 The paper mentions the following:
 After tokenization and stop word removal,
the words were stemmed into their lemma (root). For example,
the words: ‘execution’, ‘executed’, ‘executable’ were stemmed
into their lemma ‘execute’.
In Table 2, we have represented the output generated by
our pre-processing steps for two example sentences. In the
first row of the table, words such as ‘after’, ‘to’, and ‘the’
have been removed. Subsequently, the word ‘added’ has been
stemmed into ‘add’ (the root word). Similarly, the second row
has been pre-processed into ‘Prime face be found’.

In [15]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/dwika/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

In [17]:
data[text_col]= data[text_col].apply(lemmatize_tokens)

# Apply WMD (Word Mover Distance) approach 

In [18]:
import gensim.downloader as api
from gensim.models import KeyedVectors
import nltk
from nltk.tokenize import word_tokenize

In [19]:
model = api.load('word2vec-google-news-300')

In [20]:
def classify_document(X_test_withYPred,model):
    min_distance = float('inf')
    best_class=None 
    X_pred_result=X_test_withYPred.copy()
    for id_pred, text_pred,_ in X_pred_result.itertuples(index=False,name=None):
        for id_real, text_real,sentiment_real in X_test_withYPred.itertuples(index=False,name=None):
            if id_pred == id_real: #avoid comparing to the same docs
                continue
            distance = model.wmdistance(text_pred,text_real)
        
            if distance < min_distance:
            #get the smallest distance and change the label
                min_distance = distance
                best_class = sentiment_real
        X_pred_result.loc[X_pred_result["id"] == id_pred, "sentiment"] = best_class #save the result
        best_class=None #reset the classes for the next for loop
        min_distance=float('inf')

    return X_pred_result

In [21]:
X_test_with_Y_Pred=pd.DataFrame(
    data,
    columns=["id",text_col,"sentiment"],  
 
)
X_test_with_Y_Pred

Unnamed: 0,id,text,sentiment
0,688,"[python, doc, global, op, carried, source, bug...",0
1,616,"[dataloader, segmentation, fault, using, mpi, ...",0
2,322,"[torch.from, pil, request, feature, simple, me...",0
3,241,"[torch.halftensor, object, attribute, mean, re...",0
4,647,"[support, dilation, conv1d, conv3d]",0
...,...,...,...
747,225,"[discussion, recommend, different, file, exten...",0
748,712,"[feature, request, add, support, selu, activat...",0
749,481,"[jit, error, reporting, imported, module, high...",0
750,348,"[libtorch, segmentation, fault, rhel, 7, easy,...",0


In [None]:
X_result=classify_document(X_test_with_Y_Pred,model)
X_result.to_csv("word2mov.csv", index=False)

In [130]:
X_result=pd.read_csv('word2mov.csv')

In [131]:
import ast

In [132]:
acc = accuracy_score(X_test_with_Y_Pred['sentiment'], X_result['sentiment'])
prec = precision_score(X_test_with_Y_Pred['sentiment'], X_result['sentiment'], average='macro')


    # Recall (macro)
rec = recall_score(X_test_with_Y_Pred['sentiment'], X_result['sentiment'], average='macro')


    # F1 Score (macro)
f1 = f1_score(X_test_with_Y_Pred['sentiment'], X_result['sentiment'], average='macro')

fpr, tpr, _ = roc_curve(X_test_with_Y_Pred['sentiment'], X_result['sentiment'], pos_label=1)
auc_val = auc(fpr, tpr)


In [122]:
print(f"Accuracy:      {acc:.4f}")
print(f"Precision:     {prec:.4f}")
print(f"Recall:        {rec:.4f}")
print(f"F1 score:      {f1:.4f}")
print(f"AUC:           {auc_val:.4f}")

Accuracy:      0.8684
Precision:     0.7005
Recall:        0.6950
F1 score:      0.6977
AUC:           0.6950


# Try the model with Naive Bayes Classifier

## according to the paper we need to remove wrongful prediction

In [135]:
X_pred_result = X_result[X_test_with_Y_Pred['sentiment'] == X_result["sentiment"]]

In [137]:
X_pred_result

Unnamed: 0,id,text,sentiment
1,616,"['dataloader', 'segmentation', 'fault', 'using...",0
2,322,"['torch.from', 'pil', 'request', 'feature', 's...",0
3,241,"['torch.halftensor', 'object', 'attribute', 'm...",0
4,647,"['support', 'dilation', 'conv1d', 'conv3d']",0
5,701,"['better', 'error', 'message', 'compiling', 'c...",0
...,...,...,...
747,225,"['discussion', 'recommend', 'different', 'file...",0
748,712,"['feature', 'request', 'add', 'support', 'selu...",0
749,481,"['jit', 'error', 'reporting', 'imported', 'mod...",0
750,348,"['libtorch', 'segmentation', 'fault', 'rhel', ...",0


In [138]:
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Impelement Naive Bayes after WMD

In [139]:
def listToString(list):
    res = ' '.join(list)
    return res

In [147]:
X_Naive_Bayes=X_pred_result.copy()

In [148]:
#Join the strings back for tf idf

# " ".join(str(x) for x in X_Naive_Bayes[text_col])
X_Naive_Bayes[text_col]=X_Naive_Bayes[text_col].apply(ast.literal_eval)
X_Naive_Bayes[text_col]=X_Naive_Bayes[text_col].apply(listToString)

In [149]:
X_Naive_Bayes[text_col]

1      dataloader segmentation fault using mpi backen...
2      torch.from pil request feature simple method t...
3      torch.halftensor object attribute mean result ...
4                         support dilation conv1d conv3d
5      better error message compiling cudnn v5 which ...
                             ...                        
747    discussion recommend different file extension ...
748    feature request add support selu activation ca...
749    jit error reporting imported module highlight ...
750    libtorch segmentation fault rhel 7 easy reprod...
751    tracking issue rpc test flaky cc ezyang gchana...
Name: text, Length: 653, dtype: object

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [170]:
vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
X = vectorizer.fit_transform(X_Naive_Bayes["text"])  # Convert text into numerical vectors
y =X_Naive_Bayes["sentiment"]  # Labels

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# --- 1️⃣ Naive Bayes Classifier ---
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_train_nb = nb_model.predict(X_train)  # Predictions for training set
y_pred_nb = nb_model.predict(X_test)   
accuracy = accuracy_score(y_test, y_pred_nb)
# the algo on the paper says we should put the result of x_test and y_test again to the next model 
#as such we would input X_test into train_test_split
# # --- 2️⃣ KNN Classifier (Using NB Predictions as Input) ---
# knn_model = KNeighborsClassifier(n_neighbors=5)
# knn_model.fit(X_test, y_pred_nb)  # Training KNN on Naive Bayes predictions
# y_pred_knn = knn_model.predict(X_test)

# # --- 3️⃣ Random Forest Classifier (Using KNN Predictions as Input) ---
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_pred_knn)  # Training RF on KNN predictions
# y_pred_rf = rf_model.predict(X_test)

# # Final Evaluation

print(accuracy)


0.9351145038167938


In [180]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)  # Training KNN on Naive Bayes predictions
y_train_knn = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_train_knn)
accuracy

0.9312977099236641

In [177]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train_knn)  # Ensure training set matches
y_pred_rf = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_rf)
accuracy

0.9351145038167938