In [3]:
import pandas as pd
import string
import random
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from gensim.models import Word2Vec

eng_stopwords = stopwords.words('english')
punctuation_list = string.punctuation
wnl = WordNetLemmatizer()

def remove_stopwords(w_list):
    return [word for word in w_list if word not in eng_stopwords]

def remove_punctuation(w_list):
    return [word for word in w_list if word not in punctuation_list]

def remove_number(w_list):
    return [word for word in w_list if word.isalpha()]

def get_pos_tag(tag):
    if tag == 'jj':
        return 'a'
    elif tag in ['nn', 'rb', 'vb']:
        return tag[0]
    else:
        return None

def lemmatizing_words(w_list):
    lemmatized = []
    tagging = pos_tag(w_list)
    for word, tag in tagging:
        label = get_pos_tag(tag)
        if label != None:
            lemmatized.append(wnl.lemmatize(word, label))
        else:
            lemmatized.append(wnl.lemmatize(word))
    return lemmatized

def preprocess_text(text):
    words = word_tokenize(text)
    words = remove_stopwords(words)
    words = remove_punctuation(words)
    words = remove_number(words)
    words = lemmatizing_words(words)
    return ' '.join(words)

def train_word2vec(sentences, size=100, window=10, min_count=1, workers=5):
    model = Word2Vec(sentences, vector_size=size, window=window, min_count=min_count, workers=workers)
    return model

def preprocess_for_word2vec(texts):
    return [preprocess_text(text).split() for text in texts]

def get_average_word2vec(tokens_list, model, k=100):
    if len(tokens_list) < 1:
        return np.zeros(k)
    vectorized = [model.wv[word] if word in model.wv else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def preprocess_and_vectorize(texts, model):
    vectors = []
    for text in texts:
        tokens = preprocess_text(text).split()
        vectors.append(get_average_word2vec(tokens, model))
    return np.array(vectors)

def training_model():
    dataset = pd.read_csv('./dataset_rm.csv')

    comments_list = dataset['tweet'].to_list()
    label_list = dataset['label'].to_list()

    sentences = preprocess_for_word2vec(comments_list)
    word2vec_model = train_word2vec(sentences)
    
    X_word2vec = preprocess_and_vectorize(comments_list, word2vec_model)
    y = label_list

    X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.2, random_state=42)

    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)
    y_pred = nb_model.predict(X_test)

    def evaluate_model(y_test, y_pred, model_name):
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label=1)
        recall = recall_score(y_test, y_pred, pos_label=1)
        f1 = f1_score(y_test, y_pred, pos_label=1)

        print(f"{model_name} Accuracy: {accuracy}")
        print(f"{model_name} Classification Report:\n{report}")
        print(f'{model_name} Precision: {precision:.2f}')
        print(f'{model_name} Recall: {recall:.2f}')
        print(f'{model_name} F1 Score: {f1:.2f}')

    evaluate_model(y_test, y_pred, "Naive Bayes")

    # Save models
    with open('naive_bayes_word2vec.pickle', 'wb') as file:
        pickle.dump(nb_model, file)

    with open('word2vec_model.pickle', 'wb') as file:
        pickle.dump(word2vec_model, file)
    
    return nb_model, word2vec_model

model, word2vec_model = training_model()


Naive Bayes Accuracy: 0.5718888888888889
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.67      0.57      7660
           1       0.67      0.50      0.57     10340

    accuracy                           0.57     18000
   macro avg       0.58      0.58      0.57     18000
weighted avg       0.60      0.57      0.57     18000

Naive Bayes Precision: 0.67
Naive Bayes Recall: 0.50
Naive Bayes F1 Score: 0.57
