In [35]:
import pandas as pd
import string
import random
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.tag import pos_tag
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from gensim.models import Word2Vec

In [36]:
eng_stopwords = stopwords.words('english')
punctuation_list = string.punctuation
wnl = WordNetLemmatizer()

def remove_stopwords(w_list):
     return [word for word in w_list if word not in eng_stopwords]

def remove_punctuation(w_list):
     return [word for word in w_list if word not in punctuation_list]

def remove_number(w_list):
     return [word for word in w_list if word.isalpha()]

def get_pos_tag(tag):
     if tag == 'jj':
          return 'a'
     elif tag in ['nn', 'rb', 'vb']:
          return tag[0]
     else:
          return None

def lemmatizing_words(w_list):
     lemmatized = []
     tagging = pos_tag(w_list)
     for word, tag in tagging:
          label = get_pos_tag(tag)
          if label != None:
               lemmatized.append(wnl.lemmatize(word, label))
          else:
               lemmatized.append(wnl.lemmatize(word))
     return lemmatized


In [37]:
def preprocess_text(text):
    words = word_tokenize(text)
    words = remove_stopwords(words)
    words = remove_punctuation(words)
    words = remove_number(words)
    words = lemmatizing_words(words)
    return ' '.join(words)

In [38]:
def train_word2vec(sentences, size=100, window=5, min_count=1, workers=4):
    model = Word2Vec(sentences, vector_size=size, window=window, min_count=min_count, workers=workers)
    return model

def preprocess_for_word2vec(texts):
    return [preprocess_text(text).split() for text in texts]

def get_average_word2vec(tokens_list, model, k=100):
    if len(tokens_list) < 1:
        return np.zeros(k)
    vectorized = [model.wv[word] if word in model.wv else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def preprocess_and_vectorize(texts, model):
    vectors = []
    for text in texts:
        tokens = preprocess_text(text).split()
        vectors.append(get_average_word2vec(tokens, model))
    return np.array(vectors)

In [40]:
def training_model():
     dataset = pd.read_csv('./dataset_rm.csv')

     word_list = []
     comments_list = dataset['tweet'].to_list()
     label_list = dataset['label'].to_list()

     for sentence in comments_list:
          words = word_tokenize(sentence)
          for word in words:
               word_list.append(word)
     
     word_list = remove_stopwords(word_list)
     word_list = remove_punctuation(word_list)
     word_list = remove_number(word_list)
     # word_list = lemmatizing_words(word_list)

     labeled_data = list(zip(word_list, label_list))
     fd = FreqDist(word_list)
     word_features = [word for word, _ in fd.most_common(n=5000)]
     features_sets = []

     for comment, label in labeled_data:
          features = {}

          check_list = word_tokenize(comment)
          check_list = remove_stopwords(check_list)
          check_list = remove_punctuation(check_list)
          check_list = remove_number(check_list)
          # check_list = lemmatizing_words(check_list)

          for word in word_features:
               features[word] = (word in check_list)
          features_sets.append((features, label))  
     
     random.shuffle(features_sets)
     train_count = int(len(features_sets)*0.8)
     train_dataset = features_sets[:train_count]
     test_dataset = features_sets[train_count:]

     classifier = NaiveBayesClassifier.train(train_dataset)
     classifier.show_most_informative_features(n=10)
     print(f"Training Accuracy: {accuracy(classifier, test_dataset)*100}")
    
     file = open('naive_bayes22.pickle','wb')
     pickle.dump(classifier, file)
     file.close()

     y_true = [label for (_, label) in test_dataset]
     y_pred = [classifier.classify(features) for (features, _) in test_dataset]

     precision = precision_score(y_true, y_pred, pos_label=1)
     recall = recall_score(y_true, y_pred, pos_label=1)
     f1 = f1_score(y_true, y_pred, pos_label=1)

     print(f'Precision: {precision:.2f}')
     print(f'Recall: {recall:.2f}')
     print(f'F1 Score: {f1:.2f}')
    
     return classifier


In [41]:
classifier = training_model()

MemoryError: 

In [17]:
# try:
#      print("Load model...")
#      file = open('naive_bayes.pickle', 'rb')
#      classifier = pickle.load(file)
#      file.close()
# except:
#      print("No Model...")
classifier = training_model()

Most Informative Features
                  NIGGER = True                1 : 0      =     79.6 : 1.0
                   Idiot = True                1 : 0      =     70.5 : 1.0
                 Radical = True                1 : 0      =     53.1 : 1.0
                feminazi = True                1 : 0      =     45.4 : 1.0
                   Miley = True                1 : 0      =     44.5 : 1.0
                   Jihad = True                1 : 0      =     43.7 : 1.0
                 colored = True                1 : 0      =     43.4 : 1.0
                 bullied = True                1 : 0      =     41.8 : 1.0
                    Rape = True                1 : 0      =     38.2 : 1.0
                    Joke = True                1 : 0      =     37.7 : 1.0
Training Accuracy: 66.16666666666666
Precision: 0.77
Recall: 0.59
F1 Score: 0.67
