In [11]:
import pandas as pd
import string
import random
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.tag import pos_tag

In [12]:
# Module-level resources shared by the text-cleaning helpers in this notebook.
wnl = WordNetLemmatizer()                     # single lemmatizer instance reused for every token
punctuation_list = string.punctuation         # a str of ASCII punctuation characters
eng_stopwords = stopwords.words('english')    # English stopwords from the NLTK corpus

def remove_stopwords(w_list):
     """Return the tokens of ``w_list`` that are not English stopwords.

     The comparison is case-insensitive: the entries of ``eng_stopwords`` are
     lower-cased and each token is lower-cased before the lookup, so
     capitalised stopwords such as "The" or "I" are removed as well. Tokens
     that are kept are returned with their original casing and order.

     Args:
          w_list: iterable of string tokens.

     Returns:
          A new list containing the non-stopword tokens.
     """
     # Build the lookup set once per call instead of scanning the stopword
     # list for every token.
     stopword_set = {stopword.lower() for stopword in eng_stopwords}
     return [word for word in w_list if word.lower() not in stopword_set]

def remove_punctuation(w_list):
     """Return the tokens of ``w_list`` that are not pure punctuation.

     A token is dropped when every one of its characters appears in
     ``punctuation_list`` (the empty string is dropped too). Testing
     character by character avoids the substring semantics of ``word in
     some_str``, which removed ",-" but kept multi-character tokens such as
     "..." or "``".

     Args:
          w_list: iterable of string tokens.

     Returns:
          A new list with punctuation-only tokens removed, order preserved.
     """
     # ``punctuation_list`` is a str here, but joining also copes with a list
     # of punctuation strings.
     punctuation_chars = set(''.join(punctuation_list))
     return [
          word for word in w_list
          if not all(char in punctuation_chars for char in word)
     ]

def remove_number(w_list):
     """Keep only the tokens that are made up entirely of letters.

     The filter is ``str.isalpha()``, so besides numbers it also drops any
     token containing digits, punctuation or apostrophes, and the empty
     string.

     Args:
          w_list: iterable of string tokens.

     Returns:
          A new list with the alphabetic tokens, in their original order.
     """
     alphabetic_tokens = []
     for token in w_list:
          if not token.isalpha():
               continue
          alphabetic_tokens.append(token)
     return alphabetic_tokens

def get_pos_tag(tag):
     """Map a Penn Treebank POS tag to the single-letter WordNet POS code.

     ``nltk.pos_tag`` produces upper-case tags that carry inflection suffixes
     (e.g. 'JJ', 'JJR', 'NNS', 'VBD', 'RBR'), so the tag is matched
     case-insensitively by its two-letter family prefix rather than by exact
     equality with a lower-case literal.

     Args:
          tag: POS tag string; ``None`` or '' is treated as "no mapping".

     Returns:
          'a' for adjectives (JJ*), 'n' for nouns (NN*), 'r' for adverbs
          (RB*), 'v' for verbs (VB*), or ``None`` for any other tag.
     """
     if not tag:
          return None

     normalized = tag.upper()
     # Adjectives are the one family whose WordNet code is not the first
     # letter of the Treebank tag.
     if normalized.startswith('JJ'):
          return 'a'
     # Two-letter prefixes keep unrelated tags out (e.g. 'RP' is a particle,
     # not an adverb).
     for prefix in ('NN', 'RB', 'VB'):
          if normalized.startswith(prefix):
               return prefix[0].lower()
     return None

def lemmatizing_words(w_list):
     """Lemmatize every token in ``w_list`` using its part-of-speech tag.

     The tokens are tagged with ``pos_tag``; each tag is translated with
     ``get_pos_tag``. When a translation exists it is passed to
     ``wnl.lemmatize`` as the POS, otherwise the lemmatizer is called with
     its default part of speech.

     Args:
          w_list: list of string tokens.

     Returns:
          A list of lemmas, one per input token, in the same order.
     """
     def _lemma_for(token, treebank_tag):
          wordnet_pos = get_pos_tag(treebank_tag)
          if wordnet_pos is None:
               return wnl.lemmatize(token)
          return wnl.lemmatize(token, wordnet_pos)

     return [_lemma_for(token, treebank_tag) for token, treebank_tag in pos_tag(w_list)]


In [13]:
def _preprocess_tokens(tokens):
     """Apply the shared cleaning pipeline: stopwords, punctuation, non-alphabetic tokens, lemmatization."""
     tokens = remove_stopwords(tokens)
     tokens = remove_punctuation(tokens)
     tokens = remove_number(tokens)
     return lemmatizing_words(tokens)


def training_model(dataset_path='./dataset_rm.csv', model_path='naive_bayes.pickle',
                   n_features=5000, train_ratio=0.8, seed=None):
     """Train a Naive Bayes classifier on the tweet dataset and pickle it.

     Reads the CSV at ``dataset_path`` (columns 'tweet' and 'label'), builds a
     vocabulary from the ``n_features`` most frequent cleaned tokens, encodes
     every tweet as a dict of word-presence booleans, shuffles, splits into
     train/test sets, trains, prints the most informative features and the
     accuracy on the held-out split, and writes the classifier to
     ``model_path``.

     Args:
          dataset_path: path of the CSV file to read.
          model_path: path the pickled classifier is written to.
          n_features: number of most frequent words used as features.
          train_ratio: fraction of the shuffled samples used for training.
          seed: when not None, the shuffle uses ``random.Random(seed)`` so the
               split is reproducible; None keeps using the global ``random``
               state.

     Returns:
          The trained ``NaiveBayesClassifier``.
     """
     dataset = pd.read_csv(dataset_path)
     comments_list = dataset['tweet'].to_list()
     label_list = dataset['label'].to_list()

     # Vocabulary: pool the tokens of every tweet, then clean them in one pass.
     word_list = []
     for sentence in comments_list:
          word_list.extend(word_tokenize(sentence))
     word_list = _preprocess_tokens(word_list)

     fd = FreqDist(word_list)
     word_features = [word for word, _ in fd.most_common(n=n_features)]

     features_sets = []
     for comment, label in zip(comments_list, label_list):
          # A set makes each of the n_features membership checks O(1)
          # instead of a scan over the tweet's token list.
          comment_words = set(_preprocess_tokens(word_tokenize(comment)))
          features = {word: (word in comment_words) for word in word_features}
          features_sets.append((features, label))

     if seed is None:
          random.shuffle(features_sets)
     else:
          random.Random(seed).shuffle(features_sets)

     train_count = int(len(features_sets) * train_ratio)
     train_dataset = features_sets[:train_count]
     test_dataset = features_sets[train_count:]

     classifier = NaiveBayesClassifier.train(train_dataset)
     classifier.show_most_informative_features(n=10)
     # The score is measured on the held-out split, so it is reported as test accuracy.
     print(f"Test Accuracy: {accuracy(classifier, test_dataset)*100}")

     # The context manager closes the file even if pickling fails.
     with open(model_path, 'wb') as file:
          pickle.dump(classifier, file)

     return classifier


In [14]:
# NOTE(review): the load-from-pickle path below is disabled, so the model is
# retrained (and 'naive_bayes.pickle' rewritten by training_model) on every run.
# If it is re-enabled, the except branch must itself call training_model(),
# a specific exception (e.g. FileNotFoundError) should replace the bare
# `except:`, and only pickle files from a trusted source should be loaded.
# try:
#      print("Load model...")
#      file = open('naive_bayes.pickle', 'rb')
#      classifier = pickle.load(file)
#      file.close()
# except:
#      print("No Model...")
classifier = training_model()