In [31]:
import pandas as pd
import nltk
import string

from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk import word_tokenize

from sklearn.model_selection import train_test_split

In [32]:
df = pd.read_csv('reviews_dataset2.csv')
print(df.head())

                                                text  label
0  one best crichton novel sphere michael crichto...      1
1  medicine future z accomplished heart surgeon f...      1
2  beautiful gorgeous network comic book contains...      1
3  lover robicheaux book lover robicheaux demon s...      1
4  excellent broad survey development civilizatio...      1


In [33]:
# loading prepared dataset and splitting it to train and test parts
nltk.download('punkt')

def word_tokenizer(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.lower() not in string.punctuation]
    return tokens

def to_features(words):
    return {word: True for word in words}

dataset = [(to_features(word_tokenizer(text)), label) for text, label in zip(df['text'], df['label'])]

train_data, test_data = train_test_split(dataset, test_size=0.3, random_state=42)

[nltk_data] Downloading package punkt to /home/alex937/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
# NaiveBayesClassifier with direct usging Pandas DataFrame
classifier = NaiveBayesClassifier.train(train_data)

print(f'Accuracy: {accuracy(classifier, test_data):.2f}')
classifier.show_most_informative_features(5)

Accuracy: 0.88
Most Informative Features
                  refund = True                0 : 1      =     32.8 : 1.0
           directtovideo = True                0 : 1      =     18.6 : 1.0
               gibberish = True                0 : 1      =     18.6 : 1.0
             backordered = True                0 : 1      =     18.0 : 1.0
               excusable = True                0 : 1      =     18.0 : 1.0


In [19]:
# decrease size of dataset to reduce train and test times
print(len(dataset))
size_short_dataset = int(len(dataset)/4)
short_dataset = dataset[:size_short_dataset]
print(len(short_dataset))
print(short_dataset[:3])

short_train_data, short_test_data = train_test_split(short_dataset, test_size=0.3, random_state=42)

70481
17620
[({'one': True, 'best': True, 'crichton': True, 'novel': True, 'sphere': True, 'michael': True, 'excellant': True, 'certainly': True, 'hardest': True, 'put': True, 'read': True, 'story': True, 'revolves': True, 'around': True, 'man': True, 'named': True, 'norman': True, 'johnson': True, 'phycologist': True, 'travel': True, '4': True, 'civilans': True, 'remote': True, 'location': True, 'pacific': True, 'ocean': True, 'help': True, 'navy': True, 'top': True, 'secret': True, 'misssion': True, 'quickly': True, 'learn': True, 'half': True, 'mile': True, 'long': True, 'spaceship': True, 'center': True, '1000': True, 'foot': True, 'live': True, 'researching': True, 'spacecraft': True, 'joined': True, '5': True, 'personel': True, 'run': True, 'operation': True, 'however': True, 'surface': True, 'typhoon': True, 'come': True, 'support': True, 'ship': True, 'must': True, 'leave': True, 'team': True, 'ten': True, 'stuck': True, 'day': True, 'sea': True, 'find': True, 'actually': True,

In [24]:
# NaiveBayesClassifier with direct usging Pandas DataFrame and bigrams tokenizer
from nltk import bigrams

def tokenize_with_bigrams(text):
    tokens = word_tokenize(text)
    bigram_features = list(bigrams(tokens))
    all_features = tokens + ["_".join(bigram) for bigram in bigram_features]
    return {word: True for word in all_features}

print('Start tokenizing')
dataset_bigrams = [(tokenize_with_bigrams(text), label) for text, label in zip(df['text'], df['label'])]

print('Start splitting')
train_data_bg, test_data_bg = train_test_split(dataset_bigrams, test_size=0.3, random_state=42)

print('Start classifying')
classifier_bg = NaiveBayesClassifier.train(train_data_bg)

print(f'Accuracy: {accuracy(classifier_bg, test_data_bg):.2f}')

Start tokenizing
Start splitting
Start classifying
Accuracy: 0.82


In [26]:
# only bigrams
from nltk import bigrams

def tokenize_with_bigrams(text):
    tokens = word_tokenize(text)
    bigram_features = list(bigrams(tokens))
    all_features = ["_".join(bigram) for bigram in bigram_features]
    return {word: True for word in all_features}

print('Start tokenizing')
dataset_bigrams = [(tokenize_with_bigrams(text), label) for text, label in zip(df['text'], df['label'])]

print('Start splitting')
train_data_bg, test_data_bg = train_test_split(dataset_bigrams, test_size=0.3, random_state=42)

print('Start classifying')
classifier_bg = NaiveBayesClassifier.train(train_data_bg)

print(f'Accuracy: {accuracy(classifier_bg, test_data_bg):.2f}')


Start tokenizing
Start splitting
Start classifying
Accuracy: 0.68


In [27]:
# ngramms
from nltk.util import ngrams

def tokenize_with_ngrams(text, n=5):
    tokens = word_tokenize(text)
    ngram_features = list(ngrams(tokens, n))
    all_features = tokens + ["_".join(ngram) for ngram in ngram_features]
    return {word: True for word in all_features}

print('Start tokenizing')
dataset_ngrams = [(tokenize_with_ngrams(text), label) for text, label in zip(df['text'], df['label'])]

print('Start splitting')
train_data_ng, test_data_ng = train_test_split(dataset_ngrams, test_size=0.3, random_state=42)

print('Start classifying')
classifier_ng = NaiveBayesClassifier.train(train_data_ng)

print(f'Accuracy: {accuracy(classifier_ng, test_data_ng):.2f}')

Start tokenizing
Start splitting
Start classifying
Accuracy: 0.88


In [30]:
# save best model to file
import pickle

with open('naive_bayes_model_with_simple_tokenizer.pkl', 'wb') as file:
    pickle.dump(classifier, file)

#with open('naive_bayes_model_with_5_gramm_tokenizer.pkl', 'wb') as file:
#    pickle.dump(classifier_ng, file)

In [None]:
# NaiveBayesClassifier with PyTorch DataLoader
import torch
from torch.utils.data import Dataset, DataLoader

class ReviewsDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe['text'].values
        self.labels = dataframe['label'].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.labels[idx]
        return text, label

dataset = ReviewsDataset(df)
dataloader = DataLoader(dataset, batch_size=200, shuffle=True)

train_data = []
for texts, labels in dataloader:
    for text, label in zip(texts, labels):
        features = to_features(word_tokenizer(text))
        train_data.append((features, label))

# Обучение модели
classifier_2 = NaiveBayesClassifier.train(train_data)

print(f'Accuracy: {accuracy(classifier_2, test_data):.2f}')
classifier_2.show_most_informative_features(5)

In [10]:
# using NLTK + Custom Feature Extraction

from nltk.corpus import opinion_lexicon
# nltk.download('opinion_lexicon')

positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

def to_features_2(words):
    features = {}
    features['number_of_positive_features'] = len([word for word in words if word in positive_words])
    features['number_of_negative_features'] = len([word for word in words if word in negative_words])
    return features

all_dataset = [(to_features_2(word_tokenizer(text)), label) for text, label in zip(df['text'], df['label'])]

# filter wrong datasets to improve accuracy
filtered_train_dataset = [t for t in all_dataset if (t[1] == 1 and t[0]['number_of_positive_features'] > t[0]['number_of_negative_features']) 
                          or (t[1] == 0 and t[0]['number_of_positive_features'] < t[0]['number_of_negative_features'])]

from sklearn.model_selection import train_test_split

train_data_3_all, test_data_3_all = train_test_split(all_dataset, test_size=0.3, random_state=42)
train_data_3_filtered, test_data_3_filtered = train_test_split(filtered_train_dataset, test_size=0.3, random_state=42)

classifier_3 = NaiveBayesClassifier.train(train_data_3_filtered)
print(f'Accuracy: {accuracy(classifier_3, test_data_3_all):.2f}')


Accuracy: 0.83


In [None]:
# using DecisionTreeClassifier with direct usging Pandas DataFrame
# very slow method
from nltk.classify import DecisionTreeClassifier

classifier_4 = DecisionTreeClassifier.train(short_train_data)
print(f'Accuracy: {accuracy(classifier_4, short_test_data):.2f}')

In [15]:
# logistic regression with direct usging Pandas DataFrame
from nltk.classify import MaxentClassifier

classifier_5 = MaxentClassifier.train(train_data, max_iter=20)
print(f'Accuracy: {accuracy(classifier_5, test_data):.2f}')

  ==> Training (20 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.812


  exp_nf_delta = 2**nf_delta
  nf_exp_nf_delta = nftranspose * exp_nf_delta
  sum1 = numpy.sum(exp_nf_delta * A, axis=0)
  sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)


         Final               nan        0.188
Accuracy: 0.19


In [22]:
# maximum entropy classifier with direct usging Pandas DataFrame
from nltk.classify import ConditionalExponentialClassifier

classifier_6 = ConditionalExponentialClassifier.train(train_data)
print(f'Accuracy: {accuracy(classifier_6, test_data):.2f}')

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.812
         Final               nan        0.188
Accuracy: 0.19


In [11]:
# using opinion_lexicon without classifier
import random

from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize

positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

def to_features_2(words):
    features = {}
    features['number_of_positive_features'] = len([word for word in words if word in positive_words])
    features['number_of_negative_features'] = len([word for word in words if word in negative_words])
    return features

dataset_text_label = [(text, label) for text, label in zip(df['text'], df['label'])]
print(f'Number os dataset elements is: {len(dataset_text_label)}')

random.shuffle(dataset_text_label)

#print(dataset_text_label[:5])

def classify_text_by_opinion_lexicon(text):
    features = to_features_2(word_tokenize(text))
    return 1 if features['number_of_positive_features'] >= features['number_of_negative_features'] else 0

def calculate_accuracy(texts_set):
    correct = 0
    iterations_number = 0
    for text, label in texts_set:
        iterations_number += 1
        prediction = classify_text_by_opinion_lexicon(text)
        if prediction == label:
            correct += 1
        if iterations_number == 1000:
            iterations_number = 0
            #print(f"Next thousand was passed correct accuracy is {correct}")
    
    return correct

accuracy = calculate_accuracy(dataset_text_label) / len(dataset_text_label)
print(f"Accuracy: {accuracy:.2f}")



Number os dataset elements is: 70481
Accuracy: 0.82
