# Bayersian Inference on IMDB reviews

### Part 1. Count Vectorizer.

In [12]:
# Import and data read

import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer


with open('reviews.txt', 'r') as f:
  reviews = f.read()
with open('labels.txt', 'r') as f:
  labels = f.read()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Data preprocessing

reviews = "".join([char for char in reviews if char not in string.punctuation])

reviews = reviews.split('\n')
labels = labels.split('\n')

# Tokenization, Lemmatization, Stemming. Label numerical encoding

import itertools

reviews_tokenized = []
for review in reviews:
  splitted_review = nltk.word_tokenize(review)
  splitted_review = [word for word in splitted_review if word not in stop_words]
  splitted_review = [WordNetLemmatizer().lemmatize(w) for w in splitted_review]
  splitted_review = [PorterStemmer().stem(w).strip() for w in splitted_review]
  reviews_tokenized.append(splitted_review)
  
reviews_unrolled = list(itertools.chain(*reviews_tokenized))
  
labels = [1 if label == "positive" else 0 for label in labels]

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Train test split
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.2, random_state = 1)

# Count vectorizer feature transformation
count_vector = CountVectorizer(stop_words = 'english', binary = False)

training_data = count_vector.fit_transform(X_train)
test_data = count_vector.transform(X_test)

In [18]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# Multinomial Naive Bayes model predictions
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

predictions = naive_bayes.predict(test_data)

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Model evaluation
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.8604279144171165
Precision score:  0.8769359564671411
Recall score:  0.8383353341336535
F1 score:  0.8572013093289689


### Part 2. TF-IDF

In [23]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.2, random_state = 1)

# TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words = 'english')

training_data = tfidf.fit_transform(X_train)
test_data = tfidf.transform(X_test)

In [24]:
# Naive Bayes model and predictions
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

predictions = naive_bayes.predict(test_data)

In [25]:
# Model evaluation
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.8686262747450509
Precision score:  0.8734793187347932
Recall score:  0.8619447779111644
F1 score:  0.8676737160120847
