In [34]:
# imports
import sys
import string

from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as sw

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [35]:
# # the training data folder must be passed as first argument
# movie_review_data_folder = "../scikit-learn/doc/tutorial/text_analytics/data/movie_reviews/txt_sentoken"
# dataset = load_files(movie_review_data_folder, shuffle = False)

# print("n_samples: %d" % len(dataset.data))

In [36]:
# # split the dataset in training and test set:
# docs_train, docs_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.25, random_state=None)

In [37]:
# tfidf = TfidfVectorizer(min_df = 3, max_df = 0.95)
# tfidf.fit_transform(docs_train, y_train)

In [38]:
# dataset.target_names

In [39]:
# tfidf

# Redo!

In [40]:
movie_review_data_folder = "../../scikit-learn/doc/tutorial/text_analytics/data/movie_reviews/txt_sentoken"
dataset = load_files(movie_review_data_folder, shuffle=False)
docs_train, docs_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.25, random_state=None)


# Create the document dictionary

In [41]:
stop_words = list(sw)

def term_freq(file):
    file = str(file)
    file = file.split()
    
    document_dict = set()
    
    for word in file:
        word = word.lower()
        
        for char in string.punctuation:
            word = word.replace(char, "")
            
        if word not in stop_words:
            if word not in document_dict:
                document_dict.add(word)
                
    return document_dict
    

# Count of every word in all movie reviews

In [42]:
word_counts = {}
for i in docs_train:
    current_dict = term_freq(i)
    
    for word in current_dict:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1

# Build a list of positive and negative reviews

In [43]:
positive_reviews = []
negative_reviews = []

for idx in range(len(docs_train)):
    if y_train[idx] == 1:
        positive_reviews.append(docs_train[idx])
    else:
        negative_reviews.append(docs_train[idx])
        

# Count occurance of words in the positive reviews

In [44]:
positive_word_counts = {}

for i in positive_reviews:
    current_dict = term_freq(i)
    
    for word in current_dict:
        if word in positive_word_counts:
            positive_word_counts[word] += 1
        else:
            positive_word_counts[word] = 1

# Count occurance of words in the negative reviews

In [45]:
negative_word_counts = {}

for i in negative_reviews:
    current_dict = term_freq(i)
    
    for word in current_dict:
        if word in negative_word_counts:
            negative_word_counts[word] += 1
        else:
            negative_word_counts[word] = 1

# What is the total number of words in all of the reviews, just positive reviews, and just negative reviews?

In [46]:
total_words = 0
for word in word_counts:
    total_words += word_counts[word]
    
    
total_words_positive = 0
for word in positive_word_counts:
    total_words_positive += positive_word_counts[word]
    
    
total_words_negative = 0
for word in negative_word_counts:
    total_words_negative += negative_word_counts[word]
    
print(total_words)

400372


# Make the Baysean classifier

In [100]:
def Bayes(word):
    positive_review_probability = 768/1500
    negative_review_probability = 732/1500
    
    if word in word_counts and word in positive_word_counts and word in negative_word_counts:
        word_probability = word_counts[word] / total_words
        
        positive_word_probability = positive_word_counts[word] / total_words_positive
        negative_word_probability = negative_word_counts[word] / total_words_negative
        
        final_prbability_positive = (positive_review_probability * positive_word_probability) / word_probability
        final_prbability_negative = (negative_review_probability * negative_word_probability) / word_probability
        
        return (final_prbability_positive, final_prbability_negative)
#         return "P(Positive | '" + word + "') = " + str(final_prbability_positive) + "\nP(Negative | '" + word + "') = " + str(final_prbability_negative) + "\n"
    else:
        return (None, None)
#         return "The given word is not used in any of the reviews, therefore the probability cannot be defined\n"

In [95]:
print(Bayes("explosion"))
print(Bayes("tension"))
print(Bayes("mystery"))
print(Bayes("scifi"))
print(Bayes("action"))
print(Bayes("elegant"))
print(Bayes("mastery"))
print(Bayes("fiction"))

P(Positive | 'explosion') = 0.47558833942346734
P(Negative | 'explosion') = 0.5245062542067698

P(Positive | 'tension') = 0.5739859268903916
P(Negative | 'tension') = 0.4258530402493836

P(Positive | 'mystery') = 0.46369863093788055
P(Negative | 'mystery') = 0.5364268508932872

P(Positive | 'scifi') = 0.5276338558132053
P(Negative | 'scifi') = 0.47232552908842906

P(Positive | 'action') = 0.4399493144945189
P(Negative | 'action') = 0.5602378655405337

P(Positive | 'elegant') = 0.7989884102314251
P(Negative | 'elegant') = 0.2002660243334939

P(Positive | 'mastery') = 0.5992413076735689
P(Negative | 'mastery') = 0.4005320486669878

P(Positive | 'fiction') = 0.6128604283025135
P(Negative | 'fiction') = 0.3868775470078859



# Probability of a review being positive of negative

In [106]:
def TypeProbability(review):
    review = str(review).split()
    
    probability_positive_list = []
    probability_negative_list = []
    
    for word in review:
        positive_probability, negative_probability = Bayes(word)
        
        if positive_probability != None:
            probability_positive_list.append(positive_probability)
            probability_negative_list.append(negative_probability)
            
    probability_positive = 1
    probability_not_positive = 1
    for i in probability_positive_list:
        probability_positive = probability_positive * i
        probability_not_positive = probability_not_positive * (1 - i)
        
    final_probability_positive = probability_positive / (probability_positive + probability_not_positive)
    
    
    probability_negative = 1
    probability_not_negative = 1
    for i in probability_negative_list:
        probability_negative = probability_negative * i
        probability_not_negative = probability_not_negative * (1 - i)
        
    final_probability_negative = probability_negative / (probability_negative + probability_not_negative)
    
    return "P(Positive) = " + str(final_probability_positive) + "\nP(Negative) = " + str(final_probability_negative) + "\n"

In [115]:
print(TypeProbability("I really enjoyed that movie, it was well done and thought out in an elegant mannor"))
print(TypeProbability("This movie was a waste of my time. I would rather watch grass grow"))
print(TypeProbability("I am impartial to this movie. Not the worst movie I have seen, but also not the best movie I have seen"))
print(TypeProbability("Go see it!"))
print(TypeProbability("Don't go see it"))
print(TypeProbability("Best movie your eyes have ever witnessed"))


P(Positive) = 0.8331905225271645
P(Negative) = 0.16612493461597555

P(Positive) = 0.32338041359219083
P(Negative) = 0.6779354313030018

P(Positive) = 0.22559999529963995
P(Negative) = 0.7753213621293825

P(Positive) = 0.5
P(Negative) = 0.5

P(Positive) = 0.5
P(Negative) = 0.5

P(Positive) = 0.5959580637345812
P(Negative) = 0.4038811464694796

