In [2]:
# Required Python Packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from collections import Counter
import re

In [3]:
# First we read in the dataset
dataset = pd.read_csv("Dataset/50k_imdb_movie_reviews.csv")

In [4]:
print(dataset[0:1])


                                              review  sentiment   set
0  I went and saw this movie last night after bei...          1  test


In [11]:
# The above command returned a dataframe (with column and row names)pd.
type(pd.DataFrame.as_matrix(dataset[0:3]))

numpy.ndarray

In [7]:
# Notice that we need to access rows and columns differently than a list
print(dataset.iloc[0:5,2:3])

    set
0  test
1  test
2  test
3  test
4  test


In [8]:
print(dataset[0:5])

                                              review  sentiment   set
0  I went and saw this movie last night after bei...          1  test
1  Actor turned director Bill Paxton follows up h...          1  test
2  As a recreational golfer with some knowledge o...          1  test
3  I saw this film in a sneak preview, and it is ...          1  test
4  Bill Paxton has taken the true story of the 19...          1  test


In [18]:
# The dataset is already broken into test and trainging,we now have two data set
train_data = dataset.loc[dataset['set'] == 'train']
test_data = dataset.loc[dataset['set'] == 'test']

In [20]:
len(train_data.index)

25000

In [12]:
print(train_data.iloc[0:5])

                                                  review  sentiment    set
25000  Bromwell High is a cartoon comedy. It ran at t...          1  train
25001  Homelessness (or Houselessness as George Carli...          1  train
25002  Brilliant over-acting by Lesley Ann Warren. Be...          1  train
25003  This is easily the most underrated film inn th...          1  train
25004  This is not the typical Mel Brooks film. It wa...          1  train


In [13]:
for row in train_data[0:5]:
    print(row['sentiment'])

TypeError: string indices must be integers

In [14]:
# This is how we can iterate over the rows
for index,row in train_data[0:5].iterrows():
    print(row['sentiment'])

1
1
1
1
1


In [15]:
# We need a function that will split the text based upon sentiment
def get_text(reviews, score):
  # Join together the text in the reviews for a particular sentiment.
  # We lowercase to avoid "Not" and "not" being seen as different words, for example.
   
    s = ""
    for index,row in reviews.iterrows():
        if row['sentiment'] == score:
            s = s + row['review'].lower()
    
    return s

In [16]:
# We also need a function that will count word frequency for each sample
def count_text(text):
  # Split text into words based on whitespace.  Simple but effective.
  words = re.split("\s+", text)
  # Count up the occurence of each word.
  return Counter(words)

In [17]:
# Now we will capture the negative and positive samples in the training set.
# We will create two large strings, one of all text from positive reviews and one from the negatives
# We will then use these to create the word counts
# This will make the computations of the probabilities easier

# This will take a few minutes and use up some memory!

negative_train_text = get_text(train_data, 0)
positive_train_text = get_text(train_data, 1)

In [18]:
print(positive_train_text[0:100])

bromwell high is a cartoon comedy. it ran at the same time as some other programs about school life,


In [19]:
# Here we generate the word counts for each sentiment
negative_counts = count_text(negative_train_text)
# Generate word counts for positive tone.
positive_counts = count_text(positive_train_text)

In [4]:
print(negative_counts)

NameError: name 'negative_counts' is not defined

In [20]:
# We need this function to calculate a count of a given classification
def get_y_count(score):
  # Compute the count of each classification occuring in the data.
  # return len([r for r in reviews if r[1] == str(score)])
    c = 0
    for index,row in train_data.iterrows():
        if row['sentiment'] == score:
            c = c + 1
    
    return c

In [21]:
# We need these counts to use for smoothing when computing the prediction.
positive_review_count = get_y_count(1)
negative_review_count = get_y_count(0)

In [22]:
# These are the class probabilities (we saw them in the formula as P(y)).
prob_positive = positive_review_count / len(train_data)
prob_negative = negative_review_count / len(train_data)

In [23]:
print(prob_positive)

0.5


In [24]:
# Finallt, we create a function that will, given a text example, allow us to calculate the probability
# of a positive or negative review

def make_class_prediction(text, counts, class_prob, class_count):
  prediction = 1
  text_counts = Counter(re.split("\s+", text))
  for word in text_counts:
      # For every word in the text, we get the number of times that word occured in the reviews for a given class, add 1 to smooth the value, and divide by the total number of words in the class (plus the class_count to also smooth the denominator).
      # Smoothing ensures that we don't multiply the prediction by 0 if the word didn't exist in the training data.
      # We also smooth the denominator counts to keep things even.
      prediction *=  text_counts.get(word) * ((counts.get(word, 0) + 1) / (sum(counts.values()) + class_count))
  # Now we multiply by the probability of the class existing in the documents.
  return prediction * class_prob

In [25]:
print(train_data.iloc[0,0])

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [26]:
print("Negative prediction: {0}".format(make_class_prediction(train_data.iloc[0,0], negative_counts, prob_negative, negative_review_count)))
print("Positive prediction: {0}".format(make_class_prediction(train_data.iloc[0,0], positive_counts, prob_positive, positive_review_count)))

Negative prediction: 0.0
Positive prediction: 0.0


In [27]:
# Here we will create a function that will actually make the prediction
def make_decision(text, make_class_prediction):
    # Compute the negative and positive probabilities.
    negative_prediction = make_class_prediction(text, negative_counts, prob_negative, negative_review_count)
    positive_prediction = make_class_prediction(text, positive_counts, prob_positive, positive_review_count)

    # We assign a classification based on which probability is greater.
    if negative_prediction > positive_prediction:
      return 0
    return 1

In [28]:
print(make_decision(train_data.iloc[1,1], make_class_prediction))

TypeError: cannot use a string pattern on a bytes-like object

In [None]:
# Now we make predictions on the test data. Since it is a large set, we will simply select 200 movies.
predictions = [make_decision(row['review'], make_class_prediction) for index,row in test_data[500:700].iterrows()]

In [56]:
# We check the accuracy. Note that when we pull the column out of the data frame, we need to convert it to a list
# to compare with the predictions

actual = test_data['sentiment'].tolist()

actual = actual[500:700]


In [57]:
accuracy = sum(1 for i in range(len(predictions)) if predictions[i] == actual[i]) / float(len(predictions))
print("{0:.4f}".format(accuracy))

0.9400
