## Lesson: Curate a Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
def pretty_print_review_and_label(i):
    """Print the label at index ``i`` followed by the start of its review.

    Reads the notebook-level ``labels`` and ``reviews`` lists. The review is
    cut to its first 80 characters and always suffixed with "...".
    """
    label = labels[i]
    snippet = reviews[i][:80]
    print("\t:\t".join((label, snippet)) + "...")

# NOTE(review): machine-specific absolute path; point it at the local dataset directory.
PATH = "/home/isaac/UdacityDL/Week2/sentiment_network/"

# Load in what we know! One review per line. Only the newline terminator is
# stripped, so a final line without a trailing newline keeps its last character.
with open(PATH + "reviews.txt", "r") as reviews_file:
    reviews = [line.rstrip("\n") for line in reviews_file]

# What we want to know: one label per line, upper-cased.
with open(PATH + "labels.txt", "r") as labels_file:
    labels = [line.rstrip("\n").upper() for line in labels_file]

In [3]:
# Number of reviews read from reviews.txt.
len(reviews)

25000

In [4]:
# Full text of the review at index 1.
reviews[1]

'story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane  violent mob by the crazy chantings of it  s singers . unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting . even those from the era should be turned off . the cryptic dialogue would make shakespeare seem easy to a third grader . on a technical level it  s better than you might think with some good cinematography by future great vilmos zsigmond . future stars sally kirkland and frederic forrest can be seen briefly .  '

In [5]:
# Label at index 1 (labels were upper-cased when loaded).
labels[1]

'NEGATIVE'

## Lesson: Develop a Predictive Theory

In [6]:
# Show a handful of labelled reviews side by side to look for word/label patterns.
print("labels.txt \t : \t reviews.txt\n")
for review_index in (2137, 12816, 6267, 21934, 5297, 4998):
    pretty_print_review_and_label(review_index)

labels.txt 	 : 	 reviews.txt

NEGATIVE	:	this movie is terrible but it has some good effects .  ...
POSITIVE	:	adrian pasdar is excellent is this film . he makes a fascinating woman .  ...
NEGATIVE	:	comment this movie is impossible . is terrible  very improbable  bad interpretat...
POSITIVE	:	excellent episode movie ala pulp fiction .  days   suicides . it doesnt get more...
NEGATIVE	:	if you haven  t seen this  it  s terrible . it is pure trash . i saw this about ...
POSITIVE	:	this schiffer guy is a real genius  the movie is of excellent quality and both e...


## Project 1: Quick Theory Validation

In [7]:
from collections import Counter
import numpy as np

In [8]:
# Three independent word counters: words seen in positive reviews, words seen
# in negative reviews, and words seen in either.
pos_counter, neg_counter, total_counts = Counter(), Counter(), Counter()

In [10]:
# Count every space-separated token per sentiment label and overall.
# The counters are created in an earlier cell; clear them first so that
# re-running this cell does not double-count.
pos_counter.clear()
neg_counter.clear()
total_counts.clear()

for i, review in enumerate(reviews):
    if labels[i] == "POSITIVE":
        label_counter = pos_counter
    elif labels[i] == "NEGATIVE":
        label_counter = neg_counter
    else:
        # Reviews with any other label are not counted at all.
        continue

    # split(" ") (not split()) keeps empty tokens produced by repeated spaces.
    words = review.split(" ")
    label_counter.update(words)
    total_counts.update(words)

In [26]:
# The raw pos_counter and neg_counter rankings are almost the same, so the
# counts need to be normalized: score each word by a log positive/negative ratio.
pos_neg_ratios = Counter()

for term, cnt in total_counts.most_common():
    # Only words seen more than 50 times overall are scored.
    if cnt <= 50:
        continue

    # +1 in the denominator avoids dividing by zero for words never seen
    # in a negative review.
    ratio = pos_counter[term] / float(neg_counter[term] + 1)

    # Map the ratio to a log scale: positive for ratio > 1, otherwise
    # negative (the 0.01 offset keeps the log finite when ratio is 0).
    if ratio > 1:
        pos_neg_ratios[term] = np.log(ratio)
    else:
        pos_neg_ratios[term] = -np.log((1 / (ratio + 0.01)))

In [30]:
# The 30 words with the highest positive-to-negative score.
pos_neg_ratios.most_common(30)

[('edie', 4.6913478822291435),
 ('antwone', 4.4773368144782069),
 ('din', 4.4067192472642533),
 ('gunga', 4.1896547420264252),
 ('goldsworthy', 4.1743872698956368),
 ('yokai', 4.0943445622221004),
 ('gypo', 4.0943445622221004),
 ('paulie', 4.0775374439057197),
 ('visconti', 3.9318256327243257),
 ('flavia', 3.9318256327243257),
 ('gundam', 3.4231762883809305),
 ('iturbi', 3.2771447329921766),
 ('felix', 3.1527360223636558),
 ('capote', 3.0122615755052013),
 ('fido', 3.0081547935525483),
 ('excellently', 2.9789251552376097),
 ('vance', 2.9444389791664403),
 ('kolchak', 2.9311937524164198),
 ('hayworth', 2.9267394020670396),
 ('deanna', 2.9267394020670396),
 ('sammo', 2.8526314299133175),
 ('sox', 2.8332133440562162),
 ('laputa', 2.8332133440562162),
 ('biko', 2.8332133440562162),
 ('polanski', 2.8233610476132043),
 ('alvin', 2.8183982582710754),
 ('matthau', 2.8067217286092401),
 ('mathieu', 2.7725887222397811),
 ('gilliam', 2.7245795030534206),
 ('victoria', 2.6810215287142909)]

In [28]:
# The 30 lowest-scoring words, most negative first.
pos_neg_ratios.most_common()[:-31:-1]

[('boll', -4.0778152602708904),
 ('uwe', -3.9218753018711578),
 ('thunderbirds', -3.6447082358007989),
 ('beowulf', -3.6346396851769396),
 ('dahmer', -3.5572016301387364),
 ('wayans', -3.5197159818974928),
 ('seagal', -3.3202501058581921),
 ('gamera', -3.0853444322436783),
 ('dreck', -3.0370818333726866),
 ('unwatchable', -3.0269848170580955),
 ('stinker', -2.9876839403711624),
 ('mst', -2.7753833211707968),
 ('incoherent', -2.7641396677532537),
 ('flimsy', -2.7555911458712101),
 ('dillinger', -2.6762095952465517),
 ('yawn', -2.66353354771016),
 ('unfunny', -2.5545257844967644),
 ('ugh', -2.4922059522696118),
 ('turd', -2.4922059522696118),
 ('waste', -2.4907515123361046),
 ('blah', -2.4475792789485005),
 ('slugs', -2.4427317247372873),
 ('tripe', -2.3838842974917549),
 ('horrid', -2.3715779644809971),
 ('wtf', -2.3715779644809971),
 ('pointless', -2.3451073877136341),
 ('atrocious', -2.3187369339642556),
 ('damme', -2.3051524884992913),
 ('redeeming', -2.2667790015910296),
 ('prom', -