# Scoring Opinions and Sentiments

## Understanding How Machines Read

In [1]:
# Three toy sentences with partly overlapping vocabulary, collected into the
# corpus used by the vectorizers below.
text_1, text_2, text_3 = (
    'The quick brown fox jumps over the lazy dog.',
    'My dog is quick and can jump over fences.',
    'Your dog is so lazy that it sleeps all the day.',
)
corpus = [text_1, text_2, text_3]

In [2]:
from sklearn.feature_extraction import text
# Learn the vocabulary from the corpus, then encode every sentence as a
# binary row (binary=True records presence only, not the count).
vectorizer = text.CountVectorizer(binary=True)
vectorizer.fit(corpus)
vectorized_text = vectorizer.transform(corpus)
print(vectorized_text.todense())

[[0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1 0]
 [0 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0]
 [1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 1]]


In [3]:
# Show the fitted word -> column-index mapping of the vectorizer.
print(vectorizer.vocabulary_)

{'day': 4, 'jumps': 11, 'that': 18, 'the': 19, 'is': 8, 'fences': 6, 'lazy': 12, 'and': 1, 'quick': 15, 'my': 13, 'can': 3, 'it': 9, 'so': 17, 'all': 0, 'brown': 2, 'dog': 5, 'jump': 10, 'over': 14, 'sleeps': 16, 'your': 20, 'fox': 7}


## Processing and Enhancing Text

In [4]:
# Add a fourth sentence in which "dog" occurs twice, then count words
# (no binary=True this time, so repeated words get a count above 1).
text_4 = 'A black dog just passed by but my dog is brown.'
# Guard the append so re-running this cell does not add the sentence again.
if text_4 not in corpus:
    corpus.append(text_4)
vectorizer = text.CountVectorizer().fit(corpus)
vectorized_text = vectorizer.transform(corpus)
# Print only the row of the last sentence in the corpus.
print(vectorized_text.todense()[-1])

[[0 0 1 1 1 1 0 0 2 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0]]


In [5]:
# Re-weight the raw counts with TF-IDF; norm='l1' is the normalisation
# passed to the transformer, and the loop below sums one row's weights.
TfidF = text.TfidfTransformer(norm='l1')
tfidf = TfidF.fit_transform(vectorized_text)

phrase = 3 # choose a number from 0 to 3
total = 0
# Densify only the selected row, once, instead of converting the whole
# sparse matrix on every loop iteration.
phrase_weights = tfidf[phrase].toarray()[0]
for word, pos in vectorizer.vocabulary_.items():
    value = phrase_weights[pos]
    if value != 0:
        print ("%10s: %0.3f" % (word, value))
        total += value
print ('\nSummed values of a phrase: %0.1f' % total)

        is: 0.077
        by: 0.121
     brown: 0.095
       dog: 0.126
      just: 0.121
        my: 0.095
     black: 0.121
    passed: 0.121
       but: 0.121

Summed values of a phrase: 1.0


In [6]:
# ngram_range=(2,2) keeps only two-word sequences as features.
bigrams = text.CountVectorizer(ngram_range=(2,2))
bigrams.fit(corpus)
print (bigrams.vocabulary_)

{'can jump': 6, 'by but': 5, 'over the': 21, 'it sleeps': 13, 'your dog': 31, 'the quick': 30, 'and can': 1, 'so lazy': 26, 'is so': 12, 'dog is': 7, 'quick brown': 24, 'lazy dog': 17, 'fox jumps': 9, 'is brown': 10, 'my dog': 19, 'passed by': 22, 'lazy that': 18, 'black dog': 2, 'brown fox': 3, 'that it': 27, 'quick and': 23, 'the day': 28, 'just passed': 16, 'dog just': 8, 'jump over': 14, 'sleeps all': 25, 'over fences': 20, 'jumps over': 15, 'the lazy': 29, 'but my': 4, 'all the': 0, 'is quick': 11}


## Stemming and Removing Stop Words

In [9]:
from sklearn.feature_extraction import text

import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
# Download NLTK's 'punkt' package (presumably the tokenizer data that
# word_tokenize needs -- confirm against the NLTK docs).
nltk.download('punkt')

# Single Porter stemmer instance, used by tokenize() below.
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    """Return a new list holding ``stemmer.stem(token)`` for each token, in order."""
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split ``text`` with ``word_tokenize`` and return the list of stemmed tokens.

    Uses the module-level ``stemmer`` instance.
    """
    return stem_tokens(word_tokenize(text), stemmer)

# Fit a vectorizer that tokenizes and stems with tokenize() and drops
# English stop words, using a single training sentence as vocabulary.
vocab = ['Sam loves swimming so he swims all the time']
vect = text.CountVectorizer(tokenizer=tokenize,
                           stop_words='english')
vec = vect.fit(vocab)

# Encode a new sentence against the vocabulary learned above.
sentence1 = vec.transform(['George loves swimming too!'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
if hasattr(vec, 'get_feature_names_out'):
    stemmed_features = vec.get_feature_names_out().tolist()
else:
    stemmed_features = vec.get_feature_names()
print (stemmed_features)
print (sentence1.toarray())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
['love', 'sam', 'swim', 'time']
[[1 0 1 0]]


## Scraping Textual Datasets from the Web

In [10]:
from bs4 import BeautifulSoup
import pandas as pd
# Import the URL library under one name on both Python 2 and Python 3.
# Catch only ImportError so unrelated failures are not silently swallowed.
try:
    import urllib2 # Python 2.7.x
except ImportError:
    import urllib.request as urllib2 # Python 3.x

# Download the Wikipedia page, sending an explicit User-Agent header, and
# parse it with the lxml parser.
wiki = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"
header = {'User-Agent': 'Mozilla/5.0'}
query = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(query)
soup = BeautifulSoup(page, "lxml")

In [11]:
# Locate the table to scrape by its CSS class and collect one record per
# usable row.
table = soup.find("table", { "class" : "wikitable sortable" })
final_table = list()
for row in table.findAll('tr'):
    cells = row.findAll("td")
    # cells[6] is read below, so a usable row needs at least seven <td>
    # cells; the former ">= 6" check let six-cell rows raise IndexError.
    if len(cells) >= 7:
        # First text node of cells 1-4.
        v1 = cells[1].find(text=True)
        v2 = cells[2].find(text=True)
        v3 = cells[3].find(text=True)
        v4 = cells[4].find(text=True)
        # Third text node of cell 6, keeping only its first
        # whitespace-separated token.
        v5 = cells[6].findAll(text=True)
        v5 = v5[2].split()[0]
        final_table.append([v1, v2, v3, v4, v5])
cols = ['City','State','Population_2014','Census_2010'
        ,'Land_Area_km2']
df = pd.DataFrame(final_table, columns=cols)

In [12]:
# Display the scraped records as a DataFrame.
df

Unnamed: 0,City,State,Population_2014,Census_2010,Land_Area_km2
0,New York,New York,8491079,8175133,783.8
1,Los Angeles,California,3928864,3792621,1213.9
2,Chicago,Illinois,2722389,2695598,589.6
3,Houston,Texas,2239558,2100263,1552.9
4,Philadelphia,Pennsylvania,1560297,1526006,347.3
5,Phoenix,Arizona,1537058,1445632,1338.3
6,San Antonio,Texas,1436697,1327407,1193.8
7,San Diego,California,1381069,1307402,842.2
8,Dallas,Texas,1281047,1197816,881.9
9,San Jose,California,1015785,945942,457.3


## Using Scoring and Classification

In [13]:
import warnings
# Ignore all warnings from this point on (set before the sklearn import).
warnings.filterwarnings("ignore")
from sklearn.datasets import fetch_20newsgroups

# Fetch only the 'misc.forsale' newsgroup, with headers, footers and quoted
# text removed; shuffling is seeded for reproducibility.
dataset = fetch_20newsgroups(
    categories=['misc.forsale'],
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=101,
)
print ('Posts: %i' % len(dataset.data))



Posts: 585


In [14]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

n_topics = 5

# TF-IDF features of the posts, using the max_df / min_df limits and the
# English stop-word list passed below.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)
tfidf = vectorizer.fit_transform(dataset.data)

# Factorize the TF-IDF matrix into n_topics components.
nmf = NMF(n_components=n_topics, random_state=101)
nmf.fit(tfidf)

In [15]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
n_top_words = 15
for topic_idx, topic in enumerate(nmf.components_):
    # The stray trailing comma inside print(...) was a Python 2 leftover.
    print ("Topic #%d:" % (topic_idx+1))
    # argsort is ascending, so the reversed tail slice yields the
    # n_top_words largest weights, highest first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print (" ".join(feature_names[i] for i in top_indices))

Topic #1:
drive hard card floppy monitor meg ram disk motherboard vga scsi brand color internal modem
Topic #2:
00 50 dos 20 10 15 cover 1st new 25 price man 40 shipping comics
Topic #3:
condition excellent offer asking best car old sale good new miles 10 000 tape cd
Topic #4:
email looking games game mail interested send like thanks price package list sale want know
Topic #5:
shipping vcr stereo works obo included amp plus great volume vhs unc mathes gibbs radley


In [16]:
print (nmf.components_[0,:].argsort()[:-n_top_words-1:-1]) 
# Prints the feature indices (not the words) of the n_top_words
# highest-weighted terms of the first topic, largest weight first

[1337 1749  889 1572 2342 2263 2803 1290 2353 3615 3017  806 1022 1938 2334]


In [17]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_terms = vectorizer.get_feature_names_out()
else:
    vocabulary_terms = vectorizer.get_feature_names()
print (vocabulary_terms[1337])
# Transforms index 1337 back to text

drive


## Analyzing Reviews from E-commerce

In [18]:
# Catch only ImportError so unrelated failures are not silently swallowed.
try:
    import urllib2 # Python 2.7.x
except ImportError:
    import urllib.request as urllib2 # Python 3.x
import requests, io, os, zipfile

# Download the zip archive into memory and extract its files, flattened,
# into the current working directory.
UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip'
response = requests.get(UCI_url)
# Fail with the HTTP error itself rather than a confusing "not a zip file"
# error when the server answers with an error page.
response.raise_for_status()
compressed_file = io.BytesIO(response.content)
z = zipfile.ZipFile(compressed_file)
print ('Extracting in %s' %  os.getcwd())
for name in z.namelist():
    # Keep only the last path component; it is empty for directory entries.
    filename = name.split('/')[-1]
    # Skip entries whose name contains 'MACOSX' or '.DS'.
    nameOK = ('MACOSX' not in name and '.DS' not in name)
    if filename and nameOK:
        newfile = os.path.join(os.getcwd(),
                               os.path.basename(filename))
        with open(newfile, 'wb') as f:
            f.write(z.read(name))
        print ('\tunzipping %s' % newfile)

Extracting in C:\Users\Luca\SciPkg\WinPython-64bit-3.4.4.1\notebooks\ML4D code
	unzipping C:\Users\Luca\SciPkg\WinPython-64bit-3.4.4.1\notebooks\ML4D code\amazon_cells_labelled.txt
	unzipping C:\Users\Luca\SciPkg\WinPython-64bit-3.4.4.1\notebooks\ML4D code\imdb_labelled.txt
	unzipping C:\Users\Luca\SciPkg\WinPython-64bit-3.4.4.1\notebooks\ML4D code\readme.txt
	unzipping C:\Users\Luca\SciPkg\WinPython-64bit-3.4.4.1\notebooks\ML4D code\yelp_labelled.txt


In [19]:
import numpy as np
import pandas as pd

# Read the header-less, tab-separated file and name its two columns.
dataset = 'imdb_labelled.txt'
data = pd.read_csv(
    dataset,
    sep=r"\t",
    header=None,
    engine='python',
)
data.columns = ['review', 'sentiment']

In [20]:
# Preview the first rows of the labelled reviews.
data.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [21]:
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# model_selection and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 25% of the reviews for testing. Columns are selected by position
# with .iloc because DataFrame.ix was removed in pandas 1.0.
corpus, test_corpus, y, yt = train_test_split(
    data.iloc[:, 0], data.iloc[:, 1], test_size=0.25, random_state=101)

In [22]:
from sklearn.feature_extraction import text

# Count unigrams and bigrams (English stop words removed), learning the
# vocabulary from the training reviews only.
vectorizer = text.CountVectorizer(stop_words='english', ngram_range=(1,2))
vectorizer.fit(corpus)

# Fit the TF-IDF weighting on the training counts and reuse it unchanged
# on the test counts.
TfidF = text.TfidfTransformer()
X = TfidF.fit_transform(vectorizer.transform(corpus))
Xt = TfidF.transform(vectorizer.transform(test_corpus))

In [23]:
from sklearn.svm import LinearSVC
# sklearn.grid_search was removed in scikit-learn 0.20; prefer
# model_selection and fall back only on old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Grid-search the regularisation strength C of a hinge-loss linear SVM.
param_grid = {'C': [0.01, 0.1, 1.0, 10.0, 100.0]}
clf = GridSearchCV(LinearSVC(loss='hinge',
                    random_state=101), param_grid)
clf = clf.fit(X, y)
print ("Best parameters: %s" % clf.best_params_)

Best parameters: {'C': 1.0}


In [24]:
from sklearn.metrics import accuracy_score

# Predict the held-out reviews and report the share of correct labels.
solution = clf.predict(Xt)
print("Achieved accuracy: %0.3f" % accuracy_score(yt, solution))

Achieved accuracy: 0.816


In [25]:
# Show the test reviews whose predicted label differs from the true one.
print(test_corpus[yt!=solution])

601    There is simply no excuse for something this p...
32     This is the kind of money that is wasted prope...
887    At any rate this film stinks, its not funny, a...
668    Speaking of the music, it is unbearably predic...
408         It really created a unique feeling though.  
413         The camera really likes her in this movie.  
138    I saw "Mirrormask" last night and it was an un...
132    This was a poor remake of "My Best Friends Wed...
291                               Rating: 1 out of 10.  
904    I'm so sorry but I really can't recommend it t...
410    A world better than 95% of the garbage in the ...
55     But I recommend waiting for their future effor...
826    The film deserves strong kudos for taking this...
100            I don't think you will be disappointed.  
352                                    It is shameful.  
171    This movie now joins Revenge of the Boogeyman ...
814    You share General Loewenhielm's exquisite joy ...
218    It's this pandering to t