In [29]:
from collections import namedtuple
import bs4 as BeautifulSoup
import urllib.request
import pandas as pd
import json
import re

In [7]:
url = 'https://sf.eater.com/maps/best-coffee-shops-san-francisco-oakland-berkeley'

In [8]:
data = urllib.request.urlopen(url)
read_data = data.read()

In [9]:
parsed_data = BeautifulSoup.BeautifulSoup(read_data,'html.parser')

In [10]:
titles = list(map(
        lambda x: x.text[1:],
        parsed_data.select('div.c-mapstack__cards--improved section h1')[1:]
        ))
paragraphs = list(map(lambda x: x.text, filter(lambda y: 'id=' not in str(y), 
             parsed_data.select('div.c-mapstack__cards--improved section div.c-entry-content p'))))

In [11]:
titles = [re.sub(r'[0-9]+. ', '', t) for t in titles]

corpus_df = pd.DataFrame({'Name': titles, 
                          'Description': paragraphs})
corpus_df = corpus_df[['Name', 'Description']]

In [13]:
corpus_df

Unnamed: 0,Name,Description
0,Trouble Coffee,"Yeah yeah, they've got the toast you crave. Ci..."
1,Andytown Coffee Roasters,This small Outer Sunset shop chooses and roast...
2,Garden House Cafe,In the reaches of the Outer Richmond lies Gard...
3,Snowbird Coffee,"Opened by former filmmaker Eugene Kim, Snowbir..."
4,Flywheel Coffee,Haight-Ashbury's new school roaster and cafe F...
5,fifty/fifty,"Small and modern, fifty/fifty offers coffee fr..."
6,Ritual Roasters Coffee,"Ritual now has six locations in SF, the latest..."
7,The Mill,A Four Barrel-fueled outpost that highlights t...
8,Wrecking Ball Coffee Roasters,Partners Nick Cho and Trish Rothgeb are now of...
9,Lady Falcon Coffee Club,"Lady Falcon Club isn't a coffee shop, exactly...."


In [14]:
corpus_df.to_csv('reviews_blog_a.csv', index=False)

In [30]:
"""
from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
all credits go to alko and arturomp @ stack overflow.
"""

f = open('contractionList.txt', 'r')
cList = json.loads(f.read())
c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text, c_re=c_re):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)

In [34]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer 
from nltk.tokenize import word_tokenize, sent_tokenize

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

new_words=('yeah', 'yea')
for i in new_words:
    stop_words.append(i)
    
stemmer = PorterStemmer() 
    
def normalizer(paragrahp):
    
    paragrahp = paragrahp.lower()
    paragrahp = expandContractions(paragrahp)
    paragrahp = paragrahp.split('.')
    paragrahp = paragrahp[:-1]
    # lower case and remove special character/whitespace
    paragrahp = [re.sub(r'[^a-zA-Z\s]','', sentence) for sentence in paragrahp]
    #paragrahp = [sentence.lower() for sentence in paragrahp]
    paragrahp = [sentence.strip() for sentence in paragrahp]
    tokens = [wpt.tokenize(sentence) for sentence in paragrahp]
    #tokens_filtered = [[word for word in token if word not in stop_words] for token in tokens]
    tokens_stemmed = [[stemmer.stem(word) for word in token] for token in  tokens]
    print (tokens_stemmed)
    doc = [' '.join(tokens) for tokens in tokens_stemmed]
    return doc

norm_corpus = normalizer(corpus_df['Description'][0])

[['got', 'toast', 'crave'], ['cinnamon', 'sugar', 'slathered', 'nutella', 'mitigate', 'toast', 'cravings'], ['also', 'got', 'best', 'coffee', 'neighborhood'], ['super', 'tiny', 'inside', 'expect', 'find', 'seat', 'instead', 'head', 'best', 'parklet', 'town', 'composed', 'huge', 'twisted', 'log'], ['also', 'great', 'place', 'seek', 'refuge', 'hours', 'long', 'wait', 'brunch', 'outerlands', 'next', 'door']]
[['got', 'toast', 'crave'], ['cinnamon', 'sugar', 'slather', 'nutella', 'mitig', 'toast', 'crave'], ['also', 'got', 'best', 'coffe', 'neighborhood'], ['super', 'tini', 'insid', 'expect', 'find', 'seat', 'instead', 'head', 'best', 'parklet', 'town', 'compos', 'huge', 'twist', 'log'], ['also', 'great', 'place', 'seek', 'refug', 'hour', 'long', 'wait', 'brunch', 'outerland', 'next', 'door']]


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True, ngram_range=(2,2))
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,also got,also great,best coffe,best parklet,brunch outerland,cinnamon sugar,coffe neighborhood,compos huge,expect find,find seat,...,seat instead,seek refug,slather nutella,sugar slather,super tini,tini insid,toast crave,town compos,twist log,wait brunch
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.63,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.42,0.0,0.0,0.0,0.0,...,0.0,0.0,0.42,0.42,0.0,0.0,0.34,0.0,0.0,0.0
2,0.5,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.27,0.27,...,0.27,0.0,0.0,0.0,0.27,0.27,0.0,0.27,0.27,0.0
4,0.0,0.3,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3
