In [82]:
import os
from bs4 import BeautifulSoup
import lxml
from lxml.html.clean import Cleaner
from collections import OrderedDict
from operator import itemgetter
import json
import nltk
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [83]:
camera_path = "./camera/"
text = []

for domain in os.listdir(camera_path):
    if domain != '.DS_Store':
        for site in os.listdir(camera_path + domain):
            if site.endswith(".html"):
                with open(camera_path + domain + "/" + site) as curr:
                    soup = BeautifulSoup(curr, "lxml")
                    for script in soup.find_all(['script', 'style']):
                        script.decompose()
                    soup = soup.get_text()
                    text.append(soup)
                    break                    

In [84]:
clean_text = []
for t in text:
    t = os.linesep.join([s for s in t.splitlines() if s])
    clean_text.append(t)

In [85]:
def is_punctuation(word):
    for char in word:
        if char in string.punctuation:
            return True
    return False

In [99]:
import nltk
nltk.download('stopwords')

strings = str(clean_text)
 
stop_words = set(stopwords.words('english'))
words = word_tokenize(strings)
words_filtered = []
count_punct = 0

for w in words:
    w = w.lower()
    if w not in stop_words:
        if not is_punctuation(w):
            words_filtered.append(w)
        else:
            count_punct += 1

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
porter = nltk.stem.PorterStemmer()

stems = []

for word in words_filtered:
    stem = porter.stem(word)
    stems.append(stem)

In [88]:
stem2freq = {}
count = 0
for stem in stems:
    if stem in stem2freq.keys():
        count = stem2freq[stem] + 1
        stem2freq[stem] = count
    else:
        stem2freq[stem] = 1

In [89]:
stem2freq

{'eo': 25,
 'm50': 17,
 'kiss': 4,
 'digit': 2,
 'photographi': 2,
 'weekli': 2,
 'newslett': 1,
 'news': 1,
 'tip': 1,
 'read': 1,
 'mode': 1,
 'guidessampl': 1,
 'camera': 14,
 'buy': 2,
 'indexcanoncanon': 1,
 'interchang': 1,
 'len': 3,
 'feb': 4,
 '26': 5,
 '•': 1,
 '24': 2,
 'megapixel': 2,
 '3″': 1,
 'screen': 2,
 '46': 1,
 'amazon': 1,
 'review': 4,
 'product': 1,
 'shortlist': 1,
 'silver': 2,
 'apr': 5,
 '11': 2,
 '78': 2,
 'sampl': 4,
 '2': 3,
 'option': 2,
 '–': 1,
 'bodi': 1,
 'black': 11,
 'base': 5,
 'mount': 5,
 'adapt': 5,
 '32gb': 4,
 'sdhc': 4,
 'memori': 4,
 'card': 4,
 'creator': 2,
 'kit': 3,
 'white': 3,
 '14': 1,
 'user': 2,
 'write': 1,
 'question': 1,
 'q': 1,
 'ask': 1,
 'new': 4,
 'want': 4,
 '2018': 6,
 'discuss': 1,
 'canon': 11,
 'talk': 1,
 'midrang': 1,
 'mirrorless': 5,
 '24mp': 2,
 'cmo': 1,
 'sensor': 2,
 'digic': 1,
 '8': 1,
 'processor': 1,
 'dual': 3,
 'pixel': 3,
 'af': 6,
 'select': 1,
 'lens': 2,
 'offer': 2,
 'coverag': 2,
 '88': 2,
 'x': 8,
 

In [90]:
hottest_stems = {k: v for k, v in stem2freq.items() if len(k) >= 3}

In [91]:
hottest_stems

{'m50': 17,
 'kiss': 4,
 'digit': 2,
 'photographi': 2,
 'weekli': 2,
 'newslett': 1,
 'news': 1,
 'tip': 1,
 'read': 1,
 'mode': 1,
 'guidessampl': 1,
 'camera': 14,
 'buy': 2,
 'indexcanoncanon': 1,
 'interchang': 1,
 'len': 3,
 'feb': 4,
 'megapixel': 2,
 'screen': 2,
 'amazon': 1,
 'review': 4,
 'product': 1,
 'shortlist': 1,
 'silver': 2,
 'apr': 5,
 'sampl': 4,
 'option': 2,
 'bodi': 1,
 'black': 11,
 'base': 5,
 'mount': 5,
 'adapt': 5,
 '32gb': 4,
 'sdhc': 4,
 'memori': 4,
 'card': 4,
 'creator': 2,
 'kit': 3,
 'white': 3,
 'user': 2,
 'write': 1,
 'question': 1,
 'ask': 1,
 'new': 4,
 'want': 4,
 '2018': 6,
 'discuss': 1,
 'canon': 11,
 'talk': 1,
 'midrang': 1,
 'mirrorless': 5,
 '24mp': 2,
 'cmo': 1,
 'sensor': 2,
 'digic': 1,
 'processor': 1,
 'dual': 3,
 'pixel': 3,
 'select': 1,
 'lens': 2,
 'offer': 2,
 'coverag': 2,
 '100': 1,
 '143': 1,
 'point': 2,
 'first': 1,
 'use': 1,
 'cr3': 1,
 'raw': 5,
 'format': 2,
 'allow': 2,
 'smaller': 1,
 'file': 1,
 'size': 1,
 'capabl'

In [92]:
hottest_stems_ordered = [(k, hottest_stems[k]) for k in sorted(hottest_stems, key=hottest_stems.get, reverse=True)]

In [93]:
hottest_stems_ordered

[('m50', 17),
 ('camera', 14),
 ('black', 11),
 ('canon', 11),
 ('2018', 6),
 ('lumix', 6),
 ('apr', 5),
 ('base', 5),
 ('mount', 5),
 ('adapt', 5),
 ('mirrorless', 5),
 ('raw', 5),
 ('video', 5),
 ('fotocamera', 5),
 ('kiss', 4),
 ('feb', 4),
 ('review', 4),
 ('sampl', 4),
 ('32gb', 4),
 ('sdhc', 4),
 ('memori', 4),
 ('card', 4),
 ('new', 4),
 ('want', 4),
 ('guid', 4),
 ('mar', 4),
 ('nowcanon', 4),
 ('digital', 4),
 ('obiettivo', 4),
 ('len', 3),
 ('kit', 3),
 ('white', 3),
 ('dual', 3),
 ('pixel', 3),
 ('connect', 3),
 ('best', 3),
 ('nov', 3),
 ('full', 3),
 ('without', 3),
 ('qualiti', 3),
 ('high', 3),
 ('good', 3),
 ('light', 3),
 ('italia', 3),
 ('con', 3),
 ('neroeur', 3),
 ('panason', 3),
 ('soni', 3),
 ('digit', 2),
 ('photographi', 2),
 ('weekli', 2),
 ('buy', 2),
 ('megapixel', 2),
 ('screen', 2),
 ('silver', 2),
 ('option', 2),
 ('creator', 2),
 ('user', 2),
 ('24mp', 2),
 ('sensor', 2),
 ('lens', 2),
 ('offer', 2),
 ('coverag', 2),
 ('point', 2),
 ('format', 2),
 ('allo

In [94]:
count = 1
hot_words = []
for hot_word, freq in hottest_stems_ordered:
    if count < 200:
        hot_words.append(hot_word)
    count += 1

In [95]:
hot_words

['m50',
 'camera',
 'black',
 'canon',
 '2018',
 'lumix',
 'apr',
 'base',
 'mount',
 'adapt',
 'mirrorless',
 'raw',
 'video',
 'fotocamera',
 'kiss',
 'feb',
 'review',
 'sampl',
 '32gb',
 'sdhc',
 'memori',
 'card',
 'new',
 'want',
 'guid',
 'mar',
 'nowcanon',
 'digital',
 'obiettivo',
 'len',
 'kit',
 'white',
 'dual',
 'pixel',
 'connect',
 'best',
 'nov',
 'full',
 'without',
 'qualiti',
 'high',
 'good',
 'light',
 'italia',
 'con',
 'neroeur',
 'panason',
 'soni',
 'digit',
 'photographi',
 'weekli',
 'buy',
 'megapixel',
 'screen',
 'silver',
 'option',
 'creator',
 'user',
 '24mp',
 'sensor',
 'lens',
 'offer',
 'coverag',
 'point',
 'format',
 'allow',
 'take',
 'dot',
 'crop',
 'look',
 'say',
 'will',
 'shutter',
 'someth',
 'imag',
 'access',
 'captur',
 'iso',
 'focal',
 'reflex',
 'stm',
 'heavi',
 'popular',
 'alpha',
 'day',
 'switch',
 'part',
 'week',
 'one',
 'landscap',
 'newslett',
 'news',
 'tip',
 'read',
 'mode',
 'guidessampl',
 'indexcanoncanon',
 'interch

In [96]:
with open('./data/and/camera_hot_words.txt', 'w') as file:
    for hot_word in hot_words:
        file.write(hot_word + '\n')

m50
camera
black
canon
2018
lumix
apr
base
mount
adapt
mirrorless
raw
video
fotocamera
kiss
feb
review
sampl
32gb
sdhc
memori
card
new
want
guid
mar
nowcanon
digital
obiettivo
len
kit
white
dual
pixel
connect
best
nov
full
without
qualiti
high
good
light
italia
con
neroeur
panason
soni
digit
photographi
weekli
buy
megapixel
screen
silver
option
creator
user
24mp
sensor
lens
offer
coverag
point
format
allow
take
dot
crop
look
say
will
shutter
someth
imag
access
captur
iso
focal
reflex
stm
heavi
popular
alpha
day
switch
part
week
one
landscap
newslett
news
tip
read
mode
guidessampl
indexcanoncanon
interchang
amazon
product
shortlist
bodi
write
question
ask
discuss
talk
midrang
cmo
digic
processor
select
100
143
first
use
cr3
smaller
file
size
capabl
burst
continu
autofocu
fulli
articul
touchscreen
lcd
million
well
ole
electron
viewfind
record
uhd
30p
albeit
substanti
includ
parent
recommend
compress
interview
competit
cannib
dslr
sale
need
announc
expand
51200
length
slot
compat
usb
480
