In [1]:
from collections import namedtuple
import csv
import nltk
import json
# import logging, gensim
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

nltk.download('punkt')   # tokenizer
nltk.download('averaged_perceptron_tagger') # POS
nltk.download('wordnet') # similarity

def with_tag(bookmarks, tag):
    return list(filter(lambda item: tag in item.tags, bookmarks))

  from collections import Sequence


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/heniozicukier/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/heniozicukier/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/heniozicukier/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
bookmarks = []
Bookmark = namedtuple('Bookmark', ['title', 'tags'])

with open('marks.csv', newline='', encoding="utf8") as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    # transforms lists as strings to real lists
    for row in reader:
        bookmarks.append(Bookmark(row[0], eval(row[1])))
        
for b in bookmarks[100:105]: print(b)

Bookmark(title='deepnight.net', tags=['compsci', 'gaming'])
Bookmark(title='How to Prototype a Game in Under 7 Days', tags=['gamedev'])
Bookmark(title='Java and XML - Tutorial', tags=['compsci', 'java'])
Bookmark(title='Plain English explanation of Big O', tags=['compsci', 'maths'])
Bookmark(title='Mathigon | World of Mathematics', tags=['compsci', 'maths'])


In [4]:
for b in with_tag(bookmarks, "compsci")[:5]:
    print(b)

Bookmark(title='“Right click and save as” needs to go away', tags=['compsci'])
Bookmark(title='Fun with Java2D - Strokes', tags=['compsci'])
Bookmark(title='Books for Computer Science Graduate Students', tags=['compsci'])
Bookmark(title='Chrome Extensions Intro', tags=['compsci'])
Bookmark(title="Google's Python Class", tags=['compsci', 'python'])


In [7]:
all_tags = []
for bookmark in bookmarks:
    all_tags.extend(bookmark.tags)
    
all_tags = list(set(all_tags))
print(all_tags)

['people', 'ponte', 'travel', 'haskell', 'vim', 'c', 'philosophy', 'synth', '.jobs', 'algorithms', 'clojure', '.unlisted', 'interactive', 'mix', 'sql', 'wikipedia', 'proglangs', 'site', 'articles', 'tvtropes', 'music', 'ai', 'foundry', 'max', 'tcc', 'swift', 'functional', 'academia', 'webdev', 'hardware', 'js', 'history', 'private', 'typography', 'interviews', 'now', 'misc', 'maths', 'assembly', 'gamedev', 'gaming', '.apto', 'production', 'nlp', 'tweet', 'lists', 'music-theory', 'golang', 'grybo', 'film', 'animation', 'acoustics', 'nix', 'gallery', 'art', 'cah', 'ice', 'papers', '3d', 'graphics', 'java', 'inpe', 'food', 'networks', 'dsp', 'rym', 'photography', 'jazz', 'cpp', 'videos', '.tab-collection', 'programming', 'audiodev', 'infosec', 'birmingham', 'process', '.lucio', 'python', 'haha', 'later', 'awesome', 'instruments', 'css', 'tex', 'guitar', 'compsci', '.shopping', '.sp', '.campanha', 'installation', 'architecture', 'electronics', 'illustration', 'books', 'data', 'crypto', 'li

In [8]:
from sklearn.model_selection import train_test_split

bookmarks_train, bookmarks_test = train_test_split(bookmarks, 
                                                   test_size=0.20, 
                                                   random_state=42)

print("Train set:", len(bookmarks_train))
print("Test set: ", len(bookmarks_test))

Train set: 2380
Test set:  595


In [9]:
for bookmark in with_tag(bookmarks_train, "compsci")[:5]:
    tokenized = nltk.word_tokenize(bookmark.title)
    print(nltk.pos_tag(tokenized))

[('tamask', 'NN'), ("'s", 'POS'), ('(', '('), ('KR0', 'NNP'), (')', ')'), ('Gists', 'VBZ')]
[('Framer', 'NNP'), ('-', ':'), ('Animation', 'NN'), ('Prototyping', 'VBG'), ('Tool', 'NN')]
[('Open', 'NNP'), ('Source', 'NNP'), ('Game', 'NNP'), ('Clones', 'NNP')]
[('A', 'DT'), ('few', 'JJ'), ('useful', 'JJ'), ('things', 'NNS'), ('to', 'TO'), ('know', 'VB'), ('about', 'IN'), ('machine', 'NN'), ('learning', 'NN')]
[('You', 'PRP'), ('Might', 'MD'), ('Not', 'RB'), ('Need', 'VB'), ('jQuery', 'NN')]


In [10]:
import re
from collections import Counter, defaultdict
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
pst = PorterStemmer()
wnl = WordNetLemmatizer()

relevant_pos = [
    "NNP",
    "NN",
    "NNS",
    "JJ",
]

def train(stem_fn):
    tags_stems = defaultdict(lambda: defaultdict(int))
    stems_tags = defaultdict(lambda: defaultdict(int))
    words = []

    for bookmark in bookmarks_train:
        tokenized = nltk.word_tokenize(bookmark.title)
        for word, pos in nltk.pos_tag(tokenized):
            if pos in relevant_pos:
                stem = stem_fn(word.lower())
                if re.match(r"^[\w\d.-]+$", stem):
                    for tag in bookmark.tags:
                        tags_stems[tag][stem] += 1
                        stems_tags[stem][tag] += 1
                    words.append(stem)
                    
    return (tags_stems, stems_tags)
    
# tags_stems, stems_tags = train(porter_stemmer.stem)
tags_stems, stems_tags = train(pst.stem)

In [21]:
print(sorted(tags_stems["compsci"].items(),
             key=lambda k_v: k_v[1],
             reverse=True)[:10])

[('comput', 33), ('game', 23), ('python', 19), ('algorithm', 17), ('learn', 16), ('guid', 16), ('program', 13), ('book', 12), ('languag', 12), ('music', 12)]


In [22]:
print(sorted(stems_tags["plugin"].items(),
             key=lambda k_v: k_v[1],
             reverse=True)[:10])

[('programming', 2), ('compsci', 2), ('lists', 2), ('music', 2), ('production', 2), ('audiodev', 1), ('design', 1), ('swift', 1)]


In [23]:
import itertools

def autotag(title, tags_stems, stem_fn):
    stems = []
    tokenized = nltk.word_tokenize(title)
    for word, pos in nltk.pos_tag(tokenized):
        if pos in relevant_pos:
            stem = stem_fn(word)
            if re.match(r"^[\w\d.-]+$", stem):
                stems.append(stem)
    
    tags_points = defaultdict(int)
    
    for stem in stems:
        for tag in tags_stems:
            for t_stem, count in tags_stems[tag].items():
                if stem == t_stem:
                    tags_points[tag] += count
                    
    return list(itertools.islice(sorted(tags_points.items(), 
                                        key=lambda kv: -kv[1]), 
                                 3))

In [24]:
stem_fn = pst.stem
tags_stems, stems_tags = train(stem_fn)

for b in bookmarks_test[:5]:
    print(b.title)
    print("Computed: ", autotag(b.title, tags_stems, pst.stem))
    print("Real tags:", b.tags)
    print()

300 discos importantes da música brasileira
Computed:  [('music', 11), ('lists', 4), ('articles', 2)]
Real tags: ['lists', 'music']

The cinematography of films and TV shows
Computed:  [('film', 25), ('lists', 9), ('animation', 3)]
Real tags: ['colors', 'film']

woscilloscope
Computed:  []
Real tags: ['audiodev', 'music', 'viz']

Academic Earth - Free video lectures
Computed:  [('design', 9), ('music', 9), ('gaming', 8)]
Real tags: ['compsci', 'videos']

Five Worlds
Computed:  [('compsci', 4), ('misc', 4), ('illustration', 3)]
Real tags: ['articles', 'misc', 'programming']



In [25]:
# check if best tag is in real tagset

success = 0

for b in bookmarks_test:
    guess = autotag(b.title, tags_stems, stem_fn)
    if guess and guess[0][0] in b.tags:
        success += 1
    
print(success / len(bookmarks_test))

0.3831932773109244


In [26]:
# skip if no tags were found

success = 0
skips = 0

for b in bookmarks_test:
    guess = autotag(b.title, tags_stems, stem_fn)
    if not guess:
        skips += 1
        continue
    if guess[0][0] in b.tags: 
            success += 1
    
print(success / (len(bookmarks_test) - skips), 
      skips, 
      "skipped", " / ", 
      len(bookmarks_test))

0.456 95 skipped  /  595


In [27]:
# only consider cases where the best tag is above a treshold

success = 0
skips = 0

for b in bookmarks_test:
    guess = autotag(b.title, tags_stems, stem_fn)
    if not guess or guess[0][1] < 10:
        skips += 1
        continue
    if guess[0][0] in b.tags: 
            success += 1
    
print(success / (len(bookmarks_test) - skips), 
      skips, 
      "skipped", " / ", 
      len(bookmarks_test))

0.6428571428571429 385 skipped  /  595


In [None]:
# check if best tag is above a treshold and check if one of the three best tags is in the real tagset

success = 0
skips = 0

for b in bookmarks_test:
    guess = autotag(b.title, tags_stems, stem_fn)
    if not guess or guess[0][1] < 10:
        skips += 1
        continue
    
    for g in guess:
        if g[0] in b.tags:
            success += 1
            break
    
print(success / (len(bookmarks_test) - skips), skips, "skipped", " / ", len(bookmarks_test))

In [None]:
# same, but using wordnet lemmatizer

success = 0
skips = 0

for b in bookmarks_test:
    guess = autotag(b.title, tags_stems, wnl.lemmatize)
    if not guess or guess[0][1] < 10:
        skips += 1
        continue
    
    for g in guess:
        if g[0] in b.tags:
            success += 1
            break
    
print(success / (len(bookmarks_test) - skips), skips, "skipped", " / ", len(bookmarks_test))

In [None]:
def autotag2(title, stems_tags, stem_fn):
    stems = []
    tokenized = nltk.word_tokenize(title)
    for word, pos in nltk.pos_tag(tokenized):
        if pos in relevant_pos:
            stem = stem_fn(word)
            if re.match(r"^[\w\d.-]+$", stem):
                stems.append(stem)
    
    tags_points = defaultdict(int)
    
    for stem in stems:
        if stem in stems_tags:
            for tag in stems_tags[stem].items():
                tags_points[tag[0]] += tag[1]
        
    return list(itertools.islice(sorted(tags_points.items(), key=lambda kv: -kv[1]), 3))

In [None]:
stem_fn = pst.stem
tags_stems, stems_tags = train(stem_fn)

for b in bookmarks_test:
    print(b.title)
    print("Computed: ", autotag2(b.title, stems_tags, pst.stem))
    print("Real tags:", b.tags)
    print()

In [None]:
# skip if no tags were found

success = 0
skips = 0

for b in bookmarks_test:
    guess = autotag2(b.title, tags_stems, stem_fn)
    if not guess:
        skips += 1
        continue
    if guess[0][0] in b.tags: 
            success += 1
    
print(success / (len(bookmarks_test) - skips), skips, "skipped", " / ", len(bookmarks_test))

In [None]:
def autotag2(title, stems_tags, stem_fn):
    stems = []
    tokenized = nltk.word_tokenize(title)
    for word, pos in nltk.pos_tag(tokenized):
        if pos in relevant_pos:
            stem = stem_fn(word)
            if re.match(r"^[\w\d.-]+$", stem):
                stems.append(stem)
    
    tags_points = defaultdict(int)
    
    for stem in stems:
        if stem in stems_tags:
            for tag in stems_tags[stem].items():
                tags_points[tag[0]] += tag[1]
        
    return list(itertools.islice(sorted(tags_points.items(), key=lambda kv: -kv[1]), 3))