In [1]:
import ast
import csv
import logging
from collections import namedtuple

import gensim
import nltk
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

nltk.download('punkt')   # tokenizer
nltk.download('averaged_perceptron_tagger') # POS
nltk.download('wordnet') # similarity

def with_tag(bookmarks, tag):
    """Return a list of the bookmarks whose ``tags`` contain ``tag``.

    Args:
        bookmarks: iterable of objects exposing a ``tags`` container.
        tag: the tag to look for.

    Returns:
        A new list with the matching bookmarks, in their original order.
    """
    return [entry for entry in bookmarks if tag in entry.tags]

[nltk_data] Downloading package punkt to
[nltk_data]     /home/felipecortez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/felipecortez/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/felipecortez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# A bookmark is a title plus the list of tags attached to it.
Bookmark = namedtuple('Bookmark', ['title', 'tags'])

with open('marks.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    # The second column holds a list written as a string; ast.literal_eval
    # turns it into a real list and, unlike eval(), only accepts Python
    # literals, so file contents can never be executed as code.
    bookmarks = [Bookmark(row[0], ast.literal_eval(row[1])) for row in reader]

# Preview a small slice of the loaded data.
for b in bookmarks[30:50]:
    print(b)

Bookmark(title='Stefan Tosheff - Illustrations', tags=['design', 'people'])
Bookmark(title='\u200eStrange films are strange (actually good)', tags=['film'])
Bookmark(title='OpenGL Reference Card', tags=['compsci', 'graphics'])
Bookmark(title='The Art of Rendering', tags=['compsci', 'graphics'])
Bookmark(title='A Singular Christmas', tags=['music'])
Bookmark(title='List of musical works released in a stem format', tags=['lists', 'music', 'wikipedia'])
Bookmark(title='Alan Ranta', tags=['music', 'people'])
Bookmark(title='James Spectrum Studio', tags=['music'])
Bookmark(title='Tools of the Trade: Zorch - Omnichords', tags=['instruments', 'music'])
Bookmark(title='How to add depth to a mix', tags=['music', 'production'])
Bookmark(title='Chord Theory for Guitar', tags=['music'])
Bookmark(title='List of Internet phenomena', tags=['lists', 'misc', 'wikipedia'])
Bookmark(title='ii-V-I: A Softly Spoken Magic Spell', tags=['music'])
Bookmark(title='Mixtape Alpha with useful links', tags=['music

In [3]:
# Distinct tags used by any bookmark. Sorted so the printed list is the same
# on every run: iteration order of a set of strings is not stable across
# interpreter sessions.
all_tags = sorted({tag for bookmark in bookmarks for tag in bookmark.tags})
print(all_tags)

['golang', 'lisp', 'crypto', 'max', 'nlp', 'articles', 'papers', 'tex', 'lists', 'later', 'algorithms', '3d', 'private', 'misc', 'people', 'tweet', 'haskell', 'c', 'viz', 'css', 'cpp', 'music-theory', 'synth', 'architecture', 'gallery', 'clojure', 'mix', 'food', 'interviews', 'tcc', '.lucio', 'ponte', 'process', 'production', 'videos', 'birmingham', 'jazz', 'compsci', '.jobs', 'language', 'history', 'audiodev', 'guitar', 'ice', 'cah', 'philosophy', 'ai', 'proglangs', 'rym', 'foundry', 'site', 'swift', 'design', '.unlisted', 'functional', 'electronics', 'interactive', 'bmarks', 'art', 'wikipedia', 'books', 'maths', 'hardware', 'illustration', 'music', 'programming', 'awesome', 'photography', 'inpe', 'assembly', 'travel', 'dsp', 'blogs', 'js', '.apto', '.shopping', 'gamedev', 'java', 'data', 'sql', 'colors', 'tvtropes', 'vim', 'instruments', 'networks', 'grybo', 'python', 'infosec', 'ux', '.sp', 'now', 'animation', 'academia', 'film', 'typography', 'installation', 'graphics', '.campanha'

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the bookmarks for evaluation; the fixed random_state keeps
# the split identical between runs.
bookmarks_train, bookmarks_test = train_test_split(
    bookmarks,
    test_size=0.20,
    random_state=42,
)

# Report the size of each partition.
for label, subset in (("Train set:", bookmarks_train), ("Test set: ", bookmarks_test)):
    print(label, len(subset))

Train set: 2380
Test set:  595


In [5]:
# Show the POS-tagged tokens of the first 15 training titles tagged "compsci".
compsci_sample = with_tag(bookmarks_train, "compsci")[:15]
for entry in compsci_sample:
    print(nltk.pos_tag(nltk.word_tokenize(entry.title)))

[('tamask', 'NN'), ("'s", 'POS'), ('(', '('), ('KR0', 'NNP'), (')', ')'), ('Gists', 'VBZ')]
[('Framer', 'NNP'), ('-', ':'), ('Animation', 'NN'), ('Prototyping', 'VBG'), ('Tool', 'NN')]
[('Open', 'NNP'), ('Source', 'NNP'), ('Game', 'NNP'), ('Clones', 'NNP')]
[('A', 'DT'), ('few', 'JJ'), ('useful', 'JJ'), ('things', 'NNS'), ('to', 'TO'), ('know', 'VB'), ('about', 'IN'), ('machine', 'NN'), ('learning', 'NN')]
[('You', 'PRP'), ('Might', 'MD'), ('Not', 'RB'), ('Need', 'VB'), ('jQuery', 'NN')]
[('Alzheimer', 'NNP'), ('e', 'NN'), ('Música', 'NNP')]
[('Multiple', 'JJ'), ('lights', 'NNS')]
[('Pac-Man', 'NN'), ('AI', 'NNP')]
[('Latency', 'NN'), ('Numbers', 'NNP'), ('Every', 'NNP'), ('Programmer', 'NNP'), ('Should', 'NNP'), ('Know', 'NNP')]
[('Easing', 'VBG'), ('Equations', 'NNS')]
[('JavaScript', 'NNP'), ('for', 'IN'), ('kids', 'NNS')]
[('Miller', 'NNP'), ('Puckette', 'NNP'), ('MUS171', 'NNP'), ('Videos', 'NNP')]
[('A', 'DT'), ('Course', 'NNP'), ('in', 'IN'), ('Machine', 'NNP'), ('Learning', 'NN

In [6]:
import re
from collections import Counter, defaultdict
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

# tags_stems[tag][stem] counts how often `stem` occurs in training titles
# carrying `tag`; `words` collects every accepted stem, in order.
tags_stems = defaultdict(lambda: defaultdict(int))
words = []

# POS tags (as returned by nltk.pos_tag) whose words are kept as features.
relevant_pos = ["NNP", "NN", "NNS", "JJ", "VB", "VBG"]

for bookmark in bookmarks_train:
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(bookmark.title))
    for word, pos in tagged_tokens:
        if pos not in relevant_pos:
            continue
        stem = porter_stemmer.stem(word).lower()
        # Keep only stems made of word characters, dots and hyphens.
        if not re.match(r"^[\w\d.-]+$", stem):
            continue
        words.append(stem)
        for tag in bookmark.tags:
            tags_stems[tag][stem] += 1

# Dump each tag with its stems, most frequent first.
for tag, stem_counts in tags_stems.items():
    print(tag)
    print(sorted(stem_counts.items(), key=lambda pair: pair[1], reverse=True))
    print()

.unlisted
[('video', 1), ('game', 1), ('catherin', 1)]

functional
[('program', 4), ('function', 3), ('lisp', 2), ('haskel', 2), ('jargon', 1), ('mit', 1), ('simpl', 1), ('term', 1), ('clojur', 1), ('build', 1), ('land', 1), ('own', 1), ('true', 1), ('brave', 1), ('learn', 1), ('fast', 1), ('faust', 1), ('world', 1), ('music', 1), ('sicp', 1), ('languag', 1), ('book', 1), ('comic', 1), ('convers', 1), ('hard', 1), ('six-year-old', 1)]

golang
[('go', 5), ('program', 2), ('interact', 1), ('languag', 1), ('survey', 1), ('result', 1), ('effect', 1), ('jupyt', 1), ('introduct', 1)]

electronics
[('electron', 2), ('analog', 2), ('stereo', 2), ('vintag', 2), ('reverb', 2), ('instrument', 2), ('synthes', 2), ('keyb', 2), ('equip', 1), ('fuzz', 1), ('sale', 1), ('synth', 1), ('bastl', 1), ('ond', 1), ('novo', 1), ('model', 1), ('music', 1), ('auxren', 1), ('geek', 1), ('control', 1), ('from', 1), ('midi', 1), ('circuit', 1), ('distorcedor', 1), ('handmad', 1), ('manufactur', 1), ('outer', 1), 

In [7]:
import itertools

def autotag(title, tags_stems, top_n=3):
    """Suggest tags for a title by matching its stems against per-tag stem counts.

    The title is tokenized and POS-tagged; words whose POS is in the
    module-level ``relevant_pos`` are stemmed with ``porter_stemmer`` and
    lowercased. Each tag scores the sum of ``tags_stems[tag][stem]`` over the
    title's stems (a stem repeated in the title is counted each time).

    Args:
        title: bookmark title to tag.
        tags_stems: mapping ``tag -> {stem: count}``.
        top_n: maximum number of suggestions to return (default 3, the
            previous fixed value); ``None`` returns every scored tag.

    Returns:
        A list of ``(tag, score)`` tuples, highest score first. Tags with no
        matching stem are omitted, so the list may be shorter than ``top_n``
        or empty.
    """
    stems = []
    for word, pos in nltk.pos_tag(nltk.word_tokenize(title)):
        if pos not in relevant_pos:
            continue
        # Lowercase explicitly, as the cell that builds tags_stems does, so
        # matching does not rely on the stemmer's default casing behaviour.
        stem = porter_stemmer.stem(word).lower()
        if re.match(r"^[\w\d.-]+$", stem):
            stems.append(stem)

    tags_points = defaultdict(int)

    for stem in stems:
        for tag, stem_counts in tags_stems.items():
            # Direct lookup instead of scanning every stem of every tag.
            # Use a membership test: indexing a defaultdict would insert
            # the missing stem and mutate the caller's data.
            if stem in stem_counts:
                tags_points[tag] += stem_counts[stem]

    # sorted() is stable, so ties keep the order in which tags were first scored.
    ranked = sorted(tags_points.items(), key=lambda kv: -kv[1])
    return ranked[:top_n]

# Print the suggested tags next to the real ones for every held-out bookmark.
for entry in bookmarks_test:
    print(entry.title)
    suggestions = autotag(entry.title, tags_stems)
    print("Computed: ", suggestions)
    print("Real tags:", entry.tags)
    print()

300 discos importantes da música brasileira
Computed:  [('music', 11), ('lists', 4), ('articles', 2)]
Real tags: ['lists', 'music']

The cinematography of films and TV shows
Computed:  [('film', 26), ('lists', 9), ('animation', 3)]
Real tags: ['colors', 'film']

woscilloscope
Computed:  []
Real tags: ['audiodev', 'music', 'viz']

Academic Earth - Free video lectures
Computed:  [('music', 9), ('design', 9), ('gaming', 8)]
Real tags: ['compsci', 'videos']

Five Worlds
Computed:  [('compsci', 4), ('misc', 4), ('illustration', 3)]
Real tags: ['articles', 'misc', 'programming']

99% Invisible Podcast - Sounds
Computed:  [('music', 25), ('audiodev', 16), ('production', 11)]
Real tags: ['music', 'production']

Soundslice Tech Talk
Computed:  [('people', 4), ('design', 3), ('audiodev', 2)]
Real tags: ['compsci']

Principle of charity
Computed:  [('design', 5), ('animation', 2), ('compsci', 1)]
Real tags: ['philosophy']

Matthew Butterick
Computed:  [('design', 1), ('typography', 1), ('awesome'

Computed:  []
Real tags: ['music', 'rym']

Acapela group - Voice synthesis - TTS
Computed:  [('tcc', 13), ('audiodev', 10), ('music', 6)]
Real tags: ['tcc']

C++ Frequently Questioned Answers
Computed:  [('compsci', 3), ('production', 1), ('ice', 1)]
Real tags: ['cpp', 'proglangs', 'programming']

Contracampo's Top Brazilian Cinema
Computed:  [('design', 5), ('lists', 4), ('music', 4)]
Real tags: ['film']

Semantic UI
Computed:  [('programming', 2), ('awesome', 1), ('inpe', 1)]
Real tags: ['design']

Apple needs to tackle digital addiction
Computed:  [('design', 6), ('audiodev', 5), ('compsci', 3)]
Real tags: ['articles']

Big Cartoon DB
Computed:  [('design', 4), ('compsci', 3), ('film', 2)]
Real tags: ['animation', 'film']

The Line Animation — Work
Computed:  [('animation', 13), ('film', 12), ('compsci', 9)]
Real tags: ['animation', 'people']

Introduction to QT programming - YouTube
Computed:  [('compsci', 32), ('programming', 20), ('proglangs', 10)]
Real tags: ['compsci']

MindLab

Computed:  [('film', 25), ('lists', 9), ('animation', 3)]
Real tags: ['film', 'lists']

Max/MSP twisted comics
Computed:  [('misc', 2), ('awesome', 2), ('illustration', 2)]
Real tags: ['music']

Canal de alguém do The Mars Volta
Computed:  [('production', 2), ('music', 2)]
Real tags: ['music']

OpenGL Reference Card
Computed:  [('compsci', 2), ('programming', 2), ('graphics', 2)]
Real tags: ['compsci', 'graphics']

Sheet Music Art
Computed:  [('music', 66), ('audiodev', 29), ('design', 26)]
Real tags: ['design']

Filosofia da linguagem (6): Austin e Searle e os atos de fala
Computed:  [('papers', 4), ('tcc', 4), ('music', 3)]
Real tags: ['tcc']

Crystal Towers - Skyscrapers heights visualised
Computed:  [('architecture', 2), ('photography', 2), ('design', 1)]
Real tags: ['3d', 'art', 'awesome', 'viz']

Bastard Tetris
Computed:  [('compsci', 2), ('awesome', 1), ('programming', 1)]
Real tags: ['gaming', 'haha']

Telling the time - Sundials
Computed:  [('misc', 4), ('music', 4), ('compsci

Computed:  [('compsci', 9), ('algorithms', 4), ('awesome', 3)]
Real tags: ['algorithms', 'compsci', 'later', 'videos']

Designing with Microcontrollers Final Projects
Computed:  [('design', 63), ('compsci', 14), ('books', 12)]
Real tags: ['academia', 'compsci']

Bart Hopkin - Instrument inventor / designer
Computed:  [('design', 59), ('music', 17), ('instruments', 15)]
Real tags: ['instruments', 'music', 'people']

Google Interview Books
Computed:  [('books', 28), ('compsci', 24), ('design', 17)]
Real tags: ['compsci']

imgui - Bloat-free Immediate Mode Graphical User interface for C++
Computed:  [('design', 19), ('ux', 12), ('compsci', 11)]
Real tags: ['compsci', 'programming', 'ux']

Logic - Sound on Sound
Computed:  [('music', 50), ('audiodev', 33), ('production', 22)]
Real tags: ['music']

Stefan Tosheff - Illustrations
Computed:  [('illustration', 14), ('people', 7), ('design', 5)]
Real tags: ['design', 'people']

Wham Line (music)
Computed:  [('music', 61), ('audiodev', 28), ('co

Computed:  [('compsci', 38), ('books', 6), ('awesome', 6)]
Real tags: ['books', 'compsci', 'later']

The Magical Number Seven, Plus or Minus Two
Computed:  [('music', 4), ('compsci', 2), ('haha', 2)]
Real tags: ['maths', 'misc']

How the Mellotron Works
Computed:  [('music', 6), ('compsci', 3), ('programming', 3)]
Real tags: ['instruments', 'videos']

Julian House - Portfolio
Computed:  [('design', 2), ('people', 2), ('art', 2)]
Real tags: ['design', 'people']

Get files inside given directory
Computed:  [('programming', 4), ('audiodev', 3), ('compsci', 3)]
Real tags: ['compsci', 'ice']

Studio Headphones
Computed:  [('production', 8), ('music', 7), ('people', 3)]
Real tags: ['music']

Retro Synth Ads
Computed:  [('synth', 11), ('music', 8), ('design', 6)]
Real tags: ['design', 'illustration', 'synth']

Spurious Correlations
Computed:  [('tcc', 2), ('awesome', 1), ('articles', 1)]
Real tags: ['compsci']

An Illustrated Book of Bad Arguments
Computed:  [('books', 30), ('illustration', 1

Computed:  [('compsci', 58), ('programming', 19), ('books', 12)]
Real tags: ['misc', 'programming']

CSS Diner
Computed:  [('webdev', 6), ('css', 6), ('compsci', 1)]
Real tags: ['compsci', 'css']

Mathematics of the DFT
Computed:  [('compsci', 7), ('maths', 4), ('books', 1)]
Real tags: ['audiodev', 'maths']

manfred mohr - digital art
Computed:  [('design', 28), ('art', 20), ('illustration', 13)]
Real tags: ['art', 'people']

Online Etymology Dictionary
Computed:  [('misc', 6), ('language', 2), ('awesome', 2)]
Real tags: ['language']

Foundations of Data Science
Computed:  [('compsci', 19), ('viz', 6), ('algorithms', 4)]
Real tags: ['ai', 'books', 'data']

The Mill
Computed:  []
Real tags: ['design']

Jazz Blues Chord Substitutions for Guitar
Computed:  [('music', 27), ('jazz', 12), ('guitar', 11)]
Real tags: ['guitar', 'jazz', 'music', 'music-theory']

Cynicism (philosophy)
Computed:  [('philosophy', 3), ('compsci', 1), ('film', 1)]
Real tags: ['philosophy', 'wikipedia']

Roteiros de 

Computed:  [('design', 59), ('compsci', 13), ('books', 11)]
Real tags: ['design', 'people']

Daniel Danger Moody Drawings
Computed:  [('design', 2), ('programming', 2), ('python', 1)]
Real tags: ['design', 'people']

The Last Line Effect
Computed:  [('compsci', 3), ('film', 3), ('programming', 3)]
Real tags: ['compsci']

Eli Bendersky
Computed:  []
Real tags: ['blogs', 'compsci', 'people', 'programming']

Bookz - Encyclopedia Dramatica
Computed:  [('compsci', 1), ('music', 1)]
Real tags: ['books']

DJ Food
Computed:  [('misc', 1), ('food', 1)]
Real tags: ['music']

Andreas Wannerstedt - Motion graphics
Computed:  [('design', 15), ('graphics', 6), ('people', 6)]
Real tags: ['design', 'people']

IBM Type - Design
Computed:  [('design', 65), ('compsci', 14), ('typography', 14)]
Real tags: ['design', 'typography']

Superparticular ratio
Computed:  [('misc', 1), ('lists', 1), ('wikipedia', 1)]
Real tags: ['compsci', 'maths', 'wikipedia']

Pipl - People Search
Computed:  [('.jobs', 2), ('wik

A Private Jazz Pod in a Town House | Noise reduction
Computed:  [('music', 14), ('jazz', 8), ('guitar', 2)]
Real tags: ['acoustics', 'jazz', 'music']

Dillon Marsh Photography
Computed:  [('people', 9), ('design', 8), ('photography', 7)]
Real tags: ['design']

Home — Google Web Fundamentals
Computed:  [('compsci', 12), ('design', 12), ('webdev', 10)]
Real tags: ['design']

Scriptographer.org
Computed:  []
Real tags: ['design']

888 (number)
Computed:  [('haha', 2), ('maths', 2), ('wikipedia', 1)]
Real tags: ['maths', 'wikipedia']

Font Library (open fonts!)
Computed:  [('typography', 39), ('design', 30), ('compsci', 13)]
Real tags: ['design', 'typography']

Nick's Pedals
Computed:  [('music', 3), ('synth', 1), ('guitar', 1)]
Real tags: ['guitar', 'music']

Susan Kare
Computed:  [('design', 4), ('illustration', 4), ('.shopping', 2)]
Real tags: ['design', 'people', 'typography', 'ux']

A Matriz Quadrada | Ilha Quadrada
Computed:  []
Real tags: ['music']

Inside Abbey Road - Google
Comput

In [13]:
# Top-1 accuracy: the highest-scoring suggestion must be one of the real tags.
# Bookmarks that get no suggestion at all count as misses.

success = 0

for b in bookmarks_test:
    guess = autotag(b.title, tags_stems)
    if not guess:
        continue
    best_tag = guess[0][0]
    if best_tag in b.tags:
        success += 1

print(success / len(bookmarks_test))

0.38823529411764707


In [14]:
# Top-1 accuracy over the bookmarks that received at least one suggestion;
# bookmarks with no suggestion are left out of the denominator.

success = 0
skips = 0

for b in bookmarks_test:
    guess = autotag(b.title, tags_stems)
    if guess:
        if guess[0][0] in b.tags:
            success += 1
    else:
        skips += 1

print(success / (len(bookmarks_test) - skips))

0.4538310412573674


In [15]:
# Only consider cases where the best tag's score reaches a threshold.

MIN_SCORE = 10  # the top suggestion must score at least this much to be evaluated

success = 0
skips = 0

for b in bookmarks_test:
    guess = autotag(b.title, tags_stems)
    # Skip bookmarks with no suggestion or a low-scoring best suggestion.
    if not guess or guess[0][1] < MIN_SCORE:
        skips += 1
        continue
    if guess[0][0] in b.tags:
        success += 1

evaluated = len(bookmarks_test) - skips
# If every bookmark was skipped there is no accuracy to report; print nan
# instead of raising ZeroDivisionError.
print(success / evaluated if evaluated else float("nan"))

0.6171171171171171


In [17]:
# Require the best tag's score to reach a threshold, then count a success
# when any of the returned suggestions is in the real tag set.

MIN_SCORE = 10  # the top suggestion must score at least this much to be evaluated

success = 0
skips = 0

for b in bookmarks_test:
    guess = autotag(b.title, tags_stems)
    # Skip bookmarks with no suggestion or a low-scoring best suggestion.
    if not guess or guess[0][1] < MIN_SCORE:
        skips += 1
        continue

    if any(tag in b.tags for tag, _score in guess):
        success += 1

evaluated = len(bookmarks_test) - skips
# If every bookmark was skipped there is no accuracy to report; print nan
# instead of raising ZeroDivisionError.
print(success / evaluated if evaluated else float("nan"))

0.8513513513513513
