In [118]:
from collections import namedtuple
import csv
import nltk
import logging, gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

nltk.download('punkt')   # tokenizer
nltk.download('averaged_perceptron_tagger') # POS
nltk.download('wordnet') # similarity

def with_tag(bookmarks, tag):
    return list(filter(lambda item: tag in item.tags, bookmarks))

[nltk_data] Downloading package punkt to
[nltk_data]     /home/felipecortez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/felipecortez/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/felipecortez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [102]:
bookmarks = []
Bookmark = namedtuple('Bookmark', ['title', 'tags'])

with open('marks.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    # transforms lists as strings to real lists
    for row in reader:
        bookmarks.append(Bookmark(row[0], eval(row[1])))
        
for b in bookmarks[30:50]: print(b)

Bookmark(title='Stefan Tosheff - Illustrations', tags=['design', 'people'])
Bookmark(title='\u200eStrange films are strange (actually good)', tags=['film'])
Bookmark(title='OpenGL Reference Card', tags=['compsci', 'graphics'])
Bookmark(title='The Art of Rendering', tags=['compsci', 'graphics'])
Bookmark(title='A Singular Christmas', tags=['music'])
Bookmark(title='List of musical works released in a stem format', tags=['lists', 'music', 'wikipedia'])
Bookmark(title='Alan Ranta', tags=['music', 'people'])
Bookmark(title='James Spectrum Studio', tags=['music'])
Bookmark(title='Tools of the Trade: Zorch - Omnichords', tags=['instruments', 'music'])
Bookmark(title='How to add depth to a mix', tags=['music', 'production'])
Bookmark(title='Chord Theory for Guitar', tags=['music'])
Bookmark(title='List of Internet phenomena', tags=['lists', 'misc', 'wikipedia'])
Bookmark(title='ii-V-I: A Softly Spoken Magic Spell', tags=['music'])
Bookmark(title='Mixtape Alpha with useful links', tags=['music

In [113]:
all_tags = []
for bookmark in bookmarks:
    all_tags.extend(bookmark.tags)
    
all_tags = list(set(all_tags))
print(all_tags)

['3d', 'golang', 'ice', 'photography', 'maths', 'misc', 'vim', 'videos', 'architecture', 'food', 'functional', 'cpp', 'acoustics', 'infosec', '.apto', 'programming', 'blogs', 'sql', 'travel', 'audiodev', 'css', 'rym', '.shopping', 'birmingham', 'cah', 'interviews', 'networks', 'electronics', 'illustration', 'inpe', 'gaming', 'ux', 'now', 'private', 'c', 'later', 'papers', 'java', 'lisp', '.sp', 'python', 'awesome', 'js', 'lists', 'art', 'tex', 'max', 'clojure', '.jobs', 'grybo', 'typography', 'site', 'nlp', 'gallery', 'music-theory', 'design', 'viz', 'academia', 'hardware', 'bmarks', 'ai', 'crypto', 'nix', 'articles', '.campanha', 'assembly', 'guitar', 'dsp', 'webdev', 'data', 'production', 'colors', 'installation', 'language', 'film', 'people', 'synth', 'proglangs', 'tcc', 'jazz', 'philosophy', 'foundry', 'algorithms', 'music', '.lucio', 'instruments', 'wikipedia', 'interactive', '.unlisted', 'mix', 'gamedev', 'compsci', 'tvtropes', 'swift', 'graphics', 'books', 'history', 'ponte', '.

In [104]:
from sklearn.model_selection import train_test_split

bookmarks_train, bookmarks_test = train_test_split(bookmarks, test_size=0.20, random_state=42)

print(len(bookmarks_train))
print(len(bookmarks_test))

2380
595


In [119]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
words = []

for bookmark in with_tag(bookmarks_train, "production"):
    lemmatized = [wordnet_lemmatizer.lemmatize(w.lower()) for w in bookmark.title.split()]
    words.extend(lemmatized)

In [120]:
for bookmark in with_tag(bookmarks_train, "production"):
    tokenized = nltk.word_tokenize(bookmark.title)
    print(nltk.pos_tag(tokenized))

[('Studio', 'NNP'), ('Monitor', 'NNP'), ('Placement', 'NNP'), ('—', 'NNP'), ('Finding', 'VBG'), ('the', 'DT'), ('Sweet', 'NNP'), ('Spot', 'NNP')]
[('Kid', 'NNP'), ('Koala', 'NNP'), ('|', 'NNP'), ('Equipboard', 'NNP')]
[('Vinylium', 'NN')]
[('Kid', 'VB'), ('A', 'NNP'), ('(', '('), ('The', 'DT'), ('Wire', 'NNP'), (')', ')')]
[('Beautifully', 'RB'), ('Produced', 'NNP'), ('Albums', 'NNP')]
[('Aphex', 'NNP'), ('Twin', 'NNP'), ("'s", 'POS'), ('automated', 'VBN'), ('snare', 'NN'), ('drum', 'NN')]
[('Audio-Technica', 'NNP'), ('AT4051', 'NNP')]
[('Musicianship', 'NN'), ('of', 'IN'), ('Brian', 'NNP'), ('Wilson', 'NNP')]
[('Patch', 'NNP'), ('&', 'CC'), ('Tweak', 'NNP'), ('•', 'VBZ'), ('the', 'DT'), ('new', 'JJ'), ('book', 'NN'), ('about', 'IN'), ('modular', 'JJ'), ('synthesis', 'NN')]
[('Watson', 'NNP'), ('Wu', 'NNP')]
[('Sinevibes', 'NNS')]
[('Behind', 'IN'), ('The', 'DT'), ('Sounds', 'NNPS'), (':', ':'), ('I', 'PRP'), ('Know', 'VBP'), ('There', 'EX'), ("'s", 'VBZ'), ('An', 'DT'), ('Answer', 'NN

In [183]:
import re
from collections import Counter, defaultdict
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

tags_stems = defaultdict(lambda: defaultdict(int))
words = []

relevant_pos = [
    "NNP",
    "NN",
    "NNS",
    "JJ",
    "VB",
    "VBG",
]

for bookmark in bookmarks_train:
    tokenized = nltk.word_tokenize(bookmark.title)
    for word, pos in nltk.pos_tag(tokenized):
        if pos in relevant_pos:
            stem = porter_stemmer.stem(word)
            if re.match(r"^[\w\d.-]+$", stem):
                for tag in bookmark.tags:
                    tags_stems[tag][stem] += 1
                words.append(stem)
            
for tag in tags_stems:
    print(tag)
    print(sorted(tags_stems[tag].items(), key=lambda k_v: k_v[1], reverse=True))

blogs
[('blog', 5), ('sound', 4), ('stuff', 3), ('read', 3), ('horror', 2), ('code', 2), ('game', 2), ('design', 2), ('obscur', 2), ('art', 2), ('mutant', 2), ('synth', 2), ('moinsound', 1), ('My', 1), ('new', 1), ('algorithm', 1), ('creativ', 1), ('piotr', 1), ('day', 1), ('eye', 1), ('surviv', 1), ('ricbit', 1), ('strang', 1), ('celebr', 1), ('indi', 1), ('complet', 1), ('entertain', 1), ('delight', 1), ('comput', 1), ('rick', 1), ('letter', 1), ('thing', 1), ('univers', 1), ('aesthet', 1), ('fine', 1), ('jami', 1), ('discours', 1), ('toda', 1), ('bojack', 1), ('zone', 1), ('translat', 1), ('dump', 1), ('matt', 1), ('random', 1), ('artist', 1), ('histori', 1), ('float', 1), ('Ok', 1), ('zucker', 1), ('richard', 1), ('colin', 1), ('gener', 1), ('project', 1), ('styx', 1), ('techniqu', 1), ('magazin', 1), ('unpopular', 1), ('ascii', 1), ('interfac', 1), ('houaiss', 1), ('tokyo', 1), ('morti', 1), ('travelcard', 1), ('mjbshaw', 1), ('kazmierczak', 1), ('zawinski', 1), ('rekal', 1), ('qu

In [218]:
import itertools

def autotag(title, tags_stems):
    stems = []
    tokenized = nltk.word_tokenize(title)
    for word, pos in nltk.pos_tag(tokenized):
        if pos in relevant_pos:
            stem = porter_stemmer.stem(word)
            if re.match(r"^[\w\d.-]+$", stem):
                stems.append(stem)
    
    tags_points = defaultdict(int)
    
    for stem in stems:
        for tag in tags_stems:
            for t_stem, count in tags_stems[tag].items():
                if stem == t_stem:
                    tags_points[tag] += count
                    
    return list(itertools.islice(sorted(tags_points.items(), key=lambda kv: -kv[1]), 3))

for b in bookmarks_test:
    print(b.title)
    print(autotag(b.title, tags_stems))
    print(b.tags)
    print()

300 discos importantes da música brasileira
[('music', 11), ('lists', 4), ('articles', 2)]
['lists', 'music']

The cinematography of films and TV shows
[('film', 29), ('lists', 11), ('misc', 3)]
['colors', 'film']

woscilloscope
[]
['audiodev', 'music', 'viz']

Academic Earth - Free video lectures
[('design', 9), ('music', 9), ('gaming', 8)]
['compsci', 'videos']

Five Worlds
[('misc', 4), ('compsci', 4), ('illustration', 3)]
['articles', 'misc', 'programming']

99% Invisible Podcast - Sounds
[('music', 25), ('audiodev', 16), ('production', 11)]
['music', 'production']

Soundslice Tech Talk
[('people', 4), ('design', 3), ('audiodev', 2)]
['compsci']

Principle of charity
[('design', 5), ('animation', 2), ('compsci', 1)]
['philosophy']

Matthew Butterick
[('typography', 1), ('awesome', 1), ('design', 1)]
['functional', 'people', 'programming', 'typography']

Extensions • Tab collection
[('design', 7), ('.tab-collection', 5), ('gallery', 4)]
['.tab-collection', 'private']

Build Your Own

[('music', 66), ('audiodev', 29), ('books', 28)]
['music']

Ovens - Triple LP
[]
['music', 'rym']

Acapela group - Voice synthesis - TTS
[('tcc', 13), ('audiodev', 10), ('music', 6)]
['tcc']

C++ Frequently Questioned Answers
[('compsci', 3), ('music', 1), ('gamedev', 1)]
['cpp', 'proglangs', 'programming']

Contracampo's Top Brazilian Cinema
[('design', 5), ('lists', 4), ('music', 4)]
['film']

Semantic UI
[('programming', 3), ('awesome', 1), ('webdev', 1)]
['design']

Apple needs to tackle digital addiction
[('design', 6), ('audiodev', 5), ('programming', 3)]
['articles']

Big Cartoon DB
[('design', 4), ('compsci', 3), ('3d', 2)]
['animation', 'film']

The Line Animation — Work
[('animation', 13), ('film', 12), ('compsci', 9)]
['animation', 'people']

Introduction to QT programming - YouTube
[('compsci', 32), ('programming', 20), ('proglangs', 10)]
['compsci']

MindLab
[]
['music']

V&A Illustration Awards
[('illustration', 16), ('design', 8), ('people', 7)]
['awesome', 'gallery', 'i

[('compsci', 9), ('history', 7), ('design', 5)]
['history', 'nix']

Finagle's law
[('wikipedia', 1), ('misc', 1), ('ux', 1)]
['misc', 'wikipedia']

Computer Stupidities
[('compsci', 34), ('books', 6), ('awesome', 6)]
['compsci', 'haha']

Introduction to Microservices
[('compsci', 8), ('ai', 2), ('algorithms', 2)]
['private', 'programming']

Inmetro - Grafia de unidades
[]
['language', 'maths']

Squircle
[]
['wikipedia']

TalComponent.cpp
[]
['audiodev', 'programming']

Quincy Jones in conversation
[('film', 1), ('tweet', 1), ('misc', 1)]
['haha', 'interviews', 'music']

Asemic writing
[('programming', 2), ('compsci', 2), ('birmingham', 1)]
['wikipedia']

Popular Stacks
[('webdev', 4), ('music', 1), ('typography', 1)]
['compsci', 'webdev']

Swiss Army Man - Official Movie Site
[('film', 15), ('lists', 6), ('design', 6)]
['awesome', 'graphics', 'webdev']

IMDb: Math films
[('film', 26), ('lists', 10), ('compsci', 8)]
['film', 'maths']

Marpi: WebGL demos
[('webdev', 4), ('graphics', 2), 

[('compsci', 36), ('books', 34), ('programming', 23)]
['compsci']

Tweetable Mathematical Art
[('design', 24), ('art', 19), ('illustration', 13)]
['compsci']

Thinking Machines: Art and Design in the Computer Age, 1959–1989 | MoMA
[('design', 85), ('compsci', 61), ('art', 25)]
['art', 'compsci', 'design', 'history']

Marta Salogni
[]
['people', 'production']

How To Add A Security Key To Your Gmail
[('private', 6), ('music', 5), ('programming', 4)]
['infosec']

Animated Bézier
[('compsci', 1), ('graphics', 1)]
['compsci', 'maths']

NanaOn-Sha
[]
['gaming']

The Killing of a Sacred Deer and the Bleeding Edge of Poster Design
[('design', 63), ('compsci', 13), ('people', 11)]
['design', 'film']

I made a camera that prints a GIF instantly
[('awesome', 2), ('gamedev', 2), ('programming', 2)]
['awesome', 'compsci']

Steve Losh ScrollSpy
[('music', 1), ('typography', 1), ('programming', 1)]
['site']

Convert a Graphics2D to an Image or BufferedImage
[('gallery', 3), ('misc', 3), ('compsci', 

[('webdev', 4), ('design', 3), ('misc', 2)]
['awesome', 'css']

Google Visual Assets Guidelines
[('viz', 17), ('compsci', 16), ('design', 11)]
['design']

Jaga Jazzist Family playlist
[('typography', 4), ('music', 2), ('design', 1)]
['music']

What Should I Read Next? Book recommendations from readers like you
[('books', 36), ('compsci', 20), ('design', 13)]
['books']

Fluid Simulation (with WebGL demo)
[('webdev', 4), ('graphics', 3), ('programming', 2)]
['compsci', 'graphics', 'maths', 'programming']

Brian Reitzell's Studio: A Photo Essay
[('music', 9), ('production', 9), ('design', 7)]
['music', 'people', 'photography', 'production']

Brutalist Websites
[('webdev', 2), ('programming', 1), ('tweet', 1)]
['art', 'design', 'webdev']

C++ Developer jobs in London, average salaries and trends
[('.jobs', 18), ('private', 16), ('programming', 12)]
['.jobs', 'cpp', 'private']

Tone.js
[]
['audiodev', 'js', 'webdev']

Information is Beautiful Awards
[('compsci', 6), ('film', 4), ('design', 

[('tcc', 3), ('music', 3), ('papers', 3)]
['articles', 'haha', 'music']

Vadik Marmeladov
[]
['design', 'people']

Harry F. Olson
[('people', 1), ('webdev', 1), ('art', 1)]
['audiodev', 'people']

TodoMVC
[]
['compsci']

More PS examples
[('birmingham', 4), ('private', 4), ('compsci', 3)]
['birmingham', 'private']

How To Scroll
[]
['compsci']

A Private Jazz Pod in a Town House | Noise reduction
[('music', 14), ('jazz', 8), ('music-theory', 2)]
['acoustics', 'jazz', 'music']

Dillon Marsh Photography
[('people', 9), ('design', 8), ('photography', 7)]
['design']

Home — Google Web Fundamentals
[('compsci', 12), ('design', 12), ('webdev', 10)]
['design']

Scriptographer.org
[]
['design']

888 (number)
[('maths', 2), ('haha', 2), ('music', 1)]
['maths', 'wikipedia']

Font Library (open fonts!)
[('typography', 39), ('design', 30), ('compsci', 13)]
['design', 'typography']

Nick's Pedals
[('music', 3), ('guitar', 1), ('people', 1)]
['guitar', 'music']

Susan Kare
[('illustration', 4), ('de