In [91]:
from collections import namedtuple
import csv
import nltk
import logging, gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

nltk.download('punkt')   # tokenizer
nltk.download('averaged_perceptron_tagger') # POS
nltk.download('wordnet') # similarity

def with_tag(tag):
    return list(filter(lambda item: tag in item.tags, bookmarks))

[nltk_data] Downloading package punkt to
[nltk_data]     /home/felipecortez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/felipecortez/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/felipecortez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [102]:
bookmarks = []
Bookmark = namedtuple('Bookmark', ['title', 'tags'])

with open('marks.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    # transforms lists as strings to real lists
    for row in reader:
        bookmarks.append(Bookmark(row[0], eval(row[1])))
        
for b in bookmarks[30:50]: print(b)

Bookmark(title='Stefan Tosheff - Illustrations', tags=['design', 'people'])
Bookmark(title='\u200eStrange films are strange (actually good)', tags=['film'])
Bookmark(title='OpenGL Reference Card', tags=['compsci', 'graphics'])
Bookmark(title='The Art of Rendering', tags=['compsci', 'graphics'])
Bookmark(title='A Singular Christmas', tags=['music'])
Bookmark(title='List of musical works released in a stem format', tags=['lists', 'music', 'wikipedia'])
Bookmark(title='Alan Ranta', tags=['music', 'people'])
Bookmark(title='James Spectrum Studio', tags=['music'])
Bookmark(title='Tools of the Trade: Zorch - Omnichords', tags=['instruments', 'music'])
Bookmark(title='How to add depth to a mix', tags=['music', 'production'])
Bookmark(title='Chord Theory for Guitar', tags=['music'])
Bookmark(title='List of Internet phenomena', tags=['lists', 'misc', 'wikipedia'])
Bookmark(title='ii-V-I: A Softly Spoken Magic Spell', tags=['music'])
Bookmark(title='Mixtape Alpha with useful links', tags=['music

In [113]:
all_tags = []
for bookmark in bookmarks:
    all_tags.extend(bookmark.tags)
    
all_tags = list(set(all_tags))
print(all_tags)

['3d', 'golang', 'ice', 'photography', 'maths', 'misc', 'vim', 'videos', 'architecture', 'food', 'functional', 'cpp', 'acoustics', 'infosec', '.apto', 'programming', 'blogs', 'sql', 'travel', 'audiodev', 'css', 'rym', '.shopping', 'birmingham', 'cah', 'interviews', 'networks', 'electronics', 'illustration', 'inpe', 'gaming', 'ux', 'now', 'private', 'c', 'later', 'papers', 'java', 'lisp', '.sp', 'python', 'awesome', 'js', 'lists', 'art', 'tex', 'max', 'clojure', '.jobs', 'grybo', 'typography', 'site', 'nlp', 'gallery', 'music-theory', 'design', 'viz', 'academia', 'hardware', 'bmarks', 'ai', 'crypto', 'nix', 'articles', '.campanha', 'assembly', 'guitar', 'dsp', 'webdev', 'data', 'production', 'colors', 'installation', 'language', 'film', 'people', 'synth', 'proglangs', 'tcc', 'jazz', 'philosophy', 'foundry', 'algorithms', 'music', '.lucio', 'instruments', 'wikipedia', 'interactive', '.unlisted', 'mix', 'gamedev', 'compsci', 'tvtropes', 'swift', 'graphics', 'books', 'history', 'ponte', '.

In [104]:
from sklearn.model_selection import train_test_split

bookmarks_train, bookmarks_test = train_test_split(bookmarks, test_size=0.20, random_state=42)

print(len(bookmarks_train))
print(len(bookmarks_test))

2380
595


In [106]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
words = []

for bookmark in with_tag("production"):
    lemmatized = [wordnet_lemmatizer.lemmatize(w.lower()) for w in bookmark.title.split()]
    words.extend(lemmatized)

In [110]:
for bookmark in with_tag("production"):
    tokenized = nltk.word_tokenize(bookmark.title)
    print(nltk.pos_tag(tokenized))

[('How', 'WRB'), ('to', 'TO'), ('add', 'VB'), ('depth', 'NN'), ('to', 'TO'), ('a', 'DT'), ('mix', 'NN')]
[('The', 'DT'), ('14', 'CD'), ('synthesizers', 'NNS'), ('that', 'WDT'), ('shaped', 'VBD'), ('modern', 'JJ'), ('music', 'NN')]
[('Beautifully', 'RB'), ('Produced', 'NNP'), ('Albums', 'NNP')]
[('Best', 'JJS'), ('music', 'NN'), ('producers', 'NNS')]
[('The', 'DT'), ('Making', 'NNP'), ('of', 'IN'), ('a', 'DT'), ('Mars', 'NNP'), ('Volta', 'NNP'), ('album', 'NN')]
[('27', 'CD'), ('best', 'JJS'), ('free', 'JJ'), ('VSTs', 'NN')]
[('Damon', 'NNP'), ('Albarn', 'NNP'), ('Studio', 'NNP'), ('13', 'CD'), ('equipment', 'NN')]
[('Sinevibes', 'NNS')]
[('Hydrophones', 'NNS'), (':', ':'), (':', ':'), ('H1a', 'NN')]
[('Schroeder', 'NNP'), ('Reverbs', 'NNP')]
[('Audio-Technica', 'NNP'), ('AT4051', 'NNP')]
[('Zach', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('Recorder', 'NNP'), ('Recommendations', 'NNS')]
[('Ivor-Mairants', 'NNS')]
[('Paulstretch', 'NN')]
[('808', 'CD'), (',', ','), ('SP1200', 'NNP'), (',', ',

[('Genelec', 'NNP'), ('8010A', 'CD'), ('3', 'CD'), ("''", "''"), ('Powered', 'VBN'), ('Studio', 'NNP'), ('Monitor', 'NNP')]
[('Synthedelia', 'NNS'), (':', ':'), ('Psychedelic', 'NNP'), ('Electronic', 'NNP'), ('Music', 'NNP'), ('in', 'IN'), ('the', 'DT'), ('1960s', 'NNS')]
[('Rogério', 'NNP'), ('Duprat', 'NNP'), ('–', 'NNP'), ('Tropicália', 'NNP')]
[('Unknown', 'IN'), ('Mortal', 'NNP'), ('Orchestra', 'NNP'), ('AMA', 'NNP')]
[('lines', 'NNS')]
[('Girl', 'NNP'), ('Turk', 'NNP'), (':', ':'), ('Mechanical', 'JJ'), ('Turk', 'NNP'), ('Meets', 'NNP'), ('Girl', 'NNP'), ('Talk', 'NNP'), ("'s", 'POS'), ('``', '``'), ('Feed', 'NNP'), ('the', 'DT'), ('Animals', 'NNS'), ("''", "''")]
[('Return', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Obra', 'NNP'), ('Dinn', 'NNP'), ('-', ':'), ('Sound', 'NN')]
[('Klanghelm', 'NNP'), ('SDRR', 'NNP')]


In [111]:
from collections import Counter
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

words = []

relevant_pos = [
    "NNP",
    "NN",
    "NNS",
    "JJ",
    "VB",
    "VBG",
]

for bookmark in bookmarks:
    tokenized = nltk.word_tokenize(bookmark.title)
    for word, pos in nltk.pos_tag(tokenized):
        if pos in relevant_pos:
            words.append( porter_stemmer.stem(word))
            
most_freq = Counter(words)
print(most_freq.most_common(50))

[('music', 112), ('design', 105), ('game', 92), ('|', 77), ('art', 70), ('program', 61), ('comput', 57), ('book', 57), ('’', 53), ('sound', 49), ('guid', 48), ('list', 43), ('learn', 41), ('googl', 37), ('visual', 37), ('anim', 37), ('audio', 37), ('languag', 36), ('python', 35), ('font', 33), ('film', 31), ('algorithm', 31), ('code', 30), ('world', 30), ('make', 30), ('–', 30), ('web', 28), ('develop', 27), ('graphic', 26), ('free', 26), ('use', 24), ('engin', 24), ('illustr', 23), ('interview', 23), ('tutori', 23), ('type', 22), ('synthesi', 21), ('—', 21), ('instrument', 21), ('color', 21), ('project', 21), ('movi', 20), ('video', 20), ('scienc', 20), ('album', 20), ('new', 20), ('machin', 20), ('jazz', 19), ('data', 19), ('guitar', 19)]
