### Load unique tags

In [1]:
import pandas as pd

In [2]:
unique_tags = pd.read_csv("./unique_tags.csv")

In [3]:
unique_tags = unique_tags.drop("Unnamed: 0",axis=1)

### Load word2vec model 

In [4]:
import gensim
from gensim.models import Word2Vec

In [5]:
word_vec_model2 = Word2Vec.load('Word2Vec_fullds_model2')

In [17]:
len(word_vec_model2.wv.vocab)

65012

In [22]:
word_vec_model2.wv.most_similar("happy")

[('happier', 0.27740973234176636),
 ('happily', 0.2672647535800934),
 ('happiest', 0.22986197471618652),
 ('upbeat', 0.22156524658203125),
 ('glad', 0.20897459983825684),
 ('optimistic', 0.20554432272911072),
 ('hopeful', 0.20425133407115936),
 ('unhappy', 0.19973137974739075),
 ('cheerful', 0.19736379384994507),
 ('awww', 0.18560005724430084)]

In [6]:
word_vec_model2.wv.vocab

{'big': <gensim.models.keyedvectors.Vocab at 0x7f2874f65c18>,
 'fan': <gensim.models.keyedvectors.Vocab at 0x7f2874f65cf8>,
 'animate': <gensim.models.keyedvectors.Vocab at 0x7f2874f65d68>,
 'movies': <gensim.models.keyedvectors.Vocab at 0x7f2874f65dd8>,
 'come': <gensim.models.keyedvectors.Vocab at 0x7f2874f65e48>,
 'pixar': <gensim.models.keyedvectors.Vocab at 0x7f2874f65eb8>,
 'studios': <gensim.models.keyedvectors.Vocab at 0x7f2874f65f28>,
 'always': <gensim.models.keyedvectors.Vocab at 0x7f2874f65f98>,
 'look': <gensim.models.keyedvectors.Vocab at 0x7f2874f71048>,
 'newest': <gensim.models.keyedvectors.Vocab at 0x7f2874f710b8>,
 'technological': <gensim.models.keyedvectors.Vocab at 0x7f2874f710f0>,
 'possibilities': <gensim.models.keyedvectors.Vocab at 0x7f2874f71128>,
 'use': <gensim.models.keyedvectors.Vocab at 0x7f2874f71198>,
 'create': <gensim.models.keyedvectors.Vocab at 0x7f2874f71208>,
 'worth': <gensim.models.keyedvectors.Vocab at 0x7f2874f71278>,
 'watch': <gensim.models

In [14]:
similar_terms = {}
items_no_neighbors = []

In [15]:
for index, row in unique_tags.iterrows():
    tag = row['tags']
    if tag in word_vec_model2.wv.vocab:
        similar_terms[tag] = word_vec_model2.wv.most_similar(tag)
    else:
        items_no_neighbors.append(tag)

In [16]:
items_no_neighbors

['007 (series)',
 '18th century',
 '19th century',
 '9/11',
 'aardman studios',
 'action packed',
 'adapted from:book',
 'adapted from:comic',
 'adapted from:game',
 'afi 100',
 'afi 100 (laughs)',
 'afi 100 (movie quotes)',
 'aging',
 'aids',
 'alien invasion',
 'aliens',
 'alone in the world',
 'alter ego',
 'alternate endings',
 'alternate history',
 'alternate reality',
 'alternate universe',
 'amazing cinematography',
 'amazing photography',
 'american civil war',
 'amy smart',
 'android(s)/cyborg(s)',
 'animal movie',
 'animated',
 'anti-hero',
 'anti-semitism',
 'anti-war',
 'arms dealer',
 'art house',
 'artificial intelligence',
 'author:alan moore',
 'author:neil gaiman',
 'awesome soundtrack',
 'bad acting',
 'bad cgi',
 'bad ending',
 'bad plot',
 'bad science',
 'bad script',
 'bad sequel',
 'bank robbery',
 'based on a book',
 'based on a comic',
 'based on a play',
 'based on a true story',
 'based on a tv show',
 'based on a video game',
 'based on book',
 'based on com

### TODO: Identify similar tags for those not in word_vec_model

### Load previous tag classes

In [32]:
tag_classes = pd.read_csv("./tag_classes.csv")

tag_classes = tag_classes.drop("Unnamed: 0",axis=1)

In [42]:
tag_classes.tail(10)

Unnamed: 0,tag,tagClass
3100,young kids,facts
3101,younger men,facts
3102,your out of ur element donnie!!!!,other
3103,youth,facts
3104,zack snyder,facts
3105,zero 7,other
3106,zombie,facts
3107,zombies,facts
3108,zoo,facts
3109,[ok],subjective


In [68]:
tag_classes.groupby("tagClass").count()

Unnamed: 0_level_0,tag
tagClass,Unnamed: 1_level_1
facts,1862
other,183
personal,99
subjective,966


#### Categorize tags in tag genome based on tag_classes dataset (from 2006)

In [110]:
#tag_classes[tag_classes.tag == "007"].iloc[1]

result = tag_classes.loc[tag_classes.tag == "007","tagClass"]
result.values[0]

'facts'

In [111]:
tag_class_col = []

for index, row in unique_tags.iterrows():
    aTag = row['tags']
    if any(tag_classes.tag == aTag):
        print(aTag)
        tag_class_col.append(tag_classes.loc[tag_classes.tag == aTag,"tagClass"].values[0])
    else:
        tag_class_col.append("TBD")

007
18th century
1920s
1930s
1950s
1960s
1970s
1980s
19th century
80s
aardman
abortion
action
adaptation
addiction
adolescence
adoption
adultery
adventure
africa
aging
aids
airport
alcatraz
alcoholism
alien
aliens
alternate endings
alternate reality
alternate universe
amazing cinematography
amazing photography
american civil war
amnesia
androids
animal movie
animals
animated
animation
anime
antarctica
anti-semitism
archaeology
arms dealer
arnold
art
art house
artistic
artsy
assassin
assassination
assassins
astronauts
australia
australian
autism
aviation
awesome
awful
bad acting
ballet
baseball
based on a book
based on a true story
based on a tv show
based on book
based on comic
basketball
batman
bdsm
beatles
beautiful
beer
best of 2005
best war films
better than expected
biblical
biographical
biography
biopic
birds
bittersweet
black and white
black comedy
blaxploitation
blindness
blood
bloody
boarding school
boat
bond
book
book was better
books
boring
boston
bowling
boxing
brainwashing

In [112]:
len(tag_class_col)

1128

In [113]:
unique_tags['tag_class']=tag_class_col

In [115]:
unique_tags.to_csv("./unique_tags_classes.csv")