### Load tag genome tags

In [1]:
import pandas as pd

In [2]:
unique_tags = pd.read_csv("./unique_tags.csv")

In [3]:
# Drop the unnamed leading CSV column (presumably a saved index - confirm).
# errors="ignore" keeps this cell safe to re-run once the column is already gone;
# without it a second run raises KeyError.
unique_tags = unique_tags.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
unique_tags_list = unique_tags.tags.tolist()

### Load all tags

In [33]:
tag_score = pd.read_csv("./tag_with_scores.csv")

In [34]:
# Remove the columns that the tag analysis below does not use.
# errors="ignore" makes the in-place drop repeatable: a second run of this cell
# would otherwise raise KeyError because the columns no longer exist.
tag_score.drop(columns=['Unnamed: 0', 'movieId'], inplace=True, errors="ignore")

In [35]:
tag_score.dropna(inplace=True)

In [36]:
tag_list = tag_score['tag'].tolist()

### Use word2vec to identify synonyms

In [9]:
from __future__ import print_function
from os.path import dirname
import os,sys,inspect

# Directory of the source file backing the currently executing frame.
current_dir = dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
# Three levels above current_dir; made importable below.
manage_dir = dirname(dirname(dirname(current_dir)))

# Put manage_dir at the front of the import path, once.
if manage_dir not in sys.path:
    sys.path.insert(0, manage_dir)

# from pyml4.common import context, db
# context.local_context.print_summary()

import gensim
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import logging
import nltk
import nltk.data
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
import pymysql.cursors

In [10]:
word_vec_model2 = Word2Vec.load('Word2Vec_fullds_model2')

In [8]:
len(word_vec_model2.wv.vocab)

65012

In [20]:
word_vec_model2.wv.similar_by_word("happy")

[('happier', 0.27740973234176636),
 ('happily', 0.2672647535800934),
 ('happiest', 0.22986197471618652),
 ('upbeat', 0.22156524658203125),
 ('glad', 0.20897459983825684),
 ('optimistic', 0.20554432272911072),
 ('hopeful', 0.20425133407115936),
 ('unhappy', 0.19973137974739075),
 ('cheerful', 0.19736379384994507),
 ('awww', 0.18560005724430084)]

In [9]:
# Map each tag that is in the word2vec vocabulary to those of its most similar
# words that are themselves tags.
#
# A set gives constant-time membership checks; testing `k in tag_list` for
# every neighbour rescanned the whole list and made this loop quadratic in the
# number of tags.
tag_set = set(tag_list)
synonym = {}

for tag in tag_list:
    if tag not in word_vec_model2.wv.vocab:
        continue
    neighbours = word_vec_model2.wv.similar_by_word(tag)
    # setdefault + extend preserves the accumulate-per-occurrence behaviour if
    # the same tag shows up more than once in tag_list.
    synonym.setdefault(tag, []).extend(
        word for word, _similarity in neighbours if word in tag_set
    )
            

### Use stemming to identify potential similar tags

In [21]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [22]:
ps = PorterStemmer()

In [110]:
# Group the tags by their Porter stem: stem -> list of tags, in tag_list order.
stem_result = {}
# Starts empty; a later cell fills it with the tags whose stem group has
# more than one entry.
dup_words = []

for word in tag_list:
    stem_result.setdefault(ps.stem(word), []).append(word)

In [111]:
stem_result["zombi"]

['zombies', 'zombie', 'Zombie', 'Zombies', 'zombis']

In [54]:
# Collect every tag whose stem group contains more than one entry.
# The list is rebuilt here rather than appended to a list created in another
# cell, so re-running this cell no longer duplicates every entry.
dup_words = [
    word
    for same_stem_tags in stem_result.values()
    if len(same_stem_tags) > 1
    for word in same_stem_tags
]

In [56]:
len(dup_words)

17743

In [59]:
import csv

# Write the collected tags to disk: one tag per row, every field quoted.
with open('./dup_tags_among_all.csv', 'w') as out_file:
    writer = csv.writer(out_file, quoting=csv.QUOTE_ALL)
    writer.writerows([tag] for tag in dup_words)

#### Add tag stems as a new column to tag_score

In [109]:
# One Porter stem per tag, in the same order as tag_list.
tag_stems = [ps.stem(word) for word in tag_list]

 

In [37]:
tag_score["tag_stem"] = tag_stems

In [100]:
tag_score_sorted = tag_score.sort_values(by=["tag_stem","score"],ascending=False)

In [93]:
# Keep only the tags with score >= 2. The explicit .copy() yields an independent
# frame, so the later in-place drop_duplicates on tag_score_sorted_small cannot
# touch (or warn about) tag_score_sorted.
tag_score_sorted_small = tag_score_sorted[tag_score_sorted["score"]>=2].copy()

In [112]:
## For each tag, find the other tags that share its stem.
## The result is stored on tag_score_sorted as the 'tags_same_stem' column.

tag_same_stem = []

# Columns are read by name. Positional access (row[0] / row[7]) silently picks
# up the wrong field when the column order changes, and integer keys on a
# label-indexed Series are deprecated in recent pandas releases.
for tag, stem in zip(tag_score_sorted["tag"], tag_score_sorted["tag_stem"]):
    # Every other tag in the same stem group, each followed by ';'.
    joined = "".join(
        other + ";" for other in stem_result.get(stem, []) if other != tag
    )
    # None (rather than an empty string) marks a tag with no same-stem sibling.
    tag_same_stem.append(joined if joined else None)



In [113]:
tag_score_sorted["tags_same_stem"] = tag_same_stem

In [114]:
tag_score_sorted.count()

tag                      72776
numApps                  72776
numPositive              72776
numNeutral               72776
numNegative              72776
numDownvotes             72776
score                    72776
tag_stem                 72776
word2vec_similar_tags     9853
tags_same_stem           17743
dtype: int64

#### Exclude tags with scores < 2 (bad tags)

In [124]:
tag_score_sorted_small = tag_score_sorted[tag_score_sorted["score"]>=2].copy()

In [125]:
tag_score_sorted_small.count()

tag                      25305
numApps                  25305
numPositive              25305
numNeutral               25305
numNegative              25305
numDownvotes             25305
score                    25305
tag_stem                 25305
word2vec_similar_tags     6212
tags_same_stem           10169
dtype: int64

In [126]:
tag_score_sorted_small.drop_duplicates("tag_stem",keep="first",inplace=True)

In [127]:
tag_score_sorted_small

Unnamed: 0,tag,numApps,numPositive,numNeutral,numNegative,numDownvotes,score,tag_stem,word2vec_similar_tags,tags_same_stem
14598,Érotique,4,0,4,0,0,4.0,érotiqu,,
19808,Å¡Åuk,4,0,4,0,2,2.0,å¡åuk,,
19750,Ángel Salazar,2,1,1,0,0,2.0,ángel salazar,,
11223,Álex de la Iglesia,10,2,8,0,5,5.0,álex de la iglesia,,
9402,zurich,7,0,7,0,0,7.0,zurich,switzerland;istanbul;heidelberg;amsterdam;paris;,Zurich;
18433,zulu,3,1,2,0,0,3.0,zulu,algiers;bataan;waterloo;carthage;,
22151,Zosia Mamet,2,0,1,1,0,2.0,zosia mamet,,
19787,zorro,2,0,2,0,0,2.0,zorro,swashbuckling;telenovela;swashbuckler;swordsman;,Zorro;
18434,zoophilia,4,1,2,1,1,3.0,zoophilia,,
19965,zoologist,2,0,2,0,0,2.0,zoologist,archaeologist;paleontologist;biologist;scienti...,


### Use word2vec to identify similar tags

In [9]:
from __future__ import print_function
from os.path import dirname
import os,sys,inspect

# Directory of the source file backing the currently executing frame.
current_dir = dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
# Three levels above current_dir; made importable below.
manage_dir = dirname(dirname(dirname(current_dir)))

# Put manage_dir at the front of the import path, once.
if manage_dir not in sys.path:
    sys.path.insert(0, manage_dir)

# from pyml4.common import context, db
# context.local_context.print_summary()

import gensim
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import logging
import nltk
import nltk.data
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
import pymysql.cursors

In [10]:
word_vec_model2 = Word2Vec.load('Word2Vec_fullds_model2')

In [8]:
len(word_vec_model2.wv.vocab)

65012

In [20]:
word_vec_model2.wv.similar_by_word("happy")

[('happier', 0.27740973234176636),
 ('happily', 0.2672647535800934),
 ('happiest', 0.22986197471618652),
 ('upbeat', 0.22156524658203125),
 ('glad', 0.20897459983825684),
 ('optimistic', 0.20554432272911072),
 ('hopeful', 0.20425133407115936),
 ('unhappy', 0.19973137974739075),
 ('cheerful', 0.19736379384994507),
 ('awww', 0.18560005724430084)]

In [9]:
# Map each tag that is in the word2vec vocabulary to those of its most similar
# words that are themselves tags.
#
# A set gives constant-time membership checks; testing `k in tag_list` for
# every neighbour rescanned the whole list and made this loop quadratic in the
# number of tags.
tag_set = set(tag_list)
synonym = {}

for tag in tag_list:
    if tag not in word_vec_model2.wv.vocab:
        continue
    neighbours = word_vec_model2.wv.similar_by_word(tag)
    # setdefault + extend preserves the accumulate-per-occurrence behaviour if
    # the same tag shows up more than once in tag_list.
    synonym.setdefault(tag, []).extend(
        word for word, _similarity in neighbours if word in tag_set
    )
            

In [45]:
# For every row of tag_score, collect the word2vec neighbours of its tag that
# are themselves tags, joined as "a;b;" (None when there are none).
similar_tags = []
# Set lookup replaces the per-neighbour scan of tag_list.
known_tags = set(tag_list)

# Read the tag by column name instead of by position (row[0]), which depends on
# column order and is deprecated on a label-indexed Series.
for tag in tag_score["tag"]:
    joined = ''
    if tag in word_vec_model2.wv.vocab:
        joined = "".join(
            word + ";"
            for word, _similarity in word_vec_model2.wv.similar_by_word(tag)
            if word in known_tags
        )
    similar_tags.append(joined if joined else None)

In [48]:
len(similar_tags)

72776

In [49]:
tag_score['word2vec_similar_tags'] = similar_tags

In [50]:
tag_score.head()

Unnamed: 0,tag,numApps,numPositive,numNeutral,numNegative,numDownvotes,score,tag_stem,word2vec_similar_tags
0,sci-fi,9094,6887,2083,124,1286,7808.0,sci-fi,
1,atmospheric,6127,5612,487,28,763,5364.0,atmospher,eerie;atmosphere;evocative;moody;stylish;suspe...
2,surreal,5115,4313,677,125,554,4561.0,surreal,surrealistic;nightmarish;dreamlike;bizarre;sur...
3,action,6006,3740,2030,236,1519,4487.0,action,suspense;suspenseful;pace;slapstick;drama;tens...
4,twist ending,4666,4081,477,108,589,4077.0,twist end,


### Tags 1 edit away

#### Find tags 1 edit away for each tag; append to tag_score table

In [53]:
# For every tag, list the other tags that are at most one edit away, joined as
# "a;b;" ('' when there are none). one_edit[i] corresponds to tag_list[i], so
# the result has exactly len(tag_list) entries and can be attached to tag_score.
#
# Requires isEditDistanceOne to be defined already (run that cell first).
# NOTE: the pairwise scan is quadratic in len(tag_list) and slow on large lists.
near_tags = [[] for _ in tag_list]

# Start at index 0 so the first tag is examined too.
for i, tag1 in enumerate(tag_list):
    # Each unordered pair is compared once ...
    for j in range(i + 1, len(tag_list)):
        tag2 = tag_list[j]
        if isEditDistanceOne(tag1, tag2):
            # ... and recorded on both sides, so a tag also lists neighbours
            # that appear before it in tag_list.
            near_tags[i].append(tag2)
            near_tags[j].append(tag1)

one_edit = ["".join(tag + ";" for tag in matches) for matches in near_tags]

KeyboardInterrupt: 

In [None]:
one_edit

In [None]:
len(one_edit)

### Old stuff

In [11]:
# Collect both members of every pair of tags that are at most one edit apart.
# A tag is appended once per pair it takes part in, so it can appear repeatedly.
# Requires isEditDistanceOne to be defined already (run that cell first).
duplicates = []

# Start at index 0 so the first tag is compared as well.
for i, tag1 in enumerate(tag_list):
    for j in range(i + 1, len(tag_list)):
        tag2 = tag_list[j]
        if isEditDistanceOne(tag1, tag2):
            duplicates.append(tag1)
            duplicates.append(tag2)

        
        

In [None]:
duplicates

In [14]:
import csv

# Write the collected tags to disk: one tag per row, every field quoted.
with open('./tag_1_edit_away.csv', 'w') as out_file:
    writer = csv.writer(out_file, quoting=csv.QUOTE_ALL)
    writer.writerows([tag] for tag in duplicates)

### A function that checks whether two strings are at most one edit apart (one inserted, deleted, or substituted character)

In [52]:
def isEditDistanceOne(s1, s2):
    """Return True when s1 and s2 are at most one edit apart.

    An edit is inserting, deleting, or substituting a single character.
    Identical strings count as a match (zero edits).

    Parameters
    ----------
    s1, s2 : str
        The two strings to compare.

    Returns
    -------
    bool
        True if zero or one edit turns s1 into s2, otherwise False.
    """
    # Work with (shorter, longer) so only one insertion case has to be handled.
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    # Lengths that differ by two or more need at least two edits.
    if len(longer) - len(shorter) > 1:
        return False

    # Advance past the common prefix.
    pos = 0
    while pos < len(shorter) and shorter[pos] == longer[pos]:
        pos += 1

    if len(shorter) == len(longer):
        # Same length: allow one substitution at pos; the tails must agree.
        return shorter[pos + 1:] == longer[pos + 1:]

    # Length differs by one: skip the extra character in the longer string.
    return shorter[pos:] == longer[pos + 1:]
