In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import *

# Constants and auxiliary functions

In [2]:
# Input CSV locations, given relative to the notebook's working directory.
# NOTE(review): the "ml-latest" folder name suggests a MovieLens dump — confirm the source/version.
# Movies file: each row after the header is parsed by prepare_movie_data.
MOVIE_DATA_PATH = "ml-latest/movies.csv"
# Tags file: each row after the header is parsed by prepare_tags_data.
TAGS_DATA_PATH = "ml-latest/tags.csv"
# Sentinel value of the genres field that prepare_movie_data treats as "no genres".
NO_GENRES = "(no genres listed)"

In [3]:
def prepare_movie_data(line):
    """Parse one raw line of the movies CSV into ``(id, year_of_release, genres)``.

    The line is split on commas. The first field is the movie id, the last
    field is the ``|``-separated genre list, and the second-to-last field is
    the title (or only its trailing fragment when the title itself contains
    commas; the release year is read from the last whitespace-separated token,
    so the fragment is sufficient).

    Parameters
    ----------
    line : str
        One data row, e.g. ``'1,Toy Story (1995),Adventure|Animation'``.

    Returns
    -------
    tuple
        ``(id, year_of_release, genres)``:
        ``id`` is the id field as a string; ``year_of_release`` is the
        four-digit year as a string, or ``''`` when the title does not end in
        a ``(YYYY`` token; ``genres`` is a list of genre names, empty when the
        genres field equals ``NO_GENRES`` or is blank.

    Raises
    ------
    IndexError
        If the line has fewer than two comma-separated fields.
    """
    fields = line.strip().split(',')
    movie_id, title, genres_field = fields[0], fields[-2], fields[-1]

    # Only accept the trailing token as a year when it really looks like
    # "(YYYY" (a closing ")" or CSV quote may follow); otherwise report no
    # year instead of an arbitrary slice of the title.
    title_tokens = title.split()
    last_token = title_tokens[-1] if title_tokens else ''
    candidate = last_token[1:5]
    if last_token.startswith('(') and len(candidate) == 4 and candidate.isdigit():
        year_of_release = candidate
    else:
        year_of_release = ''

    # ''.split('|') would yield [''], i.e. one bogus empty genre, so the
    # "no genres" case maps to an empty list and blank entries are dropped.
    if genres_field == NO_GENRES:
        genres = []
    else:
        genres = [genre for genre in genres_field.split('|') if genre]

    return movie_id, year_of_release, genres

def prepare_tags_data(line):
    """Turn one raw line of the tags CSV into a ``(movie_id, tag_text)`` pair.

    The line is stripped, lower-cased and split on commas. The second field
    is returned unchanged as the movie id. Everything between the second
    field and the last field is treated as the tag: the pieces are re-joined
    with single spaces (so a comma inside a tag becomes a space) and every
    character that is neither alphanumeric nor a space is removed.

    Parameters
    ----------
    line : str
        One data row, e.g. ``'14,110,epic,1443148538'``.

    Returns
    -------
    tuple of (str, str)
        The movie id field and the cleaned, lower-case tag text.
    """
    fields = line.strip().lower().split(',')
    raw_tag = ' '.join(fields[2:-1])
    kept_chars = [ch for ch in raw_tag if ch == ' ' or ch.isalnum()]
    return fields[1], ''.join(kept_chars)

def prepare_tags_dict(tags):
    """Merge ``(movie_id, tag)`` pairs into one tag string per movie.

    Parameters
    ----------
    tags : iterable of (movie_id, tag)
        ``movie_id`` must be convertible with ``int()``; ``tag`` is a string.

    Returns
    -------
    dict
        Maps each integer movie id to its tags concatenated in input order,
        separated by single spaces. Keys appear in order of first occurrence.
    """
    merged = {}
    for raw_id, tag_text in tags:
        key = int(raw_id)
        # First tag for a movie is stored as-is; later ones are appended after a space.
        merged[key] = merged[key] + " " + tag_text if key in merged else tag_text
    return merged

# Gathering the data

In [4]:
# Parse every row after the header line of the movies CSV.
# The context manager guarantees the file handle is closed once parsing ends,
# and iterating the file avoids materialising all lines in a throwaway list.
# NOTE(review): utf-8 is assumed for the input file — confirm against the data source.
with open(MOVIE_DATA_PATH, encoding="utf-8") as movies_file:
    next(movies_file, None)  # skip the header row (no-op on an empty file)
    movies_data = [prepare_movie_data(line) for line in movies_file]

In [5]:
# Parse every row after the header line of the tags CSV.
# The context manager guarantees the file handle is closed once parsing ends,
# and iterating the file avoids materialising all lines in a throwaway list.
# NOTE(review): utf-8 is assumed for the input file — confirm against the data source.
with open(TAGS_DATA_PATH, encoding="utf-8") as tags_file:
    next(tags_file, None)  # skip the header row (no-op on an empty file)
    tags_data = [prepare_tags_data(line) for line in tags_file]

In [6]:
# One merged tag string per integer movie id.
tags_dict = prepare_tags_dict(tags_data)

# Parallel lists in the dict's iteration order: tags[i] is the tag string of
# movie_ids[i]. Row i of anything fitted on `tags` therefore refers to
# movie_ids[i], not to movie id i.
movie_ids = list(tags_dict)
tags = [tags_dict[key] for key in movie_ids]

# Calculating tags similarity

In [7]:
# TF-IDF over word 1- to 3-grams of each movie's merged tag string.
# binary=True makes the term-frequency part 0/1 before IDF weighting;
# lowercase=False skips the vectorizer's own lower-casing step
# (prepare_tags_data already lower-cases each line).
vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    lowercase=False,
    binary=True,
    ngram_range=(1, 3),
)

# One row per entry of `tags`, in the same order.
tags_vectors = vectorizer.fit_transform(tags)

# All-pairs cosine similarity; with no second argument the rows are compared
# against themselves.
# NOTE(review): the result is a dense len(tags) x len(tags) array — check that
# it fits in memory for the full dataset.
cos_similarity = cosine_similarity(tags_vectors)

# Example: rows most similar to row 2571

In [8]:
# Similarity of row 2571 to every row. Rows follow the order of `tags` /
# `movie_ids`, so 2571 is a list position; the movie id for it is movie_ids[2571].
# NOTE(review): if 2571 was meant as a movie id, look up its row with
# movie_ids.index(2571) instead — confirm intent.
cos_similarity[2571]

array([ 0.00183787,  0.00112496,  0.        , ...,  0.        ,
        0.        ,  0.        ])

In [9]:
# Ten (row_index, similarity) pairs with the highest similarity to row 2571,
# best first. The row itself is expected at the top with similarity ~1.
# Indices are positions in `tags` / `movie_ids`, not movie ids.
sorted(
    enumerate(cos_similarity[2571]),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

[(2571, 0.99999999999999989),
 (7039, 0.12105372187800925),
 (8444, 0.09144529563969174),
 (8721, 0.08308096676795651),
 (8732, 0.08308096676795651),
 (8777, 0.08308096676795651),
 (10375, 0.077082572794193449),
 (22563, 0.072520828797440359),
 (22697, 0.072520828797440359),
 (20511, 0.068883788614363589)]

In [10]:
# Show the merged tag text of the query row and of rows 7039 and 8444 (the two
# closest rows in the ranking output), separated by the marker lines.
# NOTE(review): what the "binary" / "counting" marker labels refer to is not
# evident from this cell — confirm or rename.
for text in (
    tags[2571],
    "---==binary==---",
    tags[7039],
    "---==counting==---",
    tags[8444],
):
    print(text)

animation  live action interact animation insipid saccharine bobola animation james marsden russel brand slow pacing russell brand animation tim hill
---==binary==---
brendan fraser looney tunes brendan fraser steve martin animation  live action interact bad acting heterosexual live actionanimation looney tunes steve martin
---==counting==---
funny jennifer garner russell brand russell brand stupid jennifer garner russell brand funny good plot russell brand funny russel brand russel brand russel brand russell brand
