In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the '::'-delimited data files. pandas only supports a multi-character
# separator through its python parsing engine, hence engine='python'.
users = pandas.read_csv(
    './users.dat',
    sep='::',
    engine='python',
    names=['userid', 'gender', 'age', 'occupation', 'zip'],
).set_index('userid')

ratings = pandas.read_csv(
    './ratings.dat',
    engine='python',
    sep='::',
    names=['userid', 'movieid', 'rating', 'timestamp'],
)

# The movie files are read as latin-1; index_col=False stops pandas from
# using the first column as the index before 'movieid' is set explicitly.
movies_train = pandas.read_csv(
    './movies_train.dat',
    engine='python',
    sep='::',
    names=['movieid', 'title', 'genre'],
    encoding='latin-1',
    index_col=False,
).set_index('movieid')

movies_test = pandas.read_csv(
    './movies_test.dat',
    engine='python',
    sep='::',
    names=['movieid', 'title', 'genre'],
    encoding='latin-1',
    index_col=False,
).set_index('movieid')

# Genres are stored as one '|'-separated string; turn each into a list of labels.
movies_train['genre'] = movies_train['genre'].str.split('|')
movies_test['genre'] = movies_test['genre'].str.split('|')

In [3]:
# Store the discrete user attributes and the rating keys as pandas categoricals.
for column in ('age', 'gender', 'occupation'):
    users[column] = users[column].astype('category')
for column in ('movieid', 'userid'):
    ratings[column] = ratings[column].astype('category')


In [4]:
# Reshape the long ratings table into a movie x user matrix:
# one row per movieid, one column per userid, 0 where no rating exists.
movie_to_user = (
    ratings
    .pivot(index='movieid', columns='userid', values='rating')
    .fillna(0)
)
movie_to_user

userid,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Convert the movie x user ratings to a scipy CSR sparse matrix;
# only the non-zero entries (actual ratings) are stored.
movie_to_user_sparse = csr_matrix(movie_to_user.to_numpy())
movie_to_user_sparse

<3706x6040 sparse matrix of type '<class 'numpy.float64'>'
	with 1000209 stored elements in Compressed Sparse Row format>

Fitting a KNN model:

In [6]:
# Brute-force nearest-neighbour search over the movie rows, using cosine distance.
# fit() returns the estimator itself, so construction and fitting can be chained.
knn_movie_model = NearestNeighbors(algorithm='brute', metric='cosine').fit(movie_to_user_sparse)
knn_movie_model

In [7]:
def get_movie_name(movieid):
    """Return the title of a movie, looking in the training set first.

    If ``movieid`` is not in ``movies_train`` the lookup falls back to
    ``movies_test``; a miss there propagates the resulting ``KeyError``.
    """
    try:
        title = movies_train.loc[movieid].title
    except KeyError:
        # Not a training movie, so it has to come from the test set.
        title = movies_test.loc[movieid].title
    return title

In [8]:
def get_similar_movies(movieid, n = 15):
    """Find the movies whose rating vectors are closest to a given movie.

    Parameters
    ----------
    movieid
        Row label of the query movie in ``movie_to_user``.
    n : int, default 15
        Number of similar movies wanted *in addition to* the query movie.
        It is capped so that the request never exceeds the number of rows
        in ``movie_to_user``.

    Returns
    -------
    distances, indices
        The two arrays returned by ``knn_movie_model.kneighbors`` for the
        single query row (``n + 1`` neighbours after capping). ``indices``
        holds row positions in ``movie_to_user``; translate them to movie
        ids with ``movie_to_user.index[...]``. The query movie is normally
        among the results because it is one of the fitted rows.

    Raises
    ------
    KeyError
        If ``movieid`` is not a row label of ``movie_to_user``.
    """
    knn_input = np.asarray(movie_to_user.loc[movieid]).reshape(1, -1)
    # One extra neighbour is requested below to make room for the query movie
    # itself, so n may be at most (number of rows - 1); kneighbors raises a
    # ValueError when asked for more neighbours than there are fitted rows.
    n = min(n, movie_to_user.shape[0] - 1)
    distances, indices = knn_movie_model.kneighbors(knn_input, n_neighbors=n + 1)
    return distances, indices

Predict a movie's genres from the genres of its most similar movies.

In [9]:
def predict_genres(movieid, n = 15):
    """Predict up to three genres for a movie from its nearest neighbours.

    The genres of the (at most) ``n`` most similar movies that belong to
    ``movies_train`` are pooled and counted. Genres that occur more than half
    as often as the most frequent one are kept, and the three most frequent of
    those are returned.

    Parameters
    ----------
    movieid
        Row label of the movie in ``movie_to_user``.
    n : int, default 15
        Number of similar movies to consult.

    Returns
    -------
    list
        Predicted genre labels, most frequent first; empty when none of the
        neighbours is a training movie.
    """
    _, indices = get_similar_movies(movieid, n)
    neighbour_ids = movie_to_user.index[indices.flatten()]
    # Drop the query movie by id rather than by position: when other movies
    # tie with it at the same distance it is not guaranteed to come back first.
    neighbour_ids = [mid for mid in neighbour_ids if mid != movieid][:n]

    genres = []
    for mid in neighbour_ids:
        # Only training movies may contribute labels (test genres are unknown).
        if mid in movies_train.index:
            genres.extend(movies_train.loc[mid].genre)

    if not genres:
        # No labelled neighbour at all: nothing to vote on.
        return []

    counts = pandas.Series(genres).value_counts()
    threshold = counts.max() / 2
    return counts[counts > threshold].head(3).index.tolist()

In [10]:
# Keep only the test movies that appear in the ratings table; a movie without
# ratings has no row in movie_to_user and so cannot be looked up.
# .copy() makes the filtered frame independent, so adding a column below
# does not trigger pandas' SettingWithCopyWarning.
movies_test = movies_test[movies_test.index.isin(ratings.movieid)].copy()

# Predict genres for every remaining test movie. A list comprehension keeps
# each prediction as a plain list and assigns the values positionally.
movies_test['predicted_genres'] = [predict_genres(movieid) for movieid in movies_test.index]
movies_test.head()

Unnamed: 0_level_0,title,genre,predicted_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]","[Comedy, Children's]"
2067,Doctor Zhivago (1965),"[Drama, Romance, War]","[Drama, Romance, War]"
2651,Frankenstein Meets the Wolf Man (1943),[Horror],[Horror]
2989,For Your Eyes Only (1981),[Action],[Action]
3415,"Mirror, The (Zerkalo) (1975)",[Drama],[Drama]


In [11]:
# Multi-label F1: binarise true and predicted genre lists over the same
# label set, then average the per-genre F1 scores weighted by support.
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
# The label set is learned from the true genres of the test movies.
mlb.fit(movies_test['genre'])
y_true = mlb.transform(movies_test['genre'])
y_pred = mlb.transform(movies_test['predicted_genres'])
f1_score(y_true, y_pred, average='weighted')

0.7411876034165057