<a href="https://colab.research.google.com/github/21020673/movie-genre-prediction/blob/main/knn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [None]:
# Install gdown into the notebook environment, then import it.
!pip install gdown
import gdown
# Google Drive link to the dataset archive, and the local file name to save it under.
url = 'https://drive.google.com/u/0/uc?id=1YR6UGMjADg9ygjXDrpZNVpHst9Gk4OqM&export=download'
output = 'ml1m.zip'

# Fetch the archive (with progress output), then unpack it in the working directory;
# `-o` overwrites already-extracted files without prompting so the cell can be re-run.
gdown.download(url, output, quiet=False)
!unzip -o ml1m.zip

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1


Downloading...
From (uriginal): https://drive.google.com/u/0/uc?id=1YR6UGMjADg9ygjXDrpZNVpHst9Gk4OqM&export=download
From (redirected): https://drive.google.com/uc?id=1YR6UGMjADg9ygjXDrpZNVpHst9Gk4OqM&export=download&confirm=t&uuid=d2c25fa9-6ca3-44fa-bc51-d96cb4746fad
To: /kaggle/working/ml1m.zip
100%|██████████| 105M/105M [00:01<00:00, 63.1MB/s] 


Archive:  ml1m.zip
   creating: content/dataset/
  inflating: content/dataset/movies_test.dat  
  inflating: content/dataset/users.dat  
  inflating: content/dataset/genres.txt  
  inflating: content/dataset/ratings.dat  
  inflating: content/dataset/movies_train.dat  
   creating: content/dataset/ml1m-images/
  inflating: content/dataset/ml1m-images/2816.jpg  
  inflating: content/dataset/ml1m-images/1043.jpg  
  inflating: content/dataset/ml1m-images/941.jpg  
  inflating: content/dataset/ml1m-images/2.jpg  
  inflating: content/dataset/ml1m-images/160.jpg  
  inflating: content/dataset/ml1m-images/1201.jpg  
  inflating: content/dataset/ml1m-images/276.jpg  
  inflating: content/dataset/ml1m-images/2119.jpg  
  inflating: content/dataset/ml1m-images/599.jpg  
  inflating: content/dataset/ml1m-images/2829.jpg  
  inflating: content/dataset/ml1m-images/3628.jpg  
  inflating: content/dataset/ml1m-images/1867.jpg  
  inflating: content/dataset/ml1m-images/924.jpg  
  inflating: content

In [None]:
# Load the users, ratings and movie tables. Every file uses the two-character '::'
# delimiter, so each read goes through the Python parsing engine.
users = pandas.read_csv(
    './content/dataset/users.dat',
    sep='::',
    engine='python',
    names=['userid', 'gender', 'age', 'occupation', 'zip'],
).set_index('userid')

ratings = pandas.read_csv(
    './content/dataset/ratings.dat',
    sep='::',
    engine='python',
    names=['userid', 'movieid', 'rating', 'timestamp'],
)

movies_train = pandas.read_csv(
    './content/dataset/movies_train.dat',
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['movieid', 'title', 'genre'],
    index_col=False,
).set_index('movieid')

movies_test = pandas.read_csv(
    './content/dataset/movies_test.dat',
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['movieid', 'title', 'genre'],
    index_col=False,
).set_index('movieid')

# Turn each '|'-joined genre string into a list of genre names.
movies_train['genre'] = movies_train['genre'].str.split('|')
movies_test['genre'] = movies_test['genre'].str.split('|')

In [None]:
# Store the id columns of `ratings` and the demographic columns of `users`
# as pandas categoricals.
ratings['userid'] = ratings['userid'].astype('category')
ratings['movieid'] = ratings['movieid'].astype('category')
users['gender'] = users['gender'].astype('category')
users['age'] = users['age'].astype('category')
users['occupation'] = users['occupation'].astype('category')

In [None]:
# Discard the rating value itself: every recorded rating becomes a plain 1,
# i.e. "this user rated this movie".
ratings['rating'] = 1

In [None]:
# Reshape the ratings into a movie x user table (one row per movie, one column per
# user); pairs with no recorded rating are filled with 0.
movie_to_user = ratings.pivot(index='movieid', columns='userid', values='rating')
movie_to_user = movie_to_user.fillna(0)
movie_to_user

userid,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Keep a compressed-sparse-row copy of the movie x user table for model fitting.
movie_to_user_sparse = csr_matrix(movie_to_user.to_numpy())
movie_to_user_sparse

<3706x6040 sparse matrix of type '<class 'numpy.float64'>'
	with 1000209 stored elements in Compressed Sparse Row format>

Fitting a KNN model:

In [None]:
import warnings

# Brute-force nearest-neighbour search over the movie rows, using cosine distance.
metric = 'cosine'
knn_movie_model = NearestNeighbors(metric=metric, algorithm='brute')
try:
    knn_movie_model.fit(movie_to_user_sparse)
except Exception as sparse_fit_error:
    # Keep the best-effort fallback to the dense table, but make the failure visible
    # instead of swallowing it: a silent switch hides real problems with the sparse input.
    warnings.warn(
        'Fitting on the sparse matrix failed (%r); falling back to the dense table.'
        % (sparse_fit_error,),
        RuntimeWarning,
    )
    knn_movie_model.fit(movie_to_user)

In [None]:
def get_movie_name(movieid):
    """Return the title stored for ``movieid``.

    The training table is consulted first; when the id is missing there, the
    test table is used instead. A ``KeyError`` from the test-table lookup
    propagates to the caller.
    """
    try:
        record = movies_train.loc[movieid]
    except KeyError:
        record = movies_test.loc[movieid]
    return record.title

In [None]:
def get_similar_movies(movieid, n = 15):
    """Query the fitted KNN model for the movies nearest to ``movieid``.

    Parameters
    ----------
    movieid : label
        Row label of the movie in ``movie_to_user``.
    n : int, optional
        Number of similar movies wanted. It is capped at the number of *other*
        rows in ``movie_to_user``.

    Returns
    -------
    distances, indices
        The two arrays returned by ``knn_movie_model.kneighbors`` for the single
        query row. ``indices`` holds row positions into ``movie_to_user``, not
        movie ids.
    """
    knn_input = np.asarray(movie_to_user.loc[movieid]).reshape(1, -1)
    # One extra neighbour is requested below because the query row belongs to the
    # same table the model searches. Cap n at (rows - 1) so that n + 1 can never
    # exceed the number of rows, which the neighbour search rejects.
    n = min(n, movie_to_user.shape[0] - 1)
    distances, indices = knn_movie_model.kneighbors(knn_input, n_neighbors=n + 1)
    return distances, indices

Predict the genre of the movie from similar movies.

In [None]:
def predict_genres(movieid, n = 15):
    """Predict up to three genres for ``movieid`` by vote among its nearest neighbours.

    Parameters
    ----------
    movieid : label
        Row label of the movie in ``movie_to_user``.
    n : int, optional
        Number of neighbours requested from ``get_similar_movies``.

    Returns
    -------
    list
        Genres whose vote count exceeds half of the top count, most frequent
        first, at most three. Empty when no neighbour is in ``movies_train``.
    """
    distances, indices = get_similar_movies(movieid, n)
    candidate_ids = [movie_to_user.index[position] for position in indices.flatten()]
    # Drop the query movie by identity rather than by assuming it is the first hit:
    # other movies at the same distance can be returned ahead of it. One extra
    # neighbour was requested to make room for the query, so keep at most
    # len(candidate_ids) - 1 of the remaining movies.
    limit = max(len(candidate_ids) - 1, 0)
    neighbour_ids = [mid for mid in candidate_ids if mid != movieid][:limit]

    votes = []
    for neighbour_id in neighbour_ids:
        # Only training movies may vote; test-set genres must not be used.
        if neighbour_id in movies_train.index:
            votes.extend(movies_train.loc[neighbour_id].genre)
    if not votes:
        return []

    counts = pandas.Series(votes).value_counts()
    threshold = counts.max() / 2
    return counts[counts > threshold].head(3).index.tolist()

In [None]:
def predict_genres_weighted(movieid, n = 15):
    """Predict up to three genres for ``movieid`` using distance-weighted votes.

    Each neighbouring training movie adds ``1 / distance`` to every one of its
    genres, so closer movies count for more.

    Parameters
    ----------
    movieid : label
        Row label of the movie in ``movie_to_user``.
    n : int, optional
        Number of neighbours requested from ``get_similar_movies``.

    Returns
    -------
    list
        Genres whose accumulated weight exceeds half of the largest weight,
        heaviest first, at most three. Empty when no neighbour is in
        ``movies_train``.
    """
    # Floor for distances so a neighbour with an identical vector (distance 0)
    # gets a very large finite weight instead of a division by zero.
    min_distance = 1e-9

    distances, indices = get_similar_movies(movieid, n)
    candidates = [
        (movie_to_user.index[position], distance)
        for position, distance in zip(indices.flatten(), distances.flatten())
    ]
    # Drop the query movie by identity rather than by assuming it is the first hit:
    # other movies at the same distance can be returned ahead of it. One extra
    # neighbour was requested to make room for the query, so keep at most
    # len(candidates) - 1 of the remaining movies.
    limit = max(len(candidates) - 1, 0)
    neighbours = [pair for pair in candidates if pair[0] != movieid][:limit]

    weights = {}
    for neighbour_id, distance in neighbours:
        # Only training movies may vote; test-set genres must not be used.
        if neighbour_id not in movies_train.index:
            continue
        vote = 1 / max(distance, min_distance)
        for genre in movies_train.loc[neighbour_id].genre:
            weights[genre] = weights.get(genre, 0) + vote
    if not weights:
        return []

    threshold = max(weights.values()) / 2
    kept = {genre: weight for genre, weight in weights.items() if weight > threshold}
    # Rank by accumulated weight (not by the genre name's characters).
    return sorted(kept, key=kept.get, reverse=True)[:3]

In [None]:
# Remove all movies that don't have a rating: they have no row in the movie x user
# table, so no neighbours can be looked up for them.
originallen = len(movies_test)
# .copy() makes the filtered table an independent frame, so adding the prediction
# column below does not raise pandas' SettingWithCopyWarning.
movies_test = movies_test[movies_test.index.isin(ratings.movieid)].copy()
print('Removed %d movies without ratings' % (originallen - len(movies_test)))

# Predict genres for all movies in the test set
# Testing k values
k_value = 10
movies_test['predicted_genres'] = movies_test.index.map(lambda x: predict_genres(x, k_value))
# Alternative: distance-weighted voting instead of plain counts.
# movies_test['predicted_genres'] = movies_test.index.map(lambda x: predict_genres_weighted(x, k_value))
movies_test.head()

Removed 34 movies without ratings


Unnamed: 0_level_0,title,genre,predicted_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]","[Comedy, Children's]"
2067,Doctor Zhivago (1965),"[Drama, Romance, War]","[Drama, Romance, War]"
2651,Frankenstein Meets the Wolf Man (1943),[Horror],[Horror]
2989,For Your Eyes Only (1981),[Action],[Action]
3415,"Mirror, The (Zerkalo) (1975)",[Drama],[Drama]


In [None]:
# Per-genre precision / recall / F1: encode the true and predicted genre lists as
# binary indicator matrices (classes learned from the true labels), then report.
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(movies_test['genre'])
y_pred = mlb.transform(movies_test['predicted_genres'])
print(classification_report(y_true=y_true, y_pred=y_pred, target_names=mlb.classes_))

              precision    recall  f1-score   support

      Action       0.80      0.82      0.81        89
   Adventure       0.74      0.62      0.67        47
   Animation       0.95      0.90      0.93        21
  Children's       0.88      0.92      0.90        48
      Comedy       0.80      0.83      0.82       239
       Crime       0.42      0.34      0.38        29
 Documentary       1.00      0.67      0.80        24
       Drama       0.73      0.84      0.78       293
     Fantasy       1.00      0.57      0.73         7
   Film-Noir       0.75      0.50      0.60         6
      Horror       0.93      0.86      0.90        74
     Musical       0.71      0.77      0.74        13
     Mystery       0.88      0.39      0.54        18
     Romance       0.83      0.48      0.61        92
      Sci-Fi       0.89      0.88      0.88        48
    Thriller       0.70      0.71      0.70       106
         War       0.92      0.44      0.59        25
     Western       0.88    

In [None]:
def apk(actual, predicted, k=10):
    """
    Average precision at k for a single ranked prediction list.

    Only the first ``k`` entries of ``predicted`` are scored. Each entry that
    is in ``actual`` and has not already appeared earlier in the ranking
    contributes the precision at its rank; the sum is divided by
    ``min(len(actual), k)``.

    Parameters
    ----------
    actual : list
             The relevant elements (order doesn't matter)
    predicted : list
                The predicted elements, best first (order does matter)
    k : int, optional
        The maximum number of predicted elements to score

    Returns
    -------
    score : double
            The average precision at k; 0.0 when ``actual`` is empty

    """
    ranked = predicted[:k] if len(predicted) > k else predicted

    precision_sum = 0.0
    hits = 0
    for rank, item in enumerate(ranked, start=1):
        # A repeated prediction is only credited the first time it appears.
        if item in actual and item not in ranked[:rank - 1]:
            hits += 1
            precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over many ranked prediction lists.

    ``actual`` and ``predicted`` are walked in parallel; ``apk`` is evaluated
    for each pair and the scores are averaged.

    Parameters
    ----------
    actual : list
             A list of lists of relevant elements
             (order doesn't matter in the inner lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the inner lists)
    k : int, optional
        The maximum number of predicted elements to score per list

    Returns
    -------
    score : double
            The mean of the per-pair average precision at k

    """
    per_pair_scores = []
    for relevant, ranking in zip(actual, predicted):
        per_pair_scores.append(apk(relevant, ranking, k))
    return np.mean(per_pair_scores)

# Report MAP@k of the predicted genre rankings against the true genre lists.
k = 3
actual = movies_test['genre'].tolist()
predicted = movies_test['predicted_genres'].tolist()
print('MAP@%d: %f' % (k, mapk(actual, predicted, k)))

MAP@3: 0.772357
