# Item-Based Collaborative Filtering

# 1. Read the movie and rating datasets and join them on movieId

In [3]:
import pandas as pd
import numpy as np
# Raise the column display limit to 500 so wide frames are not truncated when shown.
pd.set_option('display.max_columns', 500)

def missing_values_analysis(data):
    """Summarise the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, ordered by ascending
        null count, with two columns: ``'Total Missing Values'`` (the null
        count) and ``'Ratio'`` (nulls as a percentage of all rows, rounded to
        two decimals). The frame has no rows when nothing is missing.
    """
    # Count nulls once for the whole frame instead of re-scanning it per column
    # and then twice more for the counts and the ratio.
    null_counts = data.isnull().sum()
    n_miss = null_counts[null_counts > 0].sort_values(ascending=True)
    # Deriving the ratio from the already-sorted counts keeps both columns on
    # one shared index, so no second sort is needed.
    ratio = np.round(n_miss / data.shape[0] * 100, 2)
    return pd.concat([n_miss, ratio], axis=1, keys=['Total Missing Values', 'Ratio'])

def check_df(data, row_num=5, col_num=10):
    """Print a quick textual overview of a DataFrame.

    The report covers, in order: shape, ``DataFrame.info()``, column dtypes,
    the first and last ``row_num`` rows (limited to the first ``col_num``
    columns), percentile summary statistics, and the missing-value table
    produced by ``missing_values_analysis``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.
    row_num : int, default 5
        Number of rows shown from the head and from the tail.
    col_num : int, default 10
        Number of leading columns shown in the head/tail previews.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("*************** Dataset Shape ***************")
    print("No. of Rows:", data.shape[0], "\nNo. of Columns:", data.shape[1])
    print("*************** Dataset Information ***************")
    # info() writes its own report and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    data.info()
    print("*************** Types of Columns ***************")
    print(data.dtypes)
    print(f"*************** First {row_num} Rows ***************")
    print(data.iloc[:row_num,:col_num])
    print(f"*************** Last {row_num} Rows ***************")
    print(data.iloc[-row_num:,:col_num])
    print("*************** Summary Statistics of The Dataset ***************")
    # Transposed so each described column becomes one row of statistics.
    print(data.describe([0.10, 0.25, 0.50, 0.70, 0.80, 0.90, 0.95, 0.99]).T)
    print("*************** Dataset Missing Values Analysis ***************")
    print(missing_values_analysis(data))

# Load the movie metadata and the user ratings from the Kaggle input mount.
movie = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')
rating = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')

# Left join on movieId: every movie row is kept, so a movie with no ratings
# still appears, with NaN in the columns that come from the rating file.
df = pd.merge(left=movie, right=rating, on="movieId", how="left")

# Print the structural overview of the merged frame.
check_df(df)

*************** Dataset Shape ***************
No. of Rows: 20000797 
No. of Columns: 6
*************** Dataset Information ***************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000797 entries, 0 to 20000796
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   userId     float64
 4   rating     float64
 5   timestamp  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 915.6+ MB
None
*************** Types of Columns ***************
movieId        int64
title         object
genres        object
userId       float64
rating       float64
timestamp     object
dtype: object
*************** First 5 Rows ***************
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story

# 2. Filter out rarely rated movies (1,000 ratings or fewer) and build the user–movie rating matrix

In [13]:
def create_user_movie_df(dataframe, rare_threshold=1000):
    """Build a user-by-title rating matrix restricted to frequently rated titles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Long-format frame with at least ``userId``, ``title`` and ``rating``
        columns.
    rare_threshold : int, default 1000
        Titles that occur in ``rare_threshold`` rows or fewer are treated as
        rare and dropped before pivoting.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``userId`` with one column per remaining title and
        the rating as the cell value (NaN where a user has no rating for a
        title).
    """
    # Use the value_counts Series directly: wrapping it in a DataFrame and
    # looking up a "count" column only works on pandas versions that name the
    # result "count" (2.0 and later).
    rating_counts = dataframe["title"].value_counts()
    rare_movies = rating_counts[rating_counts <= rare_threshold].index
    common_movies = dataframe[~dataframe["title"].isin(rare_movies)]
    # pivot_table aggregates with the mean by default, so repeated
    # (userId, title) pairs collapse to their average rating.
    return common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

# Build the user-by-title rating matrix from the merged frame; later cells reuse it.
user_movie_df = create_user_movie_df(df)

# 3. Create an item-based recommender using correlations between movies' ratings

In [14]:
def item_based_recommender(movie_name, user_movie_df, top_n=10):
    """Return the titles whose ratings correlate most strongly with one movie.

    Parameters
    ----------
    movie_name : str
        Column label in ``user_movie_df`` of the movie to find neighbours for.
    user_movie_df : pandas.DataFrame
        User-by-title rating matrix (one column per title).
    top_n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.Series
        Correlation of every title's rating column with ``movie_name``'s
        column, sorted in descending order and cut to ``top_n`` entries. The
        queried movie is included in the ranking and correlates 1.0 with
        itself, so it normally appears at the top.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``user_movie_df``.
    """
    if movie_name not in user_movie_df.columns:
        raise KeyError(f"{movie_name!r} is not a column of user_movie_df")
    # Keep the title and its rating column under separate names instead of
    # rebinding the ``movie_name`` argument to a Series.
    target_ratings = user_movie_df[movie_name]
    correlations = user_movie_df.corrwith(target_ratings)
    return correlations.sort_values(ascending=False).head(top_n)

# Smoke-test the recommender with a well-known title; the queried movie itself ranks first with correlation 1.0.
item_based_recommender("Matrix, The (1999)", user_movie_df) # test function with a movie name

title
Matrix, The (1999)                                           1.000000
Matrix Reloaded, The (2003)                                  0.516906
Matrix Revolutions, The (2003)                               0.449588
Animatrix, The (2003)                                        0.367151
Blade (1998)                                                 0.334493
Terminator 2: Judgment Day (1991)                            0.333882
Minority Report (2002)                                       0.332434
Edge of Tomorrow (2014)                                      0.326762
Mission: Impossible (1996)                                   0.320815
Lord of the Rings: The Fellowship of the Ring, The (2001)    0.318726
dtype: float64

In [17]:
# Draw one title at random to exercise the recommender. The fixed random_state
# makes the draw repeatable, so the cell gives the same result after a kernel restart.
movie_name = pd.Series(user_movie_df.columns).sample(1, random_state=42).values[0]
item_based_recommender(movie_name, user_movie_df)

title
Man Bites Dog (C'est arrivé près de chez vous) (1992)    1.000000
Black Cat, White Cat (Crna macka, beli macor) (1998)     0.634521
Bridge Too Far, A (1977)                                 0.582884
Farinelli: il castrato (1994)                            0.545283
Burnt by the Sun (Utomlyonnye solntsem) (1994)           0.543254
Inside Job (2010)                                        0.521777
Green Street Hooligans (a.k.a. Hooligans) (2005)         0.511026
Sanjuro (Tsubaki Sanjûrô) (1962)                         0.506479
Old Man and the Sea, The (1958)                          0.503566
French Twist (Gazon maudit) (1995)                       0.494804
dtype: float64