# User-Based Collaborative Filtering and Recommendation System
**Business Problem**
> The company that tested content-based recommendation systems and item-based recommendation systems want to give users more customization
Recommendations have been made for movies based on similar liking structures, but the company wants to supply customized recommendations on the similarity of the users to the other users.


# 1. read dataset and merge join on movieId

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500)

def missing_values_analysis(data):
    na_columns = [col for col in data.columns if data[col].isnull().sum() > 0]
    n_miss = data[na_columns].isnull().sum().sort_values(ascending=True)
    ratio = (data[na_columns].isnull().sum() / data.shape[0] * 100).sort_values(ascending=True)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['Total Missing Values', 'Ratio'])
    missing_df = pd.DataFrame(missing_df)
    return missing_df

def check_df(data, row_num=5, col_num=10):
    print("*************** Dataset Shape ***************")
    print("No. of Rows:", data.shape[0], "\nNo. of Columns:", data.shape[1])
    print("*************** Dataset Information ***************")
    print(data.info())
    print("*************** Types of Columns ***************")
    print(data.dtypes)
    print(f"*************** First {row_num} Rows ***************")
    print(data.iloc[:row_num,:col_num])
    print(f"*************** Last {row_num} Rows ***************")
    print(data.iloc[-row_num:,:col_num])
    print("*************** Summary Statistics of The Dataset ***************")
    print(data.describe([0.10, 0.25, 0.50, 0.70, 0.80, 0.90, 0.95, 0.99]).T)
    print("*************** Dataset Missing Values Analysis ***************")
    print(missing_values_analysis(data))

movie = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')
rating = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')
df = movie.merge(rating, how="left", on="movieId")
check_df(df)

*************** Dataset Shape ***************
No. of Rows: 20000797 
No. of Columns: 6
*************** Dataset Information ***************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000797 entries, 0 to 20000796
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   userId     float64
 4   rating     float64
 5   timestamp  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 915.6+ MB
None
*************** Types of Columns ***************
movieId        int64
title         object
genres        object
userId       float64
rating       float64
timestamp     object
dtype: object
*************** First 5 Rows ***************
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story

# 2. let's filter low rated movies and create new dataframe

In [2]:
def create_user_movie_df(dataframe):
    comment_counts = pd.DataFrame(dataframe["title"].value_counts())
    rare_movies = comment_counts[comment_counts["count"] <= 1000].index
    common_movies = df[~df["title"].isin(rare_movies)]
    user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
    return user_movie_df

user_movie_df = create_user_movie_df(df)

In [9]:
def user_based_recommender(random_user, user_movie_df, ratio=60, cor_th=0.65, score=3.5):
    random_user_df = user_movie_df[user_movie_df.index == random_user]
    movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist() # select watched and rated movie by random_user
    movies_watched_df = user_movie_df[movies_watched]
    user_movie_count = movies_watched_df.T.notnull().sum()
    user_movie_count = user_movie_count.reset_index()
    user_movie_count.columns = ["userId", "movie_count"]
    perc = len(movies_watched) * ratio / 100
    users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"] #select other users who watched same movies

    final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],
                          random_user_df[movies_watched]])

    corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
    corr_df = pd.DataFrame(corr_df, columns=["corr"])
    corr_df.index.names = ['user_id_1', 'user_id_2']
    corr_df = corr_df.reset_index()

    top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= cor_th)][
        ["user_id_2", "corr"]].reset_index(drop=True) # select similar users have correlation over cor_th on random_user

    top_users = top_users.sort_values(by='corr', ascending=False)
    top_users.rename(columns={"user_id_2": "userId"}, inplace=True)
    rating = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')
    top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner')
    top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating'] # calculate rating*corr score

    recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
    recommendation_df = recommendation_df.reset_index()

    movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > score].sort_values("weighted_rating", ascending=False)
    movie = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')
    return movies_to_be_recommend.merge(movie[["movieId", "title"]])


In [10]:
random_user = int(pd.Series(user_movie_df.index).sample(1).values)
user_based_recommender(random_user, user_movie_df, cor_th=0.70, score=4)

Unnamed: 0,movieId,weighted_rating,title
0,31948,4.320904,"Phone Box, The (Cabina, La) (1972)"
1,1489,4.249283,Cats Don't Dance (1997)
2,3318,4.063336,Deterrence (1999)
3,4595,4.063336,Fat Man and Little Boy (1989)
4,2310,4.004677,"Mighty, The (1998)"
