### Import Library and dataset

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df_movie = pd.read_csv("movies.csv")
df_rating = pd.read_csv("ratings.csv")

### Data Preparation

In [5]:
df_movie.info()
print("="*50)
df_rating.info()
# tidak ada data yang null

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [6]:
df_movie.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [7]:
df_movie["year"] = df_movie["title"].apply(lambda x:x[-5:-1])
df_movie["year"][df_movie["year"]==df_movie["year"].unique()[-1]] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
def tahun(x):
    x = int(x)
    if x>2001:
        return "2000>"
    elif 1981<x<2000:
        return "<2000"
    elif 1950<x<1980:
        return "<1980"
    else:
        return "<1950"

In [9]:
df_movie["year"] = df_movie["year"].apply(lambda x:tahun(x))

In [10]:
df_movie["title"] = df_movie["title"].apply(lambda x:x[:-7])
df_movie["genres"] = df_movie["genres"].apply(lambda x:x.replace("|"," "))

In [11]:
df_movie["gen_year"] = df_movie.apply(lambda x: x["genres"] + " " + x["year"],axis=1)

In [12]:
df_movie = df_movie.set_index("movieId")
df_movie.head(2)

Unnamed: 0_level_0,title,genres,year,gen_year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story,Adventure Animation Children Comedy Fantasy,<2000,Adventure Animation Children Comedy Fantasy <2000
2,Jumanji,Adventure Children Fantasy,<2000,Adventure Children Fantasy <2000


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cm = cv.fit_transform(df_movie["gen_year"])

In [14]:
# cv.get_feature_names()
df_cm = pd.DataFrame(cm.toarray(),columns=cv.get_feature_names())
df_cm.head(2)

Unnamed: 0,1950,1980,2000,action,adventure,animation,children,comedy,crime,documentary,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,0,0,1,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Create Model for Content Based

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
df_cos = pd.DataFrame(cosine_similarity(df_cm), columns=df_movie.index, index=df_movie.index)
df_cos.head(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.816497,0.471405,0.408248,0.57735,0.204124,0.471405,0.707107,0.288675,0.408248,...,0.182574,0.288675,0.235702,0.235702,0.288675,0.816497,0.288675,0.57735,0.288675,0.204124
2,0.816497,1.0,0.288675,0.25,0.353553,0.25,0.288675,0.866025,0.353553,0.5,...,0.223607,0.353553,0.288675,0.0,0.353553,0.5,0.0,0.353553,0.353553,0.25


In [18]:
movie_dict = df_movie["title"].to_dict()
movie_dict = {v: k for k, v in movie_dict.items()}
id_mov = movie_dict["Superman vs. The Elite"]
id_rec = df_cos.loc[id_mov].sort_values(ascending=False).head(6).index.tolist()
df_movie.loc[id_rec]

Unnamed: 0_level_0,title,genres,year,gen_year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26913,Street Fighter II: The Animated Movie (Sutorît...,Action Animation,<2000,Action Animation <2000
79274,Batman: Under the Red Hood,Action Animation,2000>,Action Animation 2000>
138104,Justice League: Gods and Monsters,Action Animation,2000>,Action Animation 2000>
124867,Justice League: Throne of Atlantis,Action Animation,2000>,Action Animation 2000>
99813,"Batman: The Dark Knight Returns, Part 2",Action Animation,2000>,Action Animation 2000>
94974,Superman vs. The Elite,Action Animation,2000>,Action Animation 2000>


### Create Model for collaborative filtering

In [19]:
df_rating.isna().sum() 

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [20]:
rating = df_rating.pivot_table(index="userId", columns="movieId", values="rating")
rating.head(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,5.0,,2.0,,3.0,,,,,,...,,,,,,,,,,


In [21]:
rating.fillna(0,inplace=True) # handling na values with zero 
rating = rating.apply(lambda x: (x-np.mean(x)/(np.max(x)-np.min(x)))) #normalisasi data
rating.head(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.271407,-0.092365,-0.055389,-0.011602,-0.060329,-0.140269,-0.055689,-0.004117,-0.01976,-0.145509,...,-0.001497,-0.001497,-0.002566,-0.001497,-0.00262,-0.001497,-0.001497,-0.001497,-0.004325,-0.001497
2,4.728593,-0.092365,1.944611,-0.011602,2.939671,-0.140269,-0.055689,-0.004117,-0.01976,-0.145509,...,-0.001497,-0.001497,-0.002566,-0.001497,-0.00262,-0.001497,-0.001497,-0.001497,-0.004325,-0.001497


#### item to item collaborative filtering with cosine similarity

In [22]:
cos = pd.DataFrame(cosine_similarity(rating.T), columns=rating.columns, index=rating.columns)
cos.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.336278,0.301333,0.113949,0.323881,0.338812,0.313052,0.082256,0.092727,0.306269,...,0.059018,-0.010195,0.066291,-0.014275,0.042784,-0.009515,-0.012371,-0.011101,0.048614,-0.009515
2,0.336278,1.0,0.168717,0.108648,0.185449,0.263781,0.171256,0.048366,0.082986,0.387812,...,0.086464,0.087006,0.045586,0.084564,0.122416,-0.005736,-0.007458,0.086464,0.092919,0.087412
3,0.301333,0.168717,1.0,0.16241,0.499046,0.283035,0.31875,0.168266,0.214538,0.120166,...,-0.005197,-0.004772,0.040096,-0.006682,-0.006237,-0.004454,-0.005791,-0.005197,0.040887,-0.004454
4,0.113949,0.108648,0.16241,1.0,0.114819,0.15934,0.086526,0.188022,0.019667,0.151326,...,-0.002371,-0.002201,-0.003072,-0.002964,-0.002905,-0.002074,-0.002608,-0.002371,-0.003418,-0.002074
5,0.323881,0.185449,0.499046,0.114819,1.0,0.19361,0.400419,0.184233,0.116874,0.13465,...,-0.005411,-0.004969,0.048938,-0.006958,-0.006495,-0.004638,-0.00603,-0.005411,0.049978,-0.004638


In [24]:
movie_dict = df_movie["title"].to_dict()
movie_dict = {v: k for k, v in movie_dict.items()}
id_mov = movie_dict["Being Flynn"]
id_rec = cos.loc[id_mov].sort_values(ascending=False).head(6).index.tolist()
df_movie.loc[id_rec]

Unnamed: 0_level_0,title,genres,year,gen_year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
71139,Paraíso Travel,Adventure Drama Romance,2000>,Adventure Drama Romance 2000>
78829,"Betrayal, The (Nerakhoon)",Documentary,2000>,Documentary 2000>
79596,"Lineup, The",Crime Drama Film-Noir,<1980,Crime Drama Film-Noir <1980
105863,Child's Pose,Drama,2000>,Drama 2000>
51094,Gray Matters,Comedy Drama Romance,2000>,Comedy Drama Romance 2000>
106144,"Selfish Giant, The",Drama,2000>,Drama 2000>


#### collaborative filtering with pearson correlation

In [None]:
rat_corr = rating.corr()

In [None]:
movie_dict = df_movie["title"].to_dict()
movie_dict = {v: k for k, v in movie_dict.items()}
id_mov = movie_dict["Being Flynn"]
id_rec = rat_corr.loc[id_mov].sort_values(ascending=False).head(6).index.tolist()
df_movie.loc[id_rec]