In [64]:
import pandas as pd
from surprise import Dataset, Reader, SVD

In [65]:
def load_data():
    """Return the Surprise built-in 'ml-100k' ratings as a pandas DataFrame.

    The frame is built from the dataset's ``raw_ratings`` records and has
    the columns ``user_id``, ``movie_id``, ``rating`` and ``timestamp``.
    """
    column_names = ['user_id', 'movie_id', 'rating', 'timestamp']
    builtin = Dataset.load_builtin('ml-100k')
    return pd.DataFrame(builtin.raw_ratings, columns=column_names)

# Materialise the ratings once; later cells rebind `df` as columns are cast and joined.
df = load_data()
df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596
...,...,...,...,...
99995,880,476,3.0,880175444
99996,716,204,5.0,879795543
99997,276,1090,1.0,874795795
99998,13,225,2.0,882399156


In [66]:
# Sanity check that load_data() handed back a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [67]:
# Inspect column dtypes and non-null counts before casting the id columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    100000 non-null  object 
 1   movie_id   100000 non-null  object 
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 3.1+ MB


In [68]:
# Both id columns are reported as `object` dtype by df.info(); cast them to
# integers in a single pass (movie_id is used as a join key further down).
df = df.astype({"movie_id": int, "user_id": int})

In [69]:
# Load the movie catalogue and keep only the id/title pair needed for the join.
# Read as UTF-8 (pandas' default, which another cell relies on for this same
# file name); decoding UTF-8 bytes as latin-1 would garble accented titles.
# NOTE(review): confirm movies.csv really is UTF-8 encoded.
movie_titles = pd.read_csv('movies.csv', delimiter=',', header=0, encoding='utf-8')
# Rows whose movieId cannot be parsed as a number are discarded.
has_numeric_id = pd.to_numeric(movie_titles['movieId'], errors='coerce').notna()
# Select with .loc and cast on the resulting frame rather than assigning a
# column into a boolean-filtered frame, which pandas can flag with
# SettingWithCopyWarning.
movie_titles = (
    movie_titles.loc[has_numeric_id, ["movieId", "title"]]
    .astype({"movieId": "int32"})
)
movie_titles

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
62418,209157,We (2018)
62419,209159,Window of the Soul (2001)
62420,209163,Bad Poems (2018)
62421,209169,A Girl Thing (2001)


In [70]:
# Read the movieId -> imdbId/tmdbId mapping and make sure the join key is an integer.
links = pd.read_csv('links.csv').astype({'movieId': int})
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
62418,209157,6671244,499546.0
62419,209159,297986,63407.0
62420,209163,6755366,553036.0
62421,209169,249603,162892.0


In [71]:
# Discard every row that has a missing value in any column. The result is
# copied so that later column assignments act on a standalone frame.
links = links.dropna(axis=0, how="any").copy()

In [72]:
# With the incomplete rows removed, tmdbId can be stored as an integer
# instead of the float shown in the earlier preview.
links = links.astype({"tmdbId": int})

In [73]:
# Preview the cleaned mapping (tmdbId is now integer-valued).
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862
...,...,...,...
62418,209157,6671244,499546
62419,209159,297986,63407
62420,209163,6755366,553036
62421,209169,249603,162892


In [74]:
# Attach a title to every rating by matching `movie_id` to `movieId`.
# This is an inner join (pandas' default), so ratings without a matching
# movieId are dropped from the result.
# validate='many_to_one' makes pandas raise MergeError if movie_titles ever
# contains a repeated movieId, instead of silently duplicating rating rows.
# NOTE(review): `df` comes from the 'ml-100k' builtin, while movies.csv has
# 60k+ rows with ids above 200000 -- these look like different MovieLens
# releases, whose movie ids are not guaranteed to refer to the same films.
# Verify the titles attached here are correct (e.g. against ml-100k's own
# item file) before relying on them.
df = pd.merge(
    df,
    movie_titles,
    left_on='movie_id',
    right_on='movieId',
    validate='many_to_one',
)
df

Unnamed: 0,user_id,movie_id,rating,timestamp,movieId,title
0,196,242,3.0,881250949,242,Farinelli: il castrato (1994)
1,186,302,3.0,891717742,302,"Queen Margot (Reine Margot, La) (1994)"
2,22,377,1.0,878887116,377,Speed (1994)
3,244,51,2.0,880606923,51,Guardian Angel (1994)
4,166,346,1.0,886397596,346,Backbeat (1993)
...,...,...,...,...,...,...
98494,880,476,3.0,880175444,476,"Inkwell, The (1994)"
98495,716,204,5.0,879795543,204,Under Siege 2: Dark Territory (1995)
98496,276,1090,1.0,874795795,1090,Platoon (1986)
98497,13,225,2.0,882399156,225,Disclosure (1994)


In [75]:
# Keep only the columns used downstream: `movie_id` and `timestamp` are
# dropped, and `movieId` (equal to `movie_id` after the join) is the movie key.
df = df.loc[:, ["user_id", "movieId", "rating", "title"]]
df

Unnamed: 0,user_id,movieId,rating,title
0,196,242,3.0,Farinelli: il castrato (1994)
1,186,302,3.0,"Queen Margot (Reine Margot, La) (1994)"
2,22,377,1.0,Speed (1994)
3,244,51,2.0,Guardian Angel (1994)
4,166,346,1.0,Backbeat (1993)
...,...,...,...,...
98494,880,476,3.0,"Inkwell, The (1994)"
98495,716,204,5.0,Under Siege 2: Dark Territory (1995)
98496,276,1090,1.0,Platoon (1986)
98497,13,225,2.0,Disclosure (1994)


In [76]:
# Add the tmdbId for each movie. The join is inner, so ratings whose movieId
# is absent from `links` (including rows removed by the earlier dropna) are
# not carried over; imdbId is left out of the final selection.
df = df.merge(links, how="inner", on="movieId").loc[
    :, ["user_id", "movieId", "rating", "title", "tmdbId"]
]

In [77]:
# Preview the ratings table with titles and tmdbId attached.
df

Unnamed: 0,user_id,movieId,rating,title,tmdbId
0,196,242,3.0,Farinelli: il castrato (1994),10954
1,186,302,3.0,"Queen Margot (Reine Margot, La) (1994)",10452
2,22,377,1.0,Speed (1994),1637
3,244,51,2.0,Guardian Angel (1994),117164
4,166,346,1.0,Backbeat (1993),12635
...,...,...,...,...,...
98258,880,476,3.0,"Inkwell, The (1994)",59930
98259,716,204,5.0,Under Siege 2: Dark Territory (1995),3512
98260,276,1090,1.0,Platoon (1986),792
98261,13,225,2.0,Disclosure (1994),8984


In [78]:
# Persist the merged ratings to ratings.csv; index=False keeps the row index out of the file.
df.to_csv("ratings.csv", index=False)

In [79]:
# Read the file back to check that the export round-trips.
d = pd.read_csv("ratings.csv")
d

Unnamed: 0,user_id,movieId,rating,title,tmdbId
0,196,242,3.0,Farinelli: il castrato (1994),10954
1,186,302,3.0,"Queen Margot (Reine Margot, La) (1994)",10452
2,22,377,1.0,Speed (1994),1637
3,244,51,2.0,Guardian Angel (1994),117164
4,166,346,1.0,Backbeat (1993),12635
...,...,...,...,...,...
98258,880,476,3.0,"Inkwell, The (1994)",59930
98259,716,204,5.0,Under Siege 2: Dark Territory (1995),3512
98260,276,1090,1.0,Platoon (1986),792
98261,13,225,2.0,Disclosure (1994),8984


________________________

In [2]:
import pandas as pd
# Load the ratings table from ratings.csv.
d = pd.read_csv("ratings.csv")
d

Unnamed: 0,user_id,movieId,rating,title,tmdbId
0,196,242,3.0,Farinelli: il castrato (1994),10954
1,186,302,3.0,"Queen Margot (Reine Margot, La) (1994)",10452
2,22,377,1.0,Speed (1994),1637
3,244,51,2.0,Guardian Angel (1994),117164
4,166,346,1.0,Backbeat (1993),12635
...,...,...,...,...,...
98258,880,476,3.0,"Inkwell, The (1994)",59930
98259,716,204,5.0,Under Siege 2: Dark Territory (1995),3512
98260,276,1090,1.0,Platoon (1986),792
98261,13,225,2.0,Disclosure (1994),8984


In [3]:
# Load the movie catalogue; the preview shows movieId, title and pipe-separated genres.
movies = pd.read_csv("movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
# Add genre information to each rating. Both frames carry a `title` column,
# so the (default) suffixes produce `title_x` from `d` and `title_y` from `movies`.
ratings_v2 = d.merge(movies, on="movieId", suffixes=("_x", "_y"))
ratings_v2

Unnamed: 0,user_id,movieId,rating,title_x,tmdbId,title_y,genres
0,196,242,3.0,Farinelli: il castrato (1994),10954,Farinelli: il castrato (1994),Drama|Musical
1,186,302,3.0,"Queen Margot (Reine Margot, La) (1994)",10452,"Queen Margot (Reine Margot, La) (1994)",Drama|Romance
2,22,377,1.0,Speed (1994),1637,Speed (1994),Action|Romance|Thriller
3,244,51,2.0,Guardian Angel (1994),117164,Guardian Angel (1994),Action|Drama|Thriller
4,166,346,1.0,Backbeat (1993),12635,Backbeat (1993),Drama|Musical
...,...,...,...,...,...,...,...
98258,880,476,3.0,"Inkwell, The (1994)",59930,"Inkwell, The (1994)",Comedy|Drama
98259,716,204,5.0,Under Siege 2: Dark Territory (1995),3512,Under Siege 2: Dark Territory (1995),Action
98260,276,1090,1.0,Platoon (1986),792,Platoon (1986),Drama|War
98261,13,225,2.0,Disclosure (1994),8984,Disclosure (1994),Drama|Thriller


In [6]:
# Collapse the duplicated title columns produced by the merge: discard
# `title_y` and restore the plain `title` name for `title_x`.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: a second run would otherwise raise KeyError because
# `title_y` has already been removed. rename() skips missing labels by default.
ratings_v2 = (
    ratings_v2
    .drop(columns=["title_y"], errors="ignore")
    .rename(columns={"title_x": "title"})
)
ratings_v2

Unnamed: 0,user_id,movieId,rating,title,tmdbId,genres
0,196,242,3.0,Farinelli: il castrato (1994),10954,Drama|Musical
1,186,302,3.0,"Queen Margot (Reine Margot, La) (1994)",10452,Drama|Romance
2,22,377,1.0,Speed (1994),1637,Action|Romance|Thriller
3,244,51,2.0,Guardian Angel (1994),117164,Action|Drama|Thriller
4,166,346,1.0,Backbeat (1993),12635,Drama|Musical
...,...,...,...,...,...,...
98258,880,476,3.0,"Inkwell, The (1994)",59930,Comedy|Drama
98259,716,204,5.0,Under Siege 2: Dark Territory (1995),3512,Action
98260,276,1090,1.0,Platoon (1986),792,Drama|War
98261,13,225,2.0,Disclosure (1994),8984,Drama|Thriller


Content-based data preparation

In [7]:
# Row and column count of the merged ratings/genres table.
ratings_v2.shape

(98263, 6)