# Finding Similar Movies

In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
movie_cols = ['movie_id', 'title']
movies = pd.read_csv("movies.dat", names=movie_cols, usecols=range(2), header=0, sep="::", engine='python')

ratings_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv("ratings.dat", names=ratings_cols, usecols=range(3), header=0, sep="::", engine='python')

movie_ratings = pd.merge(movies, ratings, left_on="movie_id", right_on="movie_id")
movie_ratings.shape

(998131, 4)

In [3]:
movie_ratings.head(3)

Unnamed: 0,movie_id,title,user_id,rating
0,2,Jumanji (1995),10,5
1,2,Jumanji (1995),13,3
2,2,Jumanji (1995),18,2


In [5]:
movieRatings = movie_ratings.pivot_table(index=['user_id'],columns=['title'],values='rating')
movieRatings.head()

title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kj�rlighetens kj�tere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [None]:
corrMatrix = movieRatings.corr(method='pearson', min_periods=100)
corrMatrix.head()

In [None]:
favoriteMovieRatings = movieRatings['Saving Private Ryan (1998)']

In [186]:
# create correlations between Saving Private Ryan and all other movies in our data set
similarMovies = movieRatings.corrwith(favoriteMovieRatings)
similarMovies = similarMovies.dropna()
similarMovies_df = pd.DataFrame(similarMovies)
similarMovies_df.head()

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
"$1,000,000 Duck (1971)",0.069881
'Night Mother (1986),0.167915
'Til There Was You (1997),0.152794
"'burbs, The (1989)",0.049999
...And Justice for All (1979),0.248957


In [187]:
movieStats = movie_ratings.groupby('title').agg({'rating': [np.size, np.mean]})

Let's get rid of any movies rated by fewer than 600 people, and check the top-rated ones that are left:

In [198]:
popularMovies = movieStats['rating']['size'] >= 600
movieStats[popularMovies].sort_values([('rating', 'mean')], ascending=False)[:15]

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),628,4.56051
"Shawshank Redemption, The (1994)",2227,4.554558
"Godfather, The (1972)",2223,4.524966
"Close Shave, A (1995)",657,4.520548
"Usual Suspects, The (1995)",1783,4.517106
Schindler's List (1993),2304,4.510417
"Wrong Trousers, The (1993)",882,4.507937
Raiders of the Lost Ark (1981),2514,4.477725
Rear Window (1954),1050,4.47619
Star Wars: Episode IV - A New Hope (1977),2991,4.453694


### Similar movies

In [201]:
# join movie stats and similarity dataframe on title
df = movieStats[popularMovies].join(pd.DataFrame(similarMovies, columns=['similarity']));
df.sort_values(['similarity'], ascending=False).head(15)

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Saving Private Ryan (1998),2653,4.337354,1.0
Forrest Gump (1994),2194,4.087967,0.363563
Schindler's List (1993),2304,4.510417,0.343939
Apollo 13 (1995),1251,4.073541,0.327342
Braveheart (1995),2443,4.234957,0.304084
Patton (1970),645,4.266667,0.302743
Dances with Wolves (1990),1451,3.915231,0.298823
"Green Mile, The (1999)",1222,4.154664,0.29457
Ben-Hur (1959),704,4.110795,0.287531
"Bridge on the River Kwai, The (1957)",938,4.386994,0.287072


And, sort these new results by similarity score. That's more like it!