# Movie Recommender Assignment
### By Jacob Metzger
### Due 04/25/2016

In [1]:
import pandas as pd
import numpy as np
from __future__ import division

In [2]:
# Load the class movie reviews (lightly cleaned in Excel beforehand).
# The CSV's 'Name' column becomes the row index; the remaining columns are
# the items that get compared with each other below.
M = pd.read_csv("MovieReviews.csv", index_col="Name")

In [3]:
# Calculates Pearson's r
def pearson(s1, s2):
    """Return the Pearson correlation coefficient of two pd.Series objects.

    Each series is centered on its own mean; the result is the sum of the
    products of the centered values divided by the square root of the
    product of their sums of squares.

    NOTE(review): nothing here restricts the computation to entries that
    are present in both series, so missing values are handled by whatever
    pandas/numpy do by default -- confirm this is the intended treatment.
    """
    dev1 = s1 - s1.mean()
    dev2 = s2 - s2.mean()
    cross_sum = np.sum(dev1 * dev2)
    scale = np.sqrt(np.sum(dev1 ** 2) * np.sum(dev2 ** 2))
    return cross_sum / scale

In [4]:
# Provided in the assignment by Mike Bernico
# Returns num most similar items to movie_name in M
# Note: This need not be a movie.
# For example, I use this same method later to find the most similar people to a person.

def get_recs(movie_name, M, num):
    """Rank the other columns of M by Pearson correlation with one column.

    Args:
        movie_name: label of the reference column in M.
        M: DataFrame whose columns are the items being compared.
        num: maximum number of results to return.

    Returns:
        A list of at most `num` (column label, correlation) tuples, highest
        correlation first. The reference column itself and every column
        whose correlation comes out as NaN are left out.
    """
    reference = M[movie_name]
    candidates = (label for label in M.columns if label != movie_name)
    scored = ((label, pearson(reference, M[label])) for label in candidates)
    # NaN correlations cannot be ranked, so they are dropped before sorting.
    usable = [pair for pair in scored if not np.isnan(pair[1])]
    ranked = sorted(usable, key=lambda pair: pair[1], reverse=True)
    return ranked[:num]

### Question 1: What movie is most similar to 'The Fault in Our Stars'? (60 pts)


In [5]:
# Straightforward application of get_recs: ask for the single column of M
# whose ratings correlate most strongly with this movie's ratings.

movieName = "The Fault in Our Stars"
# Parenthesized call form: with a single argument it prints the same thing
# under Python 2's print statement, and it is the only form Python 3 accepts.
print(get_recs(movieName, M, 1))

[('Godzilla', 0.32655088533483884)]


### Question 2:  Which movie(s) would you most like to see, based on your classmates' experience? (40 pts)

In [6]:
# Grab the movies that I've rated (unrated ones dropped), highest rating first.
# .loc is the label-based row lookup; the older .ix indexer is deprecated.
moviesSeen = M.loc['Jake Metzger'].dropna().sort_values(ascending=False)

# Method 1
# For each of the movies I've seen, find the movie most similar to it,
# leaving out duplicates (np.unique also returns the titles sorted).
# A plain comprehension is used instead of Index.map because each lookup
# yields a list; movies for which get_recs finds no usable correlation
# (empty list) are skipped rather than raising IndexError.
top_matches = (get_recs(title, M, 1) for title in moviesSeen.index)
recommendations = [matches[0][0] for matches in top_matches if matches]
recommendations = pd.DataFrame(np.unique(recommendations), columns=['Recommendations'])
recommendations

Unnamed: 0,Recommendations
0,American Sniper
1,Godzilla
2,The Fault in Our Stars
3,The Hobbit
4,The Lego Movie


In [7]:
# Method 2
# Find the 5 people most similar to me and collect each one's top-rated
# movie, leaving out duplicates.

# M has one row per person and one column per movie; transposing puts the
# people in the columns, which lets get_recs compare people instead of movies.
Mprime = M.transpose()
similarPeople = pd.Series([pair[0] for pair in get_recs('Jake Metzger', Mprime, 5)])

# For each similar person: drop the movies they have not rated, sort their
# ratings from best to worst, and keep the title at the top.
# .loc is the label-based row lookup; the older .ix indexer is deprecated.
recommendations = similarPeople.map(
    lambda person: M.loc[person].dropna().sort_values(ascending=False).index[0]
)
recommendations = pd.DataFrame(np.unique(recommendations), columns=["Recommendations"])
recommendations

Unnamed: 0,Recommendations
0,Big Hero 6
1,Gone Girl
2,Guardians of the Galaxy
3,The Hobbit


### Question 3: Bonus Question...  For all the movies you haven't seen, can you predict how you'd rate them using the class reviews? (10 pts)

#### One way to do this is to simply take the rating of the person most similar to me who has seen the movie.

In [8]:
# Predict a rating for every movie: keep my own rating where I have one,
# otherwise borrow the rating of the most similar person who rated the movie.
recommendations = []
my_ratings = M.loc['Jake Metzger']

# Everyone else, most similar to me first. Asking for as many results as
# there are people returns the complete ranking.
similarPeople = pd.Series(
    [pair[0] for pair in get_recs('Jake Metzger', Mprime, len(Mprime.columns))]
)

# Pair each movie title with my rating directly, so every lookup below is
# by label instead of relying on integer positions into a labelled Series.
for movie, rating in zip(M.columns, my_ratings):
    if not np.isnan(rating):
        recommendations.append((movie, rating))
        continue
    for name in similarPeople:
        borrowed = M.loc[name, movie]
        if not np.isnan(borrowed):
            recommendations.append((movie, borrowed))
            break
    # If nobody in the ranking has rated the movie, it gets no prediction.
recommendations

[('American Sniper', 4.0),
 ('The Hunger Games: Mockingjay - Part 1', 3.0),
 ('Guardians of the Galaxy', 4.0),
 ('The Lego Movie', 3.0),
 ('The Hobbit', 4.0),
 ('Transformers', 2.0),
 ('Malificent', 3.0),
 ('Big Hero 6', 4.0),
 ('Godzilla', 2.0),
 ('Interstellar', 4.0),
 ('How to Train your Dragon 2', 3.0),
 ('Gone Girl', 5.0),
 ('Divergent', 4.0),
 ('The Fault in Our Stars', 3.0),
 ('Unbroken', 3.0),
 ('300: Rise of an Empire', 3.0)]

#### Another way, analogous to the above, is to take the rating of the movie most similar to it that I have seen.

In [9]:
# Predict a rating for every movie: keep my own rating where I have one,
# otherwise reuse my rating of the most similar movie that I have rated.
recommendations = []
my_ratings = M.loc['Jake Metzger']

# Pair each movie title with my rating directly, so every lookup below is
# by label instead of relying on integer positions into a labelled Series.
for movie, rating in zip(M.columns, my_ratings):
    if not np.isnan(rating):
        recommendations.append((movie, rating))
        continue
    # Asking for as many results as there are movies returns the complete
    # ranking, most similar first. The loop variable is deliberately not
    # called movieName, so the Question 1 variable is left untouched.
    for candidate, _ in get_recs(movie, M, len(M.columns)):
        known = my_ratings.loc[candidate]
        if not np.isnan(known):
            recommendations.append((movie, known))
            break
    # If I have rated none of the ranked movies, this one gets no prediction.
recommendations

[('American Sniper', 5.0),
 ('The Hunger Games: Mockingjay - Part 1', 4.0),
 ('Guardians of the Galaxy', 4.0),
 ('The Lego Movie', 4.0),
 ('The Hobbit', 2.0),
 ('Transformers', 2.0),
 ('Malificent', 3.0),
 ('Big Hero 6', 4.0),
 ('Godzilla', 2.0),
 ('Interstellar', 2.0),
 ('How to Train your Dragon 2', 2.0),
 ('Gone Girl', 5.0),
 ('Divergent', 3.0),
 ('The Fault in Our Stars', 2.0),
 ('Unbroken', 2.0),
 ('300: Rise of an Empire', 3.0)]

#### If deciding between these two methods, it would be better to pick the method that has more support. In my case, it's likely to be the person-based method rather than the movie-based method.