In [1]:
from pathlib import Path

import pandas as pd

# Configuration: everything a reader might want to tune lives here.
RAW_DIR = Path('../data/raw/ml-latest-small')
SIMILARITY_CSV = Path('../data/processed/movie_similarity.csv')
MIN_RATINGS = 5  # a film needs at least this many ratings to be retained

# Importing both sets of data, movies and ratings
df_movies = pd.read_csv(RAW_DIR / 'movies.csv')
df_ratings = pd.read_csv(RAW_DIR / 'ratings.csv')

# Creating an inner join table: only rows whose movieId appears in both
# files survive the merge.
df_merged = pd.merge(df_movies, df_ratings, on='movieId', how='inner')

# Peeking at the created table to verify structure and content
# df_merged.head()

# Build the user x movie rating matrix in one chain (no in-place mutation,
# so re-running the cell always yields the same result):
#   1. pivot_table -> rows are userIds, columns are movie titles, and each
#      cell is that user's rating for that movie. pivot_table aggregates
#      duplicate (userId, title) pairs with the mean by default, so any
#      distinct movieIds sharing an identical title would be merged into a
#      single column.
#   2. dropna      -> only retain films that have at least MIN_RATINGS ratings.
#   3. fillna      -> replace the remaining missing ratings with 0. Note that
#      the correlation below therefore treats "not rated" as a rating of 0.
df = (
    df_merged
    .pivot_table(index='userId', columns='title', values='rating')
    .dropna(thresh=MIN_RATINGS, axis=1)
    .fillna(0)
)

# Peeking at created table to verify structure and content
# df.head()

# Create a data frame using the Pearson correlation between movie columns
# to determine the movie-to-movie similarity value
df_similarity = df.corr(method='pearson')

# Store the data for later to be used in building the API. The output
# directory is created first so the write does not fail on a fresh checkout.
SIMILARITY_CSV.parent.mkdir(parents=True, exist_ok=True)
df_similarity.to_csv(SIMILARITY_CSV)

# Peeking at similarity generation to verify structure and content
df_similarity.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,0.063117,0.235908,-0.023768,0.143482,0.011998,0.087931,0.224052,-0.018608,0.034223,...,0.134701,0.153158,0.101301,0.049897,0.003233,-0.017905,0.187953,0.062174,-0.014025,0.353194
(500) Days of Summer (2009),0.063117,1.0,0.133949,0.142471,0.273989,0.19396,0.148903,0.142141,0.066567,0.159756,...,0.068407,0.414585,0.355723,0.252226,0.216007,0.126147,0.053614,0.241092,0.139511,0.125905
*batteries not included (1987),0.235908,0.133949,1.0,0.035596,0.061144,-0.017106,0.073459,0.1061,-0.012561,0.026377,...,0.039055,0.19453,0.12101,0.071852,-0.024573,-0.012086,0.115396,-6e-05,-0.009467,0.234514
10 Cloverfield Lane (2016),-0.023768,0.142471,0.035596,1.0,-0.005799,0.112396,0.006139,-0.016835,-0.017692,0.031704,...,-0.023477,0.272347,0.241751,0.195054,0.319371,0.082246,0.177846,0.096638,0.081429,0.002733
10 Things I Hate About You (1999),0.143482,0.273989,0.061144,-0.005799,1.0,0.24467,0.223481,0.211473,0.109729,0.011784,...,0.13246,0.091853,0.158637,0.281934,0.050031,0.088391,0.121029,0.130813,0.068745,0.110612
