In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sqlalchemy import create_engine
import pickle



In [3]:
#os.getenv('SQLALCHEMY_URL')
engine = create_engine('postgresql+psycopg2://postgres:postgres@localhost:5432/movies', echo=False)
engine

Engine(postgresql+psycopg2://postgres:***@localhost:5432/movies)

In [4]:
df_tags=pd.read_csv('tags.csv')
df_tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [5]:
df_movies=pd.read_csv('movies.csv')
df_movies2 = df_movies.join(df_movies['genres'].str.split('|', expand=True).add_prefix('genre'))
df_movies2 = pd.DataFrame(df_movies2.set_index(['movieId', 'title', 'genres']).stack())
df_movies2.reset_index(inplace=True)
df_movies2.drop('level_3',1, inplace=True)
df_movies2=df_movies2.rename(columns={0:'genre'})
df_movies2.sort_values(['movieId','genre'], ascending=[True,True])
df_movies2 = df_movies2.join(pd.get_dummies(df_movies2['genre'], drop_first=False))
df_movies2.head()

Unnamed: 0,movieId,title,genres,genre,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Animation,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Children,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Comedy,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Fantasy,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df_ratings=pd.read_csv('ratings.csv')
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [7]:
df = df_ratings.merge(df_movies, on='movieId', how='left')
df['title'] = df['title'].map(lambda x: str(x)[:-7])
df=df.drop_duplicates(subset='title')
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men,Comedy|Romance
2,1,6,4.0,964982224,Heat,Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100820,610,160341,2.5,1479545749,Bloodmoon,Action|Thriller
100821,610,160527,4.5,1479544998,Sympathy for the Underdog,Action|Crime|Drama
100823,610,160836,3.0,1493844794,Hazard,Action|Drama|Thriller
100827,610,163937,3.5,1493848789,Blair Witch,Horror|Thriller


In [8]:
df.to_sql('movies', engine, if_exists='replace', method='multi', chunksize=10000)

In [9]:
df = pd.read_sql('movies', engine, index_col=0)
df.tail()

Unnamed: 0,index,userId,movieId,rating,timestamp,title,genres
9440,100820,610,160341,2.5,1479545749,Bloodmoon,Action|Thriller
9441,100821,610,160527,4.5,1479544998,Sympathy for the Underdog,Action|Crime|Drama
9442,100823,610,160836,3.0,1493844794,Hazard,Action|Drama|Thriller
9443,100827,610,163937,3.5,1493848789,Blair Witch,Horror|Thriller
9444,100828,610,163981,3.5,1493850155,31,Horror


In [10]:
reviews = pd.pivot_table(df, values='rating', index='userId', columns='title')
reviews

title,Unnamed: 1_level_0,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,,,,,,,,,,,...,,,,,,,,,,
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,,,,,


In [11]:
mean_values = round(reviews.mean().mean(), 1)
reviews = reviews.fillna(mean_values)
reviews

title,Unnamed: 1_level_0,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,...,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,4.0,3.3
2,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,...,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3
3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,...,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3
4,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,...,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3
5,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,...,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,...,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3
606,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,...,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3
607,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,...,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3
608,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,...,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3,3.3


**NMF**

In [12]:
# n_components - between 5 and 20. If too much - slower model, overfit
nmf = NMF(n_components = 10)

In [13]:
nmf.fit(reviews)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=10, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [14]:
pickle.dump(nmf, open('nmf_model.pkl', 'wb')) 

In [15]:
loaded_model = pickle.load(open('nmf_model.pkl', 'rb'))

In [16]:
nmf_q = loaded_model.components_

In [17]:
Q = nmf_q
P = nmf.transform(reviews)
P.shape, Q.shape

((376, 10), (10, 9445))

In [18]:
Q = nmf.components_
P = nmf.transform(reviews)
P.shape, Q.shape

((376, 10), (10, 9445))

In [19]:
Rhat = np.dot(P,Q)

In [20]:
predictions = pd.DataFrame(Rhat, columns=reviews.columns, index = reviews.index)
predictions.head()

title,Unnamed: 1_level_0,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.289442,3.29801,3.30103,3.300911,3.300713,3.30102,3.313653,3.299729,3.301955,3.300043,...,3.300777,3.300826,3.300832,3.300832,3.301111,3.301,3.298321,3.295188,4.00078,3.299862
2,3.30015,3.30397,3.303867,3.302476,3.306662,3.301076,3.305215,3.306685,3.301041,3.303905,...,3.301079,3.301083,3.301081,3.301081,3.301071,3.299683,3.295555,3.299879,3.300916,3.295524
3,3.279159,3.299843,3.299624,3.298361,3.302167,3.297097,3.301814,3.302193,3.296925,3.299654,...,3.297084,3.297096,3.297094,3.297094,3.297087,3.295832,3.292045,3.296459,3.298215,3.292057
4,3.315339,3.314311,3.314764,3.315108,3.314073,3.315598,3.206488,3.313961,3.314971,3.314274,...,3.315466,3.315451,3.315505,3.315505,3.315621,3.315926,3.316448,3.318473,3.321589,3.31691
5,3.301175,3.301866,3.301806,3.300425,3.304575,3.299038,3.301697,3.304589,3.299019,3.30183,...,3.299041,3.299043,3.299041,3.299041,3.299035,3.297656,3.293543,3.297786,3.299941,3.293523


In [21]:
nmf.reconstruction_err_

90.34392507058796

In [114]:
# sample movie
film = reviews.columns[np.random.randint(len(reviews.columns))]
# sample rating
rating = 3

In [115]:
user_input = (film, rating)
user_input

('Dark Star', 3)

In [None]:
# single movie input
#query = query.fillna(mean_values)

In [None]:
#query = np.zeros(len(reviews.columns))
#list(reviews.columns).index(film)

In [None]:
#query.reshape(-1,1).shape

In [None]:
#query[list(reviews.columns).index(film)] = rating

In [116]:
# multiple movies input
user_movies = 'Toy Story', 'Zulu', 'Titanic'

In [117]:
query = np.zeros(len(reviews.columns))
for i in range(len(user_movies)):
        query[list(reviews.columns).index(user_movies_matched[i])] = user_ratings[i]

In [118]:
dfdf=pd.DataFrame()
dfdf['query']=query
dfdf['query'].unique()

array([0., 1., 2., 3.])

In [124]:
new_query=query.reshape(-1,1).T

In [125]:
new_p = nmf.transform(new_query)
new_p.shape

(1, 10)

In [126]:
new_prediction = np.dot(new_p,Q)

In [127]:
pd.DataFrame(new_prediction, columns=reviews.columns)

title,Unnamed: 1,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
0,0.000964,0.000762,0.00077,0.000773,0.000763,0.000777,0.001435,0.00076,0.000778,0.000768,...,0.000776,0.000776,0.000776,0.000776,0.000777,0.00078,0.000781,0.000766,0.003212,0.000786


In [128]:
# Movies suggestion
reviews.columns[np.argsort(new_prediction)[0][-5:-1]]

Index(['American History X', 'Desperado', 'Bambi', 'Canadian Bacon'], dtype='object', name='title')