In [1]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the three source tables from the working directory.
# The former mid-cell `.head()` calls were dropped: only the last expression
# of a cell is displayed, so they computed a preview and threw it away.
albums = pd.read_csv('albums.csv')
bands = pd.read_csv('bands.csv')
reviews = pd.read_csv('reviews.csv')

# Replace NaN with an empty string so every review can be vectorized.
reviews['content'] = reviews['content'].fillna('')

# Define a TF-IDF vectorizer that removes English stop words such as 'the', 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix by fitting and transforming the review texts.
tfidf_matrix = tfidf.fit_transform(reviews['content'])

# Output the shape of tfidf_matrix.
tfidf_matrix.shape

(21510, 87131)

In [2]:
# linear_kernel returns plain dot products between rows.
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of every review against every other review.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# L2-normalised, which is TfidfVectorizer's documented default (norm='l2').
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [3]:
# Construct a reverse map from review text to its row label in `reviews`.
# Series.drop_duplicates() compares *values*, and the values here are the
# row labels themselves, so it could never remove anything. Repeated review
# texts (for example several rows whose missing content was filled with '')
# have to be dropped on the index instead, keeping the first occurrence, so
# that looking up a text yields a single row label.
content_to_row = pd.Series(reviews.index, index=reviews['content'])
indices = content_to_row[~content_to_row.index.duplicated(keep='first')]
indices.shape



(21510,)

In [4]:
# Position of the query review in `cosine_sim`, and how many neighbours to keep.
index = 0
top_n = 5

# Pair every review position with its similarity to the query review.
# The query itself is excluded by position instead of assuming it sorts
# first: with tied scores or an all-zero row it is not guaranteed to be rank 0.
sim_scores = [
    (position, score)
    for position, score in enumerate(cosine_sim[index])
    if position != index
]

# Get the top_n most similar reviews, best match first.
sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[:top_n]
sim_scores

[(20347, 0.2775676666110631),
 (9763, 0.21185362197211285),
 (12190, 0.19241484297501776),
 (11156, 0.19089152727849493),
 (15314, 0.18726147779156557)]

In [5]:
# Positions (within `reviews`) of the recommended reviews, best match first.
rec_indices = [position for position, _score in sim_scores]

# Album id that each recommended review belongs to.
album_ids = reviews['album'].iloc[rec_indices]
print(album_ids)

# One slice of `albums` per recommended review, kept in ranked order.
# The iteration variable is deliberately not called `id`: a top-level loop
# variable with that name would overwrite the builtin id() for the rest of
# the kernel session.
album_names = [albums.loc[albums['id'] == album_id] for album_id in album_ids]
print(album_names)

# Show every review whose album id is 67.
reviews.loc[reviews['album'] == 67]

20347    27390
9763     14389
12190    19172
11156    17554
15314    23267
Name: album, dtype: int64
[          id   band  title  year
27389  27390  37439  Golem  1988,           id   band                   title  year
14388  14389  22382  Machinegunnery of Doom  1997,           id   band                     title  year
19171  19172  29539  Carving a Crimson Career  1999,           id   band      title  year
17553  17554  27467  Hellbound  2008,           id   band               title  year
23266  23267  34100  Breathing the Fire  2009]


Unnamed: 0,id,album,title,score,content
12,13,67,Pure Fucking Goremageddon!,0.87,This was my first Aborted album| I basically c...
13,14,67,Essential Brutality,1.0,Aborted quickly became one of my favorite deat...
14,15,67,Now this is Goregrind at it's finest,0.95,I gotta say I've always been an avid Aborted f...
15,16,67,Aborted will carve you up!,0.86,‘The Doctor is in….’ – the first four words ut...
16,17,67,yet another gore band,0.87,now lets get one thing sorted straight from th...
17,18,67,Saw wielding brutality,0.85,This album has quite possibly the most appropr...


In [6]:
# Project-local modules; their source is not part of this notebook.
import recommend
import helpers
# NOTE(review): `data` is not referenced by any later cell shown here.
# Confirm whether read_data() is needed for side effects before removing it.
data = helpers.read_data()
# Recommender instance used by the following cells, which read its `.reviews`
# and `.albums` attributes and call get_album_recommendations_from_review().
recommender = recommend.Recommender()


In [7]:
import pandas as pd

# Text of the first review, used as the query.
query_review = recommender.reviews.iloc[0]['content']

# Reverse map from review text to its row label in recommender.reviews.
# Series.drop_duplicates() compares values (the already-unique row labels),
# so it removed nothing; repeated texts are dropped on the index instead,
# keeping the first occurrence.
review_lookup = pd.Series(recommender.reviews.index, index=recommender.reviews['content'])
review_lookup = review_lookup[~review_lookup.index.duplicated(keep='first')]

# NOTE(review): assumes the method looks reviews up by text in `indices` — confirm.
recommender.get_album_recommendations_from_review(query_review, indices=review_lookup)



20347    27390
9763     14389
12190    19172
11156    17554
15314    23267
Name: album, dtype: int64


[          id   band  title  year
 27389  27390  37439  Golem  1988,
           id   band                   title  year
 14388  14389  22382  Machinegunnery of Doom  1997,
           id   band                     title  year
 19171  19172  29539  Carving a Crimson Career  1999,
           id   band      title  year
 17553  17554  27467  Hellbound  2008,
           id   band               title  year
 23266  23267  34100  Breathing the Fire  2009]

In [12]:
# Score of the first review, converted to a built-in Python float.
float(recommender.reviews['score'].iloc[0])

0.88

In [9]:
# Select the `id` values of the albums whose title is exactly 'Taste the Hate'.
albums_table = recommender.albums
title_matches = albums_table['title'] == 'Taste the Hate'
album_id = albums_table.loc[title_matches, 'id']
album_id

0    1
Name: id, dtype: int64