In [1]:
import pandas as pd
import numpy as np
import sys
import os
import csv
import json
from pathlib import Path
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel
import math
import time
import pickle
import dill
import string
import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
sys.path.append(str(Path.cwd().parent))

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
dataset = Path.cwd().parent/"input/opera-dtestone2/opera_dataset.csv"

In [3]:
data = pd.read_csv(dataset)
data.head()

Unnamed: 0,Name,Description,Composer,Genre
0,L'Orfeo,In the fields of Thrace the marriage of Orpheu...,Claudio Monteverdi,Baroque
1,Orfeo ed Euridice,"Orpheus stands before Eurydice's tomb, lost in...",Christoph Willibald Gluck,Classical
2,Orphée aux Enfers,A melodrama (Introduction and Melodrame) opens...,Jacques Offenbach,Opera Comic
3,Ormindo,"The Maghreb princes, Amida and Ormindo, who ar...",Francesco Cavalli,Baroque
4,Giasone,Hercules (Ercole) has persuaded Jason (Giasone...,Francesco Cavalli,Baroque


In [4]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 1), stop_words='english', decode_error = 'ignore')

tfidf_matrix = tf.fit_transform(data['Description'].values.astype(str))
print(tf.vocabulary_)



In [5]:
tic = time.process_time()
similarity = cosine_similarity(tfidf_matrix,tfidf_matrix)
print(similarity)
toc = time.process_time()
print(toc - tic)

[[1.00000000e+00 5.24014328e-01 1.98835549e-01 3.74938950e-03
  2.03955966e-02 8.47039840e-03 2.26060847e-02 9.26076549e-03
  1.56264795e-02 4.04086360e-02 1.64166426e-02 2.81433724e-02
  1.12684182e-02 1.00649154e-02 1.60461621e-02 1.15873445e-02
  2.11193383e-02 1.97487431e-02 3.12893261e-02 1.68197459e-02
  1.92416029e-02 9.29702898e-03 1.13765335e-02 6.97202912e-03
  3.32585822e-03 9.03740079e-03]
 [5.24014328e-01 1.00000000e+00 2.48232627e-01 2.57013330e-02
  1.83214121e-02 1.22167416e-02 3.64503633e-02 3.38965055e-02
  1.96510606e-02 3.96176674e-02 2.68141502e-02 4.32213545e-02
  1.90815873e-02 2.36447851e-02 1.16504367e-02 2.58396284e-02
  3.35047434e-02 2.05208462e-02 1.79704656e-02 3.99780317e-02
  3.83454597e-02 6.50587270e-03 2.02295626e-02 1.37707220e-02
  1.49360635e-02 1.61001732e-02]
 [1.98835549e-01 2.48232627e-01 1.00000000e+00 1.14658591e-02
  2.18525541e-02 1.38388784e-02 3.26773490e-02 1.22437979e-02
  3.48754615e-02 7.75716386e-02 1.78775590e-02 3.88462930e-02
  2.

In [6]:
def recommend(title):
    
    # initialise empty list of recommended movies
    l = []
    # get the index in tmdb corresponding to movie title that has been passed to the function
    idx = next(iter(data[data['Name'].str.lower()==title.lower()].index), 'no match')

    # creating series of the similarity scores corresponding to the index in descending order
    sim_score = pd.Series(similarity[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies - index 0 is the film passed to the function
    top_10_index= list(sim_score.iloc[1:11].index)
    top_10_score = sim_score.iloc[1:11]
    avg_similarity = np.mean(top_10_score)
    
    # populating the list with the titles of the best 10 matching movies
    
    for i in top_10_index:
        l.append(data.loc[i,'Name'])
    
    top_10 = pd.DataFrame( np.column_stack((top_10_score,l)), columns=['similarity','Name']) 
    
    return top_10,avg_similarity

In [7]:
recommend('Medea')

(             similarity                Name
 0    0.4027213194487712             Giasone
 1    0.0366462337126598             Ormindo
 2   0.03174059693961892            Turandot
 3  0.026387936992844727         Les Troyens
 4  0.021864465801626768              Armide
 5   0.02150983598328334     Die Zauberflöte
 6   0.01736483285991814    Castor et Pollux
 7  0.016269567709492373    Madama Butterfly
 8  0.015371048063446427     Dido and Aeneas
 9   0.01387613669538401  Le Nozze di Figaro,
 0.06037519742070459)