In [9]:
import os
import time
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Shared vectorizer used by TFIDFModel.train: English stop words removed,
# word-level unigrams through trigrams.
# min_df=1 keeps every term that occurs in at least one document, which is
# the same vocabulary the former integer min_df=0 produced; an integer 0 is
# rejected by the parameter validation in recent scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')

In [60]:
class TFIDFModel:
    """Builds the TF-IDF similarity artifacts and pickles them to disk.

    ``train`` produces an id -> row-position lookup plus a pairwise cosine
    similarity matrix; ``save`` writes both as pickle files into
    ``MODEL_DIR``.
    """

    # Directory that receives the pickled artifacts.
    MODEL_DIR = './models/TFIDF'

    def _save_pickle_file(self, file_name, data):
        """Pickle ``data`` to ``<MODEL_DIR>/<file_name>.pickle``.

        Args:
            file_name: Base name of the target file, without extension.
            data: Any picklable object.
        """
        path = f'{self.MODEL_DIR}/{file_name}.pickle'
        # The context manager closes the handle even if pickling raises.
        with open(path, 'wb') as handle:
            pickle.dump(data, handle)

    def save(self, indices, similarities):
        """Persist the artifacts returned by ``train``.

        Args:
            indices: Lookup from movie id to similarity-matrix row.
            similarities: Pairwise similarity matrix.
        """
        # exist_ok makes repeated saves safe without a check-then-create race.
        os.makedirs(self.MODEL_DIR, exist_ok=True)

        self._save_pickle_file('indices', indices)
        self._save_pickle_file('similarities', similarities)

    @staticmethod
    def train(data_path='../data/movies_data.json'):
        """Fit TF-IDF on the ``content`` column and compute cosine similarities.

        Relies on the module-level ``tf`` vectorizer.

        Args:
            data_path: JSON file in pandas ``orient='split'`` layout that
                provides at least the ``id`` and ``content`` columns.

        Returns:
            A tuple ``(indices, cosine_similarities)`` where ``indices`` is a
            ``pd.Series`` indexed by ``id`` whose values are row positions in
            ``cosine_similarities``.
        """
        data = pd.read_json(data_path, orient='split')
        tfidf_matrix = tf.fit_transform(data['content'])
        cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
        # Rows of the similarity matrix are positional, so store positions
        # rather than index labels; the two only coincide when the frame
        # carries a default 0..N-1 index.
        indices = pd.Series(range(len(data)), index=data['id'])

        return indices, cosine_similarities

In [98]:
class TFIDFRecommender:
    """Serves content-based recommendations from pickled TF-IDF artifacts.

    Attributes are loaded in ``__init__``:
      * ``indices``: lookup from movie id to similarity-matrix row.
      * ``similarities``: pairwise similarity matrix.
    """

    def __init__(self):
        self.indices = TFIDFRecommender.load_pickle_file('./models/TFIDF/indices')
        self.similarities = TFIDFRecommender.load_pickle_file('./models/TFIDF/similarities')

    @staticmethod
    def load_pickle_file(file_name):
        """Load and return the object pickled at ``<file_name>.pickle``.

        Args:
            file_name: Path of the pickle file without the ``.pickle`` suffix.
        """
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files this project produced itself.
        with open(f'{file_name}.pickle', 'rb') as handle:
            return pickle.load(handle)

    def recommend(self, movie_id, n=10):
        """Return the ``n`` movies most similar to ``movie_id``.

        Args:
            movie_id: Id present in ``self.indices``.
            n: Maximum number of recommendations; coerced with ``int``.

        Returns:
            A list of single-entry dicts ``{str(movie_id): similarity}``
            ordered from most to least similar. The queried movie is never
            part of the result.

        Raises:
            KeyError: If ``movie_id`` is not in ``self.indices``.
        """
        start = time.time()

        idx = self.indices[movie_id]
        scores = np.asarray(self.similarities[idx])
        # Stable sort on the negated scores gives descending order while
        # keeping tied entries in their original row order.
        ranked = np.argsort(-scores, kind='stable')
        # Drop the queried movie by position instead of assuming it ranks
        # first: with tied scores or an all-zero row it does not.
        ranked = ranked[ranked != idx][:max(int(n), 0)]
        movie_indices = [
            {str(self.indices[self.indices == position].index[0]): float(scores[position])}
            for position in ranked
        ]

        end = time.time()
        print(f'Finished in: {end - start}')

        return movie_indices

In [76]:
# Training cell: fit TF-IDF on ../data/movies_data.json, then pickle the
# id lookup and the cosine-similarity matrix under ./models/TFIDF.
model = TFIDFModel()
indices, similarities = model.train()
model.save(indices, similarities)

In [99]:
# Inference cell: load the pickled artifacts from ./models/TFIDF and ask for
# recommendations for movie id 480 (default n=10). The returned list of
# {movie_id: similarity} dicts is displayed as the cell output.
recommender = TFIDFRecommender()
recommender.recommend(480)

[(1309, 0.08180589587881092), (9098, 0.0723077629460714), (610, 0.04726097992985522), (8588, 0.03393704935296367), (3453, 0.030844746837884726), (992, 0.0299305717881781), (921, 0.028165307579045073), (1487, 0.027410861346845387), (7130, 0.026949944721288498), (3503, 0.02652744545102256)]
Finished in: 0.012001752853393555


[{'1544': 0.08180589587881092},
 {'117529': 0.0723077629460714},
 {'1017': 0.04726097992985522},
 {'136016': 0.03393704935296367},
 {'4519': 0.030844746837884726},
 {'888': 0.0299305717881781},
 {'420': 0.028165307579045073},
 {'1396': 0.027410861346845387},
 {'69278': 0.026949944721288498},
 {'4445': 0.02652744545102256}]