In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils.extmath import randomized_svd
from sklearn.feature_extraction.text import  TfidfVectorizer
import pickle


In [3]:

# Load the titles catalogue from a CSV file located in the working directory.
platforms = pd.read_csv('plataformas.csv')

In [4]:
# Show the loaded frame as the cell output (rich display of the last expression).
platforms

Unnamed: 0,Id,title,rating,movies_type,score
0,as1,the grand seduction,g,"comedy, drama",3.467131
1,as2,take care good night,13+,"drama, international",3.548682
2,as3,secrets of deception,g,"action, drama, suspense",3.500000
3,as4,pink: staying true,g,documentary,3.538055
4,as5,monster maker,g,"drama, fantasy",3.478992
...,...,...,...,...,...
22993,ns8803,zodiac,r,"cult movies, dramas, thrillers",3.438998
22994,ns8804,zombie dumb,tv-y7,"kids' tv, korean tv shows, tv comedies",3.515947
22995,ns8805,zombieland,r,"comedies, horror movies",3.420945
22996,ns8806,zoom,pg,"children & family movies, comedies",3.588050


In [5]:
# Keep only the columns used by the recommender, limited to the first 10,000 titles.
# reset_index() returns a new frame rather than modifying in place, so it has to be
# part of the assignment chain; this guarantees a 0..n-1 index whose labels match the
# row/column positions of the similarity matrix built from this frame.
user_item = (
    platforms[['Id', 'title', 'score']]
    .head(10000)
    .reset_index(drop=True)
)

In [12]:
# Show the reduced item table as the cell output.
user_item

Unnamed: 0,Id,title,score
0,as1,the grand seduction,3.467131
1,as2,take care good night,3.548682
2,as3,secrets of deception,3.500000
3,as4,pink: staying true,3.538055
4,as5,monster maker,3.478992
...,...,...,...
9995,ds328,disney gallery / star wars: the mandalorian,3.525407
9996,ds329,max keeble's big move,3.514989
9997,ds330,soul,3.564516
9998,ds331,arendelle castle yule log,3.498058


In [13]:
# Persist the item table to CSV; index=False leaves the row index out of the file.
user_item.to_csv('user_item.csv', index=False)

In [6]:
# TF-IDF vectoriser over unigrams and bigrams. min_df / max_df drop terms that appear
# in fewer than 10 titles or in more than half of them, which keeps the vocabulary small.
vectorizer = TfidfVectorizer(min_df=10, max_df=0.5, ngram_range=(1,2))

# Fit on, and transform, the "title" column of the item table.
X = vectorizer.fit_transform(user_item['title'])

# Number of leading rows of user_item covered by the similarity matrix. The dense
# result is N x N, so this cap bounds memory; titles at positions >= N have no row
# in the matrix and therefore cannot be used as a recommendation query.
N_SIMILARITY_ROWS = 5500

# Pairwise cosine similarity between the first N_SIMILARITY_ROWS titles (N x N matrix).
similarity_matrix = cosine_similarity(X[:N_SIMILARITY_ROWS, :])

# Rank-10 randomized SVD of the similarity matrix. The seed is fixed so that
# "Restart & Run All" reproduces the same factors on every run.
n_components = 10
U, Sigma, VT = randomized_svd(
    similarity_matrix,
    n_components=n_components,
    random_state=0,
)

# Low-rank reconstruction U * diag(Sigma) * VT; scaling the columns of U by Sigma is
# equivalent to multiplying by diag(Sigma) without materialising the diagonal matrix.
reduced_similarity_matrix = (U * Sigma) @ VT

In [7]:
# Show the low-rank similarity matrix as the cell output.
reduced_similarity_matrix

array([[ 9.95449954e-01,  2.30347757e-03, -1.91291272e-03, ...,
         4.17850702e-03, -3.80238238e-03,  4.03151271e-03],
       [ 2.30348122e-03,  2.26978452e-04,  2.64151613e-03, ...,
         4.47509904e-04,  1.75550913e-03,  1.28767617e-04],
       [-1.91289264e-03,  2.64152606e-03,  1.17069917e-01, ...,
         5.88227375e-03,  1.21276844e-03, -4.02010143e-04],
       ...,
       [ 4.17861969e-03,  4.47463310e-04,  5.88227729e-03, ...,
         1.31014339e-03,  4.41330012e-03,  3.29514169e-04],
       [-3.80229664e-03,  1.75563260e-03,  1.21274196e-03, ...,
         4.41332006e-03,  3.74214111e-02,  9.83283470e-04],
       [ 4.03151890e-03,  1.28826131e-04, -4.02034605e-04, ...,
         3.29524167e-04,  9.83211740e-04,  4.66750287e-04]])

In [9]:
# Wrap the reduced similarity matrix in a DataFrame (default integer row/column labels)
# and persist it to CSV without writing the row index.
reduced_similarity_df = pd.DataFrame(reduced_similarity_matrix)
reduced_similarity_df.to_csv('reduced_similarity_matrix.csv', index=False)

In [14]:
# Show the similarity DataFrame as the cell output.
reduced_similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5490,5491,5492,5493,5494,5495,5496,5497,5498,5499
0,0.995450,0.002303,-0.001913,3.177226e-04,7.534010e-04,0.002711,-1.725906e-18,0.000367,-1.787840e-18,-0.000654,...,-0.000160,-0.002961,-0.003259,0.461259,0.222428,0.0,0.234894,0.004179,-0.003802,0.004032
1,0.002303,0.000227,0.002642,2.832867e-05,4.803004e-05,0.001101,2.070956e-18,0.000555,7.139032e-19,0.000183,...,0.000078,0.003937,0.001116,0.004096,0.003152,0.0,0.004574,0.000448,0.001756,0.000129
2,-0.001913,0.002642,0.117070,8.184574e-04,2.057974e-04,0.000398,-3.902022e-18,-0.000542,-1.235339e-18,0.000276,...,0.000005,0.161287,0.008642,0.141040,0.111182,0.0,-0.003015,0.005882,0.001213,-0.000402
3,0.000318,0.000028,0.000818,1.222493e-05,2.282698e-07,0.000091,1.000587e-18,0.000018,-2.327354e-19,0.000045,...,0.000012,0.001169,0.000191,0.001112,0.000854,0.0,-0.000053,0.000082,0.000417,0.000006
4,0.000753,0.000048,0.000206,2.201654e-07,5.623055e-05,0.000220,2.424260e-18,-0.000022,1.804490e-18,0.000060,...,0.000001,0.000404,-0.000012,0.000545,0.000384,0.0,-0.000060,0.000123,0.000340,0.000012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
5496,0.234894,0.004574,-0.003015,-5.346857e-05,-5.994078e-05,0.000989,-2.006356e-17,0.020739,-1.024777e-19,0.000602,...,-0.000006,-0.000587,0.021322,0.102539,0.056352,0.0,0.359480,0.007715,0.028197,0.001953
5497,0.004179,0.000447,0.005882,8.188689e-05,1.231037e-04,0.005009,7.661933e-18,0.000742,1.299333e-18,0.000526,...,0.000100,0.008778,0.002223,0.008675,0.006770,0.0,0.007715,0.001310,0.004413,0.000330
5498,-0.003802,0.001756,0.001213,4.168720e-04,3.402343e-04,0.010049,8.493575e-17,0.004775,-4.064392e-18,0.003636,...,0.001151,0.006417,0.013801,-0.003499,0.001196,0.0,0.028197,0.004413,0.037421,0.000983


In [10]:
def get_recommendation(titulo: str, n: int = 5):
    """Return the titles most similar to ``titulo``.

    Looks ``titulo`` up in the module-level ``user_item`` frame and ranks the other
    items by their score in the corresponding row of the module-level
    ``reduced_similarity_matrix``.

    Parameters
    ----------
    titulo : str
        Exact title to look up in ``user_item['title']``.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of str or None
        Up to ``n`` titles ordered from most to least similar, never including the
        queried item itself. ``None`` (after printing a notice) when the title is not
        among the rows covered by the similarity matrix.
    """
    # Only the first `n_cubiertos` rows of user_item have a row/column in the matrix,
    # so the lookup is restricted to them instead of relying on an IndexError later.
    n_cubiertos = reduced_similarity_matrix.shape[0]
    titulos = user_item['title'].iloc[:n_cubiertos]

    coincidencias = np.flatnonzero((titulos == titulo).to_numpy())
    if coincidencias.size == 0:
        # Unknown (or uncovered) title: notify the user and return None.
        print(f"El título '{titulo}' no se encuentra en la base de datos. Intente con otro título.")
        return None

    # First occurrence of the title; its matrix row holds the similarity scores.
    indice = coincidencias[0]
    puntuaciones_similitud = reduced_similarity_matrix[indice, :]

    # Positions sorted from highest to lowest score.
    puntuacion_ordenada = np.argsort(puntuaciones_similitud)[::-1]
    # Never recommend the queried item itself.
    puntuacion_ordenada = puntuacion_ordenada[puntuacion_ordenada != indice]
    top_indices = puntuacion_ordenada[:max(n, 0)]

    # The sorted values are positions, so select with iloc; label-based selection
    # would only work while the frame keeps a default 0..n-1 index.
    return titulos.iloc[top_indices].tolist()

In [11]:
# Example query using a title that appears in the first row of the catalogue.
get_recommendation('the grand seduction')

['the struggle 2: the dilemma',
 'the flying deuces',
 'the expanse',
 'the fabulous allan carr',
 'the fades']