In [3]:
import pandas as pd
import numpy as np

In [63]:
df=pd.read_csv("imdb_netflix_clean.csv")

In [5]:
df['description'].head(5)

0    After a devastating earthquake hits Mexico Cit...
1    When an army recruit is found dead, his fellow...
2    In a postapocalyptic world, rag-doll robots hi...
3    In a postapocalyptic world, rag-doll robots hi...
4    A brilliant group of students become card-coun...
Name: description, dtype: object

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
tfidf=TfidfVectorizer(stop_words='english')

df['description']=df['description'].fillna('')
tfidf_matrix=tfidf.fit_transform(df['description'])

tfidf_matrix.shape

(5510, 13984)

In [12]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim=linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
#We are going to define a function that takes in a movie title as an input and outputs a list of the 10 most similar movies. Firstly, for this, we need a reverse mapping of movie titles and DataFrame indices. In other words, we need a mechanism to identify the index of a movie in our metadata DataFrame, given its title.

In [14]:
#Construct a reverse map of indices and movie titles
indices=pd.Series(df.index, index=df['title']).drop_duplicates()

In [28]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.stats import pearsonr

In [41]:
def get_similarity_matrix(features, method="cosine"):
    """Returnează matricea de similaritate pe baza metodei alese."""
    if method == "cosine":
        return cosine_similarity(features)
    elif method == "euclidean":
        # distanțele Euclidiene -> le inversăm ca să devină similarități (1 / (1 + distanță))
        dist = euclidean_distances(features)
        return 1 / (1 + dist)    
    else:
        raise ValueError("Metodă necunoscută. Folosește: 'cosine', 'euclidean' sau 'pearson'.")


def get_recommendation(title, df, indices, features, method="cosine"):
    """Returnează titlurile similare și scorurile lor, în funcție de metoda aleasă."""
    cosine_sim = get_similarity_matrix(features, method)
    
    idx = indices[title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = [(i, float(score)) for i, score in sim_scores if not np.isnan(score)]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    
    movie_indices = [i[0] for i in sim_scores]
    scores = [i[1] for i in sim_scores]
    
    recommendations = pd.DataFrame({
        'title': df['title'].iloc[movie_indices].values,
        'score': scores
    })
    
    print(f"Top recomandări pentru '{title}' folosind {method} similarity:\n")
    for t, s in zip(recommendations['title'], recommendations['score']):
        print(f"🎬 {t} — scor: {s:.4f}")
    
    return recommendations

In [42]:
get_recommendation('365 days', df, indices, tfidf_matrix, method='cosine')


Top recomandări pentru '365 days' folosind cosine similarity:

🎬 365 days — scor: 1.0000
🎬 holding the man — scor: 0.1715
🎬 our godfather — scor: 0.1439
🎬 find yourself — scor: 0.1363
🎬 14 minutes from earth — scor: 0.1311
🎬 juman — scor: 0.1272
🎬 paper year — scor: 0.1239
🎬 el silencio es bienvenido — scor: 0.1225
🎬 singles villa — scor: 0.1187
🎬 aapla manus — scor: 0.1183


Unnamed: 0,title,score
0,365 days,1.0
1,holding the man,0.171482
2,our godfather,0.143919
3,find yourself,0.136285
4,14 minutes from earth,0.13105
5,juman,0.127217
6,paper year,0.123942
7,el silencio es bienvenido,0.122452
8,singles villa,0.118727
9,aapla manus,0.11829


In [43]:
get_recommendation('365 days', df, indices, tfidf_matrix, method='euclidean')


Top recomandări pentru '365 days' folosind euclidean similarity:

🎬 365 days — scor: 1.0000
🎬 holding the man — scor: 0.4372
🎬 our godfather — scor: 0.4332
🎬 find yourself — scor: 0.4321
🎬 14 minutes from earth — scor: 0.4314
🎬 juman — scor: 0.4308
🎬 paper year — scor: 0.4304
🎬 el silencio es bienvenido — scor: 0.4301
🎬 singles villa — scor: 0.4296
🎬 aapla manus — scor: 0.4296


Unnamed: 0,title,score
0,365 days,1.0
1,holding the man,0.437205
2,our godfather,0.433182
3,find yourself,0.432093
4,14 minutes from earth,0.431352
5,juman,0.430812
6,paper year,0.430353
7,el silencio es bienvenido,0.430144
8,singles villa,0.429625
9,aapla manus,0.429565


In [45]:
#We are going to build a recommender based on the following metadata: the 3 top actors, the director, related genres and the movie plot keywords.

In [65]:
from ast import literal_eval

def safe_to_list(x):
   
    if isinstance(x, list):
        return x
    
    elif isinstance(x, str) and x.strip().startswith('[') and x.strip().endswith(']'):
        try:
            return literal_eval(x)
        except Exception:
            return []
    
    elif isinstance(x, str):
        return [i.strip() for i in x.split(',') if i.strip() != '']
    
    else:
        return []


features = ['cast', 'genres', 'rating', 'director']
for feature in features:
    df[feature] = df[feature].apply(safe_to_list)


def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    elif isinstance(x, str):
        return str.lower(x.replace(" ", ""))
    else:
        return ''

for feature in features:
    df[feature] = df[feature].apply(clean_data)

df.head()


Unnamed: 0,id,title,director,cast,country,release_year,date_added,rating,description,runtime,...,type_tvSpecial,type_video,rating_class,num_votes_log,cluster,cluster_v2,PCA1_v2,PCA2_v2,predicted_class_num,predicted_class
0,0,7:19,[jorgemichelgrau],"[demiánbichir, héctorbonilla, oscarserrano, az...",Mexico,2016,2016-12-23,[tv-ma],After a devastating earthquake hits Mexico Cit...,94,...,0,0,Average,6.361302,1,3,0.126578,1.738837,0,Average
1,1,23:59,[gilbertchan],"[teddchan, stellachung, henleyhii, lawrencekoh...",Singapore,2011,2018-12-20,[r],"When an army recruit is found dead, his fellow...",78,...,0,0,Bad,6.754604,2,1,-0.01017,-0.632712,1,Bad
2,2,9,[shaneacker],"[elijahwood, johnc.reilly, jenniferconnelly, c...",United States,2009,2017-11-16,[pg-13],"In a postapocalyptic world, rag-doll robots hi...",79,...,0,0,Good,11.807429,4,4,0.333948,-3.5544,3,Good
3,3,9,[shaneacker],"[elijahwood, johnc.reilly, jenniferconnelly, c...",United States,2009,2017-11-16,[pg-13],"In a postapocalyptic world, rag-doll robots hi...",9,...,0,0,Average,4.406719,2,0,-2.560098,-0.342676,3,Good
4,4,21,[robertluketic],"[jimsturgess, kevinspacey, katebosworth, aaron...",United States,2008,2020-01-01,[pg-13],A brilliant group of students become card-coun...,123,...,0,0,Good,12.352581,4,3,2.357464,2.023064,3,Good


In [56]:
def create_soup(x):
    return ' '.join(x['cast'])+' '+' '.join(x['genres'])+' '+' '.join(x['rating'])+' '+' '.join(x['director'])
for col in ['cast', 'genres', 'rating', 'director']:
    df[col] = df[col].fillna('').apply(lambda x: [i.strip() for i in x.split(',')] if isinstance(x, str) else [])

df['soup']=df.apply(create_soup,axis=1)

In [57]:
df['soup'].head(10)

0       
1       
2       
3       
4       
5       
6       
7       
8       
9       
Name: soup, dtype: object

In [66]:
for col in ['cast', 'genres', 'rating', 'director']:
    df[col] = df[col].apply(lambda x: x if isinstance(x, list) else [])

def create_soup(x):
    return ' '.join(x['cast']) + ' ' + ' '.join(x['genres']) + ' ' + ' '.join(x['rating']) + ' ' + ' '.join(x['director'])

df['soup'] = df.apply(create_soup, axis=1)

print(df['soup'].isna().sum(), "NaN values in soup")
print(df['soup'].str.strip().eq('').sum(), "empty strings in soup")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

count = CountVectorizer(stop_words='english')

valid_soup = df['soup'][df['soup'].str.strip() != '']
count_matrix = count.fit_transform(valid_soup)

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

 df = df.reset_index(drop=True)


0 NaN values in soup
0 empty strings in soup


In [75]:
indices=pd.Series(df.index, index=df['title'])
get_recommendation('bert kreischer: secret time', df, indices, tfidf_matrix, method='cosine')


Top recomandări pentru 'bert kreischer: secret time' folosind cosine similarity:

🎬 bert kreischer: hey big boy — scor: 0.3578
🎬 bert kreischer: the machine — scor: 0.2428
🎬 hasan minhaj: homecoming king — scor: 0.2108
🎬 john mulaney: kid gorgeous at radio city — scor: 0.1816
🎬 moshe kasher: live in oakland — scor: 0.1662
🎬 jenny slate: stage fright — scor: 0.1281
🎬 ken jeong: you complete me, ho — scor: 0.1256
🎬 michael mcintyre: showman — scor: 0.1231
🎬 jim gaffigan: obsessed — scor: 0.1222
🎬 hannah gadsby: nanette — scor: 0.1179


Unnamed: 0,title,score
0,bert kreischer: hey big boy,0.357768
1,bert kreischer: the machine,0.242759
2,hasan minhaj: homecoming king,0.210836
3,john mulaney: kid gorgeous at radio city,0.18158
4,moshe kasher: live in oakland,0.166242
5,jenny slate: stage fright,0.128113
6,"ken jeong: you complete me, ho",0.125582
7,michael mcintyre: showman,0.123089
8,jim gaffigan: obsessed,0.122174
9,hannah gadsby: nanette,0.117944
