In [143]:
import pandas as pd
import numpy as np
import re
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

pd.__version__, np.__version__, re.__version__, sklearn.__version__


('0.25.0', '1.16.0', '2.2.1', '0.23.1')

In [81]:
print("columns:", movies_df.columns)
print("Shape:", movies_df.shape)

columns: Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')
Shape: (34886, 8)


In [82]:
str_len_plot_ser = movies_df['Plot'].str.len()
print("MIN:", np.min(str_len_plot_ser), 
      "\tMEAN:", np.mean(str_len_plot_ser), 
      "\tMAX:", np.max(str_len_plot_ser), 
      "\nVal counts bins:", str_len_plot_ser.value_counts(bins = 10))

MIN: 15 	MEAN: 2165.0345410766495 	MAX: 36773 
Val counts bins: (-21.759, 3690.8]     27622
(3690.8, 7366.6]       6901
(7366.6, 11042.4]       306
(11042.4, 14718.2]       38
(14718.2, 18394.0]       10
(18394.0, 22069.8]        3
(33097.2, 36773.0]        2
(29421.4, 33097.2]        2
(25745.6, 29421.4]        1
(22069.8, 25745.6]        1
Name: Plot, dtype: int64


In [83]:
embeddings_dict = {}
with open("glove.6B.100d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [84]:
def get_sent_vector_avg(inp_sent):
    try:
        inp_sent_lw = inp_sent.lower()
        vec_holder = []
        for word in inp_sent_lw.split():
            if word not in embeddings_dict:
                continue
            word_emb_ = embeddings_dict[word]
            vec_holder.append(word_emb_)
        average_vec = np.mean(vec_holder, axis = 0)
        return average_vec
    except:
        return None
    

In [85]:
def get_similars(inp_name, dict_to_consider):
    inp_name_vec = dict_to_consider[inp_name]
    cos_sim_res = cosine_similarity([inp_name_vec], list(dict_to_consider.values()))
    
    dict_keys = list(dict_to_consider.keys())
    
    TOP_N = 20
    top_n_ind = cos_sim_res[0].argsort()[-TOP_N:]
    top_n_similarities = [key_ for ind, key_ in enumerate(dict_keys) if ind in top_n_ind]
    top_n_similarities.reverse()
    return top_n_similarities

In [109]:
movies_df_vec_df = movies_df.copy()

split_feature_func = lambda x: None if np.all(pd.isnull(x)) else x.replace(", ", ",").split(",")
movies_df_vec_df['Cast'] = movies_df_vec_df['Cast'].apply(split_feature_func)
movies_df_vec_df['Director'] = movies_df_vec_df['Director'].apply(split_feature_func)
movies_df_vec_df['Genre'] = movies_df_vec_df['Genre'].apply(split_feature_func)

movies_df_vec_df['sent_vec'] = movies_df_vec_df['Plot'].apply(get_sent_vector_avg)

  out=out, **kwargs)


In [110]:

title_emb_dict = movies_df_vec_df.groupby("Title")["sent_vec"].apply(lambda x: np.mean(x, axis = 0)).dropna().to_dict()

director_emb_dict = movies_df_vec_df.explode("Director")\
            .groupby("Director")["sent_vec"].apply(lambda x: np.mean(x, axis = 0)).dropna().to_dict()
    
cast_emb_dict = movies_df_vec_df.explode("Cast")\
            .groupby("Cast")["sent_vec"].apply(lambda x: np.mean(x, axis = 0)).dropna().to_dict()

genre_emb_dict = movies_df_vec_df.explode("Genre")\
            .groupby("Genre")["sent_vec"].apply(lambda x: np.mean(x, axis = 0)).dropna().to_dict()


In [114]:
# Just return the value based on the key of the dict with error handling
def get_vector(inp, vec_dict):
    try:
        return vec_dict[inp]
    except:
        return None
# Transformation is made flexible. It could take list as input
def get_vector_with_list(inp, vec_dict):
    try:
        return np.mean([get_vector(ele, vec_dict) for ele in inp], axis = 0)
    except:
        return None


In [118]:

movies_df_transformed_df = movies_df_vec_df.dropna().copy()

movies_df_transformed_df['Cast_vec'] = movies_df_transformed_df['Cast'].apply(lambda x: get_vector_with_list( 
                                                                              x, cast_emb_dict))
movies_df_transformed_df['Title_vec'] = movies_df_transformed_df['Title'].apply(lambda x: get_vector( 
                                                                              x, title_emb_dict))
movies_df_transformed_df['Genre_vec'] = movies_df_transformed_df['Genre'].apply(lambda x: get_vector_with_list( 
                                                                              x, genre_emb_dict))
movies_df_transformed_df['Director_vec'] = movies_df_transformed_df['Director'].apply(lambda x: get_vector_with_list( 
                                                                              x, director_emb_dict))

movies_df_transformed_df['vector_combined'] = movies_df_transformed_df.apply(
    lambda x: np.concatenate((x['Cast_vec'], x['Title_vec'], x['Genre_vec'], x['Director_vec']), axis = None), 
    axis = 1
)


In [121]:
movie_embeddings_dict = movies_df_transformed_df.set_index("Title")['vector_combined'].to_dict()


In [157]:
check_movie_name = "gravity"
[movie for movie in movies if re.search(check_movie_name, movie, re.IGNORECASE)]

['Defying Gravity', 'Gravity']

In [158]:
################ MOVIE RESULTS ################
INP_MOVIE = "Gravity"
print("*****\nInput Movie:\n", INP_MOVIE, 
      "\n*****\nThe Movies you may like\n\n", "\n".join(get_similars(INP_MOVIE, movie_embeddings_dict)))


*****
Input Movie:
 Gravity 
*****
The Movies you may like

 The Age of Pioneers
Gamera vs. Zigra
Gamera vs. Guiron
Gamera
Gorath
Battle in Outer Space
Thunderbirds Are Go
The Martian
Interstellar
Planes: Fire & Rescue
Gravity
Europa Report
Apollo 18
Virus
Armageddon
Star Trek: First Contact
DeepStar Six
The Abyss
Airport '77
Dark Star


In [89]:

check_genre_name = "  "
genres = list(genre_emb_dict.keys())
[genre for genre in genres if re.search(check_genre_name, genre, re.IGNORECASE)]

[]

In [111]:
################ GENRE RESULTS ################
INP_GENRE_ = "romance "
print("*****\nInput Genre:\n", INP_GENRE_,  
      "\n*****\nSimilar Genres\n\n", "\n".join(get_similars(INP_GENRE_, genre_emb_dict)))


*****
Input Genre:
 romance  
*****
Similar Genres

 romantic drama
romantic comedy
romance/thriller
romance/drama
romance/comedy
romance/action
romance drama
romance 
romance
rom-com
musical 
masala
love
family drama
family 
drama romance
comedy romance
comedy 
charmme
adult comedy


In [113]:
check_movie_name = "NO STRINGS"
[movie for movie in movies if re.search(check_movie_name, movie, re.IGNORECASE)]

['No Strings Attached']

In [93]:
################ MOVIE RESULTS ################
INP_MOVIE = "No Strings Attached"
print("*****\nInput Movie:\n", INP_MOVIE, 
      "\n*****\nSimilar Movies\n\n", "\n".join(get_similars(INP_MOVIE, title_emb_dict)))


*****
Input Movie:
 No Strings Attached 
*****
Similar Movies

 What's Your Number?
Walking and Talking
The Wedding Planner
The Perfect Man
Someone Like You
Peggy Sue Got Married
No Strings Attached
Must Love Dogs
Little City
Killing Me Softly
Jumping the Broom
Inconceivable
In the Land of Women
How to Be Single
Half Angel
For Colored Girls
Complete Unknown
Chance
27 Dresses
 A Teacher


In [159]:
check_cast_name = "rajini"
casts = list(cast_emb_dict.keys())
[cast for cast in casts if re.search(check_cast_name, cast, re.IGNORECASE)]

['Rajinikanth', 'T. R. Rajini']

In [160]:
################ MOVIE RESULTS ################
INP_CAST = "Rajinikanth"
print("*****\nInput CAST:\n", INP_CAST, 
      "\n*****\nSimilar CASTS\n\n", "\n".join(get_similars(INP_CAST, cast_emb_dict)))


*****
Input CAST:
 Rajinikanth 
*****
Similar CASTS

 Vikram
Vijayashanti
Vijay
Sridevi
Sathyaraj
Revathi
Rekha
Ramya Krishnan
Rajinikanth
Radha
Raadhika
Manorama
Kamal Hassan
Kamal Haasan
Janagaraj
Gouthami
Goundamani
Chiranjeevi
Bhanupriya
Ajith Kumar
