### Imports

In [30]:
import pandas as pd
import ast
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm, tnrange
tqdm.pandas()
import dateutil
import datetime

#### Create Dataset

In [31]:
# Load the movie table; the first CSV column is used as the row index.
df = pd.read_csv("./data.csv", index_col=0)

In [32]:
# Columns whose cells were written to the CSV as Python-literal strings
# (presumably lists -- confirm against the data) and must be parsed back.
values = ['director_anchors','genre','star_anchs','summary_tokens']
for val in values:
  # String cells are parsed with ast.literal_eval (safe: literals only, no
  # arbitrary code); every non-string cell is replaced by NaN.
  # isinstance is used instead of an exact type comparison so str subclasses
  # are parsed too rather than silently turned into NaN.
  df[val]=df[val].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else np.nan)

In [33]:
def createDates(row):
    """Normalise one raw release-date value to a ``YYYY-MM-DD`` string.

    Args:
        row: A single cell value. Only strings are parsed.

    Returns:
        The parsed date formatted as ``'%Y-%m-%d'``, or ``np.nan`` when the
        value is not a string or cannot be parsed as a date.

    NOTE(review): dateutil fills fields that are absent from the string
    (e.g. the day in "May 2010") from the current date, so results for
    partial dates depend on when this runs -- confirm that is acceptable.
    """
    # Import the submodule explicitly: a bare ``import dateutil`` does not
    # guarantee that ``dateutil.parser`` has been loaded; it only resolves
    # when the library lazy-loads it or another package imported it first.
    from dateutil import parser as date_parser

    if not isinstance(row, str):
        return np.nan
    try:
        return date_parser.parse(row).strftime('%Y-%m-%d')
    except (ValueError, OverflowError):
        # Unparseable text is treated like a missing value so one bad cell
        # does not abort the whole column conversion.
        return np.nan

def createDateTime(row):
    """Build a ``datetime.datetime`` from a dash-separated date string.

    The string is split on "-" and its first three fields are read as
    year, month and day; any further fields are ignored. The time part of
    the result is midnight.
    """
    fields = row.split("-")
    return datetime.datetime(
        year=int(fields[0]),
        month=int(fields[1]),
        day=int(fields[2]),
    )

# tqdm.pandas is re-registered before every step so that each progress bar
# carries its own description.

# Normalise the raw release values, then discard rows that produced no date
# and renumber the index from zero.
tqdm.pandas(desc="Creating Date")
df['date'] = df['release'].progress_apply(createDates)
df = df.dropna(subset=['date']).reset_index(drop=True)

# Turn the date strings into datetime objects.
tqdm.pandas(desc="Creating Datetime")
df['datetime'] = df['date'].progress_apply(createDateTime)

# Split out the calendar year and month as separate columns.
tqdm.pandas(desc="Creating Year Column")
df['year'] = df['datetime'].progress_apply(lambda stamp: stamp.year)

tqdm.pandas(desc="Creating Month Column")
df['month'] = df['datetime'].progress_apply(lambda stamp: stamp.month)

Creating Date:   0%|          | 0/85311 [00:00<?, ?it/s]

Creating Datetime:   0%|          | 0/75938 [00:00<?, ?it/s]

Creating Year Column:   0%|          | 0/75938 [00:00<?, ?it/s]

Creating Month Column:   0%|          | 0/75938 [00:00<?, ?it/s]

In [34]:
# Show the (rows, columns) dimensions of the frame at this point.
df.shape

(75938, 15)

In [35]:
# Preview the dimensions that remain when only rows with year > 1970 are kept
# (df itself is not modified here).
df.loc[df['year']>1970].shape

(53685, 15)

In [36]:
# Keep only rows whose year is strictly greater than 1970 and renumber the
# index from zero so labels and row positions coincide again.
df = df[df['year'] > 1970].reset_index(drop=True)

#### Create Cosine Similarities

In [None]:
# Stack the per-movie token vectors into one int8 matrix, one row per movie
# (assumes every summary_tokens entry has the same length -- TODO confirm).
token_matrix = np.array(df['summary_tokens'].tolist(), dtype='int8')

# Pairwise cosine similarity between all rows. The result is a dense
# n_rows x n_rows matrix, so memory use grows quadratically with len(df).
similarities = cosine_similarity(token_matrix)

In [None]:
def recommendations(unique_id, cosine_sim = similarities, top_n = 10):
    """Return the index labels of the movies most similar to a given movie.

    Args:
        unique_id: Value matched against ``df.profile`` to identify the query
            movie. If several rows match, the first one is used.
        cosine_sim: Square similarity matrix whose rows and columns follow
            the row order of ``df``. Defaults to the module-level
            ``similarities`` object that existed when this function was
            defined.
        top_n: Maximum number of recommendations to return (non-negative).

    Returns:
        A list of at most ``top_n`` labels taken from ``df.index``, ordered
        from most to least similar. The query movie itself is never included.

    Raises:
        IndexError: If no row of ``df`` has ``profile == unique_id``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n!r}")

    # Work with row positions: the similarity matrix is addressed by position,
    # which only equals the index label when df has a fresh 0..n-1 index.
    hits = np.flatnonzero((df.profile == unique_id).to_numpy())
    if hits.size == 0:
        raise IndexError(f"no row in df has profile == {unique_id!r}")
    query_pos = int(hits[0])

    # The Series index is 0..n-1, i.e. the row positions of df.
    scores = pd.Series(cosine_sim[query_pos])

    # Remove the query movie explicitly instead of assuming it sorts first:
    # another movie with an equal score could otherwise push it out of the
    # first slot and it would end up recommending itself. A stable sort keeps
    # the order of tied scores deterministic.
    ranked = scores.drop(query_pos).sort_values(ascending=False, kind="mergesort")
    top_positions = ranked.index[:top_n].to_numpy()

    # Translate positions back to df's index labels in a single lookup.
    return df.index[top_positions].tolist()

In [None]:
# Recommendations for the movie in the row labelled 0.
# Note: this rebinds `values`, which earlier held the list of column names.
values = recommendations(unique_id=df.profile[0])

In [None]:
# Print the summary of every recommended row, each followed by a separator line.
for val in values:
    print(df.iloc[val].summary, "=================", sep="\n")

In [None]:
# Display the full rows of the recommended movies, selected by position.
df.iloc[values]