[Reference](https://betterprogramming.pub/how-to-build-a-recommender-system-using-netflix-data-562d1e04a2b9)

In [1]:
import pandas as pd
# Read the Netflix titles dataset (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
df.head(n=5)

In [2]:
# Drop the identifier, date, year and type columns.
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so a second
# execution would otherwise raise KeyError because the labels are already gone.
df = df.drop(
    columns=['show_id', 'date_added', 'release_year', 'type'],
    errors='ignore',
)
df.head()

In [3]:
# Number of missing values in each column.
df.isna().sum(axis=0)

In [4]:
# Toy corpus used to demonstrate the bag-of-words encoding.
text_list = [
    "computers can not read natural language",
    "the natural language must be converted into numbers",
    "there are many different ways to do that",
]

In [5]:
import numpy as np
from nltk.tokenize import word_tokenize
from collections import defaultdict 
 
# Tokenise every sentence of the corpus, keeping only purely alphabetic tokens
# and lower-casing them.
sentences = [
    [token.lower() for token in word_tokenize(text) if token.isalpha()]
    for text in text_list
]

# Vocabulary = distinct words in first-seen order. dict.fromkeys de-duplicates
# in one pass while preserving insertion order, replacing the quadratic
# `word not in vocab` scan over a growing list.
vocab = list(dict.fromkeys(word for sentence in sentences for word in sentence))

# Length of every bag-of-words vector: one slot per vocabulary word.
len_vector = len(vocab)
print(len_vector)

In [6]:
# Map each vocabulary word to its position in `vocab`.
index_word = {word: position for position, word in enumerate(vocab)}
print(index_word)
{'computers': 0, 'can': 1, 'not': 2, 'read': 3, 'natural': 4, 'language': 5, 'the': 6, 'must': 7, 'be': 8, 'converted': 9, 'into': 10, 'numbers': 11, 'there': 12, 'are': 13, 'many': 14, 'different': 15, 'ways': 16, 'to': 17, 'do': 18, 'that': 19}

In [7]:
def bag_of_words(sent):
    """Encode a tokenised sentence as a vector of word counts.

    Parameters
    ----------
    sent : iterable of str
        Tokens of one sentence.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``len_vector``; the entry at
        ``index_word[token]`` holds how many times ``token`` occurs in
        ``sent``. Tokens that are not keys of ``index_word`` are ignored
        rather than raising ``KeyError``, so sentences from outside the
        original corpus can be encoded as well.
    """
    vec = np.zeros(len_vector, dtype=int)
    for token in sent:
        position = index_word.get(token)
        if position is not None:  # skip out-of-vocabulary tokens
            vec[position] += 1
    return vec

In [8]:
# Encode the first tokenised sentence as a count vector and show it.
vector = bag_of_words(sent=sentences[0])
print(vector)

In [9]:
import math
import re
from collections import Counter

# Pre-compiled pattern matching runs of one or more word characters (\w).
WORD = re.compile(pattern=r"\w+")

In [10]:
def text_to_vector(text):
    """Count how often each ``WORD`` match occurs in ``text``.

    Returns a ``Counter`` keyed by the matched substrings exactly as they
    appear in ``text`` (no case folding is applied).
    """
    return Counter(WORD.findall(text))
def get_cosine(desc1, desc2):
    """Return the cosine similarity of the word-count vectors of two texts.

    Both texts are converted to word-count mappings with ``text_to_vector``.
    The result is the dot product of the two count vectors divided by the
    product of their Euclidean norms. ``0.0`` is returned when that product
    is zero, e.g. when either text yields no words.
    """
    counts1 = text_to_vector(desc1)
    counts2 = text_to_vector(desc2)

    # Product of the two vector lengths (square roots of summed squared counts).
    norm_product = (
        math.sqrt(sum(count ** 2 for count in counts1.values()))
        * math.sqrt(sum(count ** 2 for count in counts2.values()))
    )
    if not norm_product:
        return 0.0

    # Only words present in both texts contribute to the dot product.
    shared_words = counts1.keys() & counts2.keys()
    dot_product = sum(counts1[word] * counts2[word] for word in shared_words)
    return float(dot_product) / norm_product

In [11]:
# Reference description that every title's description is compared against.
description2 = (
    "In a future where the elite inhabit an island paradise far from the "
    "crowded slums, you get one chance to join the 3% saved from squalor."
)

In [12]:
# Broadcast the reference description into a column so each row carries it.
df = df.assign(description2=description2)

In [13]:
# Score every title's description against the reference text `description2`
# (the same string the 'description2' column is filled with).
# Applying over the 'description' Series avoids materialising a row object per
# record, which is what DataFrame.apply(axis=1) does just to read one column.
df['cosine'] = df['description'].apply(lambda text: get_cosine(text, description2))

In [14]:
# Preview the frame with the newly added similarity column.
df.head(n=5)

In [15]:
# Rank titles from most to least similar to the reference description
# and show the ten best matches.
df_sorted = df.sort_values(by='cosine', ascending=False)
df_sorted.head(n=10)