# Recommendation System using Word2vec


## Preprocessing of the data


In [None]:
import pandas as pd
import warnings

# Suppress every warning for the rest of the session. This is process-wide and
# also hides warnings raised by third-party libraries.
warnings.filterwarnings("ignore")


In [None]:
# Mount Google Drive (Colab only) so the dataset CSV stored under "My Drive"
# can be read from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Metadata columns that the recommender does not use.
unused_columns = [
    "director",
    "cast",
    "country",
    "date_added",
    "release_year",
    "rating",
    "duration",
    "type",
]

# Read the catalogue and keep only show_id, title, listed_in and description.
df_netflix = pd.read_csv(
    "/content/drive/My Drive/Colab Notebooks/netflix_titles.csv"
).drop(columns=unused_columns)
df_netflix.head(3)


Unnamed: 0,show_id,title,listed_in,description
0,s1,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


## Removing Punctuation and Stopwords


In [None]:
# Download NLTK's "punkt" tokenizer data, which word_tokenize needs.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize

# Lower-case and tokenize the text columns. Title tokens go into a new
# "title_list" column so the original "title" strings stay available for
# exact-match lookups; "listed_in" and "description" are replaced in place.
df_netflix["title_list"] = df_netflix["title"].str.lower().apply(word_tokenize)
for text_column in ("listed_in", "description"):
    df_netflix[text_column] = (
        df_netflix[text_column].str.lower().apply(word_tokenize)
    )


In [None]:
# Download NLTK's stop-word corpus, read in the next cell via
# nltk.corpus.stopwords. (nltk is already imported above; re-importing is harmless.)
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
from string import punctuation

# English stop words plus every single punctuation character, held in a set
# for fast membership tests.
list_stopwords = set(stopwords.words("english") + list(punctuation))


def _drop_stopwords(tokens):
    """Return the tokens that are neither stop words nor bare punctuation marks."""
    return [token for token in tokens if token not in list_stopwords]


for token_column in ("title_list", "listed_in", "description"):
    df_netflix[token_column] = df_netflix[token_column].apply(_drop_stopwords)


In [None]:
import string
# Translation table that deletes every ASCII punctuation character.
_punctuation_table = str.maketrans("", "", string.punctuation)


def _strip_punctuation(tokens):
    """Remove punctuation characters from each token and drop tokens left empty."""
    stripped = (token.translate(_punctuation_table) for token in tokens)
    return [token for token in stripped if len(token) > 0]


# Only the description tokens are cleaned this way.
df_netflix["description"] = df_netflix["description"].apply(_strip_punctuation)


In [None]:
# Remove repeated tokens within each row of the token columns.
# Going through a set means the original token order is not preserved.
for token_column in ("title_list", "listed_in", "description"):
    df_netflix[token_column] = df_netflix[token_column].apply(
        lambda tokens: list(set(tokens))
    )

In [None]:
# Load the pre-trained "word2vec-google-news-300" vectors through gensim's
# downloader; `wv` is used below for vocabulary checks and similarity scores.
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')



## Finding Similarities Among Shows using Title, Genres, Description


In [None]:
# Keep only the tokens that exist in the pre-trained Word2Vec vocabulary, so
# that later similarity computations never see out-of-vocabulary words.
#
# Row layout (positional): 0 show_id, 1 title, 2 listed_in, 3 description,
# 4 title_list. `recommendation` relies on the same positions.
vocab = wv.key_to_index
token_positions = (2, 3, 4)

matrix_netflix_vocab = []
for values in df_netflix.to_numpy():
    # Work on a fresh list: the array returned by to_numpy() may share memory
    # with df_netflix (or be read-only), so it must not be assigned into.
    row = list(values)
    for position in token_positions:
        row[position] = [word for word in row[position] if word in vocab]
    matrix_netflix_vocab.append(row)

df_netflix_vocab = pd.DataFrame(matrix_netflix_vocab, columns=df_netflix.columns)


In [None]:
from tqdm import tqdm


def recommendation(title, top_n=10, min_category_score=0.85):
    """Recommend shows similar to ``title`` using Word2Vec similarities.

    Every row of ``matrix_netflix_vocab`` is compared with the row(s) of
    ``df_netflix`` whose ``title`` equals ``title``. Three similarities are
    computed with ``wv.n_similarity``: genres (``listed_in``), description and
    title tokens. The title similarity is halved, and the three values are
    summed into ``final_score``, which ranks the candidates.

    Parameters
    ----------
    title : str
        Exact title as stored in the ``title`` column of ``df_netflix``.
    top_n : int, default 10
        Maximum number of recommendations to return.
    min_category_score : float, default 0.85
        A candidate is kept only if its genre similarity is strictly greater
        than this value.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with the columns ``recommendation``, ``title``,
        ``score_title``, ``score_category``, ``score_description`` and
        ``final_score``, best match first. Empty when ``title`` is not in the
        dataset or no candidate passes the genre threshold.
    """
    vocab = wv.key_to_index

    def _similarity(tokens_a, tokens_b):
        # n_similarity raises ZeroDivisionError when either list is empty;
        # treat "nothing to compare" as zero similarity instead of aborting.
        if len(tokens_a) == 0 or len(tokens_b) == 0:
            return 0.0
        return wv.n_similarity(tokens_a, tokens_b)

    # Row layout: 1 title, 2 listed_in tokens, 3 description tokens,
    # 4 title tokens. Several rows can share the same title. New tuples are
    # built rather than assigning into the to_numpy() rows.
    query_rows = [
        (
            row[1],
            [word for word in row[2] if word in vocab],
            [word for word in row[3] if word in vocab],
            [word for word in row[4] if word in vocab],
        )
        for row in df_netflix[df_netflix["title"] == title].to_numpy()
    ]

    matrix_similarity = []
    # Iterating the tqdm wrapper advances and closes the progress bar itself.
    for candidate in tqdm(matrix_netflix_vocab):
        for query_title, query_genres, query_desc, query_title_tokens in query_rows:
            if candidate[1] == query_title:
                continue  # never recommend the show to itself
            score_catg = _similarity(candidate[2], query_genres)
            # Check the genre gate first so the other two similarities are
            # only computed for candidates that can actually be returned.
            if not (score_catg > min_category_score):
                continue
            score_desc = _similarity(candidate[3], query_desc)
            # The title similarity carries half the weight of the other two.
            score_title = _similarity(candidate[4], query_title_tokens) / 2
            matrix_similarity.append(
                [candidate[1], query_title, score_title, score_catg, score_desc]
            )

    df_netflix_similarity = pd.DataFrame(
        matrix_similarity,
        columns=[
            "recommendation",
            "title",
            "score_title",
            "score_category",
            "score_description",
        ],
    )
    df_netflix_similarity["final_score"] = (
        df_netflix_similarity["score_title"]
        + df_netflix_similarity["score_category"]
        + df_netflix_similarity["score_description"]
    )
    return df_netflix_similarity.sort_values(
        by=["final_score", "score_category", "score_description", "score_title"],
        ascending=False,
    ).head(top_n)

## Movie Recommender using word2vec tool


In [None]:
# Word2Vec-based recommendations for "The Conjuring".
recommendation("The Conjuring")


100%|██████████| 8807/8807 [00:04<00:00, 1808.04it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
383,Conjuring Spirit,The Conjuring,0.376218,0.964287,0.624534,1.965039
95,The Conjuring 2,The Conjuring,0.40848,0.913295,0.595724,1.917499
391,Delirium,The Conjuring,0.11187,1.0,0.687572,1.799442
86,Insidious,The Conjuring,0.093044,1.0,0.687981,1.781025
513,The Diabolical,The Conjuring,0.156948,0.953485,0.669266,1.779699
64,The Strange House,The Conjuring,0.086246,0.964287,0.713428,1.76396
522,The Haunting of Molly Hartley,The Conjuring,0.173505,1.0,0.572111,1.745615
355,All Light Will End,The Conjuring,0.09736,1.0,0.64138,1.73874
273,Malevolent,The Conjuring,0.147643,1.0,0.585509,1.733152
42,The Strangers,The Conjuring,0.073289,1.0,0.645084,1.718374


In [None]:
# Word2Vec-based recommendations for "Insidious".
recommendation("Insidious")


100%|██████████| 8807/8807 [00:04<00:00, 1792.22it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
84,What Lies Below,Insidious,0.112808,1.0,0.807122,1.91993
513,The Diabolical,Insidious,0.260919,0.953485,0.690771,1.905174
273,Malevolent,Insidious,0.247639,1.0,0.624875,1.872514
512,The Devil Inside,Insidious,0.141939,1.0,0.695405,1.837343
522,The Haunting of Molly Hartley,Insidious,0.146591,1.0,0.677317,1.823909
72,Things Heard & Seen,Insidious,0.082905,1.0,0.726047,1.808952
370,Bhoot,Insidious,0.084453,0.964287,0.753416,1.802156
64,The Strange House,Insidious,0.105459,0.964287,0.728778,1.798524
391,Delirium,Insidious,0.050422,1.0,0.745601,1.796024
223,Sinister 2,Insidious,0.223022,0.913295,0.656998,1.793315


# Recommendation System using Sentence Transformer


## Load csv into Pandas Dataframe


In [None]:
# Reload the raw catalogue: this section works on the original, untokenized
# description strings rather than on df_netflix.
sent_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/netflix_titles.csv")

In [None]:
pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.14.1-py3-

In [None]:
# Load a pre-trained sentence-transformers model. It maps sentences and
# paragraphs to a 768-dimensional dense vector space and can be used for tasks
# like clustering or semantic search.

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("paraphrase-distilroberta-base-v1")


Downloading (…)7f4ef/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f279f7f4ef/README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading (…)79f7f4ef/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)279f7f4ef/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)7f4ef/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading (…)279f7f4ef/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)9f7f4ef/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

## Find Embeddings for all show descriptions in dataset.


In [None]:
# Embed every description with a single batched encode() call instead of one
# call per row. Row i of `des_embeddings` corresponds to row i of `sent_df`,
# which is what lets the later positional lookups work.
descriptions = sent_df["description"].tolist()
des_embeddings = model.encode(descriptions, show_progress_bar=True)


## For a query show's description, find the ten shows with the highest cosine similarity.


In [None]:
import torch
from sentence_transformers import util


def recommend(query, top_k=10):
    """Return positions of the descriptions most similar to ``query``.

    The query text is embedded with ``model`` and compared by cosine similarity
    with every vector in ``des_embeddings``.

    The best-ranked match is always skipped. In this notebook the query is a
    description taken from the dataset itself, so rank 0 is expected to be the
    query show. For free text that is not in the dataset this drops the
    genuinely best match.

    Parameters
    ----------
    query : str
        Description text to search with.
    top_k : int, default 10
        Number of positions to return.

    Returns
    -------
    list[int]
        Positions into ``des_embeddings``, most similar first.
    """
    query_embedded = model.encode(query)
    cosine_scores = util.pytorch_cos_sim(query_embedded, des_embeddings)
    # cosine_scores has a single row (one query); rank its columns, best first.
    ranked = torch.argsort(cosine_scores, dim=-1, descending=True)[0]
    # Slice before converting so only the needed entries become Python ints.
    return ranked[1 : top_k + 1].tolist()


## Movie Recommender using sentence transformers


In [None]:
title = "The Conjuring"

# Fetch the description of the query show; fail with a clear message if the
# title is not in the dataset.
query_matches = sent_df.loc[sent_df["title"] == title, "description"]
if query_matches.empty:
    raise ValueError(f"Title not found in dataset: {title!r}")
query_show_des = query_matches.iloc[0]

# recommend() returns row positions: the embeddings were built from
# sent_df["description"] in row order. Look the titles up by position instead
# of rebuilding a show_id string, which would only work while show_id happens
# to equal "s<row position + 1>".
recommended_results = recommend(query_show_des)

# Display all recommended titles as one Series (index = row label in sent_df).
sent_df["title"].iloc[recommended_results]


7260    Laddaland
Name: title, dtype: object
8279    The Diabolical
Name: title, dtype: object
1637    Don’t Listen
Name: title, dtype: object
7361    Lupt
Name: title, dtype: object
7936    Sardaar ji
Name: title, dtype: object
836    Ghost Lab
Name: title, dtype: object
5332    Demonic
Name: title, dtype: object
6845    Ghost House
Name: title, dtype: object
8060    Soul to Keep
Name: title, dtype: object
7488    Monster Family
Name: title, dtype: object
