# **Text Similarity Techniques**

In [None]:
import pandas as pd

# Raw-file GitHub URL of the TMDB 5000 movies CSV
url = "https://raw.githubusercontent.com/harshitcodes/tmdb_movie_data_analysis/master/tmdb-5000-movie-dataset/tmdb_5000_movies.csv"

# Load the CSV and keep only the five fields this notebook works with
movie_data = pd.read_csv(url)[['title', 'tagline', 'overview', 'budget', 'popularity']]
print("Dimension of dataset:", movie_data.shape)

# Per-column count of missing observations
print("Number of missing values in column:\n", movie_data.isnull().sum())

# Keep only the rows in which every selected field is present
movie_data = movie_data.dropna()
print("Dimension of new dataset:", movie_data.shape)

# 'information' is the overview followed by the tagline, separated by one space
movie_data = movie_data.assign(
    information=movie_data['overview'] + ' ' + movie_data['tagline']
)
movie_data.info()

# Titles ordered from the largest budget downwards
highbudget_movies = movie_data.sort_values('budget', ascending=False)
print("\nHigh Budget Movies:\n", highbudget_movies['title'].head())

# Titles ordered from the highest popularity score downwards
popular_movies = movie_data.sort_values('popularity', ascending=False)
print("\nPopular Movies:\n", popular_movies['title'].head())


Dimension of dataset: (4803, 5)
Number of missing values in column:
 title           0
tagline       844
overview        3
budget          0
popularity      0
dtype: int64
Dimension of new dataset: (3959, 5)
<class 'pandas.core.frame.DataFrame'>
Index: 3959 entries, 0 to 4801
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        3959 non-null   object 
 1   tagline      3959 non-null   object 
 2   overview     3959 non-null   object 
 3   budget       3959 non-null   int64  
 4   popularity   3959 non-null   float64
 5   information  3959 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 216.5+ KB

High Budget Movies:
 17    Pirates of the Caribbean: On Stranger Tides
1        Pirates of the Caribbean: At World's End
7                         Avengers: Age of Ultron
6                                         Tangled
4                                     John Carter
Name: title, dtype:

**n-gram**

In [None]:
import nltk
# Fetch NLTK's 'punkt_tab' tokenizer resource. Presumably required by the
# tokenization done further down (TextBlob n-grams, nltk.word_tokenize) — verify.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
#TextBlob supplies the n-gram helper demonstrated in this cell.
from textblob import TextBlob

text = "Display your text or numeric variable here"
blob = TextBlob(text)

#Build the 2-, 3- and 4-word n-grams from the same blob in one pass.
bigram_var, trigrams_var, fourgrams_var = (
    blob.ngrams(n=size) for size in (2, 3, 4)
)

#Show each group under its own heading.
print("Bigrams include:\n", bigram_var)
print("Trigrams include:\n", trigrams_var)
print("Fourgrams include:\n", fourgrams_var)

Bigrams include:
 [WordList(['Display', 'your']), WordList(['your', 'text']), WordList(['text', 'or']), WordList(['or', 'numeric']), WordList(['numeric', 'variable']), WordList(['variable', 'here'])]
Trigrams include:
 [WordList(['Display', 'your', 'text']), WordList(['your', 'text', 'or']), WordList(['text', 'or', 'numeric']), WordList(['or', 'numeric', 'variable']), WordList(['numeric', 'variable', 'here'])]
Fourgrams include:
 [WordList(['Display', 'your', 'text', 'or']), WordList(['your', 'text', 'or', 'numeric']), WordList(['text', 'or', 'numeric', 'variable']), WordList(['or', 'numeric', 'variable', 'here'])]


**Without Pretrained Model**

In [None]:
#Importing necessary libraries.
import nltk
import re
import numpy as np

#Download the NLTK stop-word corpus, then load its English word list.
#func_text_process below drops every token that appears in stop_words.
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

#Creating a function for text processing.
def func_text_process(document):
  """Normalise one document into a space-joined string of filtered tokens.

  Steps: lower-case the text, delete every character that is not a letter,
  digit or whitespace, trim the ends, tokenize with nltk.word_tokenize and
  drop tokens found in the module-level stop_words list.

  Parameters
  ----------
  document : str
      Raw text of a single document.

  Returns
  -------
  str
      The remaining tokens joined by single spaces ('' if none remain).
  """
  #Converting to lower case.
  document = document.lower()

  #Removing special characters.
  #The original call passed re.I|re.A as the 4th positional argument of
  #re.sub, which is `count`, not `flags`: the flags were never applied and
  #only the first 258 special characters were removed. Dropping the argument
  #removes that cap; re.I is redundant for this character class, and leaving
  #re.A off keeps Unicode whitespace as a separator, as it has been so far.
  document = re.sub(r'[^a-zA-Z0-9\s]', '', document)
  document = document.strip()

  #Tokenization of document.
  tokens = nltk.word_tokenize(document)

  #Filter stopwords out of document.
  filtered_tokens = [token for token in tokens if token not in stop_words]

  #Create document from filtered tokens.
  return ' '.join(filtered_tokens)

#Wrap the cleaner so it runs element-wise over the whole corpus.
text_process = np.vectorize(func_text_process)
new_data = text_process(movie_data['information'].tolist())

#Feature Extraction.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#Raw term counts over unigrams and bigrams; terms seen in fewer than
#two documents are left out of the vocabulary.
cv = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    binary=False,
)
cv_result = cv.fit_transform(new_data)
print("Dimension after Count Vectorizer:", cv_result.shape)

#TF-IDF weights over the same n-gram range and document-frequency cut-off,
#with sublinear term-frequency scaling switched on.
tfid = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    use_idf=True,
    sublinear_tf=True,
)
tfid_result = tfid.fit_transform(new_data)
print("Dimension after TF-IDF Vectorizer:", tfid_result.shape)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dimension after Count Vectorizer: (3959, 17820)
Dimension after TF-IDF Vectorizer: (3959, 17820)


**Cosine Similarity** -- Measures how similar two documents are.

In [None]:
#Titles as a positional array; they are in the same row order as the
#documents that were fed to the TF-IDF vectorizer.
movies = movie_data['title'].values
print("The list of movies:\n", movies)

#Compute pairwise document similarity on the TF-IDF matrix.
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfid_result)
cosine_df = pd.DataFrame(cosine_sim)

#Determining index of the Deadpool movie (IndexError if the title is absent).
movie_index = np.where(movies == 'Deadpool')[0][0]
print("Index of Deadpool movie:", movie_index)

#Similarity of Deadpool to every movie in the dataset.
similarities = cosine_df.iloc[movie_index].values
print("Movie Similarities:\n", similarities)

#Indexes of the eight most similar movies, highest similarity first.
#The query movie is removed explicitly: it always matches itself perfectly,
#and the later tokenizer cells skip the self-match as well.
similar_movie_index = np.argsort(-similarities)
similar_movie_index = similar_movie_index[similar_movie_index != movie_index][:8]
print("Index of similar movies:\n", similar_movie_index)

#Displaying names of the eight most similar movies.
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

The list of movies:
 ['Avatar' "Pirates of the Caribbean: At World's End" 'Spectre' ...
 'El Mariachi' 'Newlyweds' 'Shanghai Calling']
Index of Deadpool movie: 765
Movie Similarities:
 [0.         0.00616236 0.01474417 ... 0.         0.         0.00845363]
Indeex of similar movies:
 [ 765 2429 1066  462 3879  235 3117 1931]
Name of similar movies:
 ['Deadpool' 'Silent Trigger' 'Underworld: Evolution' 'Mars Attacks!'
 'Bronson' 'Fantastic Four' 'Don Jon' 'The Transporter']


**Euclidean Distance**

In [None]:
#Performing document similarity on basis of Euclidean distance
#between the TF-IDF document vectors.
from sklearn.metrics.pairwise import euclidean_distances
euclidean_sim = euclidean_distances(tfid_result)
euclidean_df = pd.DataFrame(euclidean_sim)

#Determining index of the Newlyweds movie (IndexError if the title is absent).
movie_index = np.where(movies == 'Newlyweds')[0][0]
print("Index of Newlyweds movie:", movie_index)

#Distance from Newlyweds to every movie (smaller means more similar).
similarities = euclidean_df.iloc[movie_index].values
print("Movie Similarities:\n", similarities)

#Indexes of the eight closest movies, smallest distance first.
#The query movie is removed explicitly: its distance to itself is 0, so it
#would otherwise always take the first slot. The later tokenizer cells skip
#the self-match as well.
similar_movie_index = np.argsort(similarities)
similar_movie_index = similar_movie_index[similar_movie_index != movie_index][:8]
print("Index of similar movies:\n", similar_movie_index)

#Displaying names of the eight closest movies.
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

Index of Newlyweds movie: 3957
Movie Similarities:
 [1.41421356 1.41421356 1.41421356 ... 1.41421356 0.         1.41421356]
Index of similar movies:
 [3957 2464 2121  600  491  836 1491 3833]
Name of similar movies:
 ['Newlyweds' 'Our Family Wedding' 'Just Married' 'Ted 2'
 'The Secret Life of Pets' 'Bullet to the Head' 'Bride Wars' 'Blue Ruin']


**Manhattan Distance**

In [None]:
#Performing document similarity on basis of Manhattan distance
#between the TF-IDF document vectors.
from sklearn.metrics.pairwise import manhattan_distances
manhattan_sim = manhattan_distances(tfid_result)
manhattan_df = pd.DataFrame(manhattan_sim)

#Determining index of The Matrix Revolutions movie (IndexError if absent).
movie_index = np.where(movies == 'The Matrix Revolutions')[0][0]
print("Index of The Matrix Revolutions movie:", movie_index)

#Distance from The Matrix Revolutions to every movie (smaller means closer).
similarities = manhattan_df.iloc[movie_index].values
print("Movie Similarities:\n", similarities)

#Indexes of the eight closest movies, smallest distance first.
#The query movie is removed explicitly: its distance to itself is 0, so it
#would otherwise always take the first slot. The later tokenizer cells skip
#the self-match as well.
similar_movie_index = np.argsort(similarities)
similar_movie_index = similar_movie_index[similar_movie_index != movie_index][:8]
print("Index of similar movies:\n", similar_movie_index)

#Displaying names of the eight closest movies.
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

Index of The Matrix Revolutions movie: 119
Movie Similarities:
 [ 9.13932628  9.38768388 10.37115606 ... 10.36003091  7.20963996
 10.57467291]
Index of similar movies:
 [ 119 3867 3957 3651  171  963 2831 3704]
Name of similar movies:
 ['The Matrix Revolutions' "Amidst the Devil's Wings" 'Newlyweds'
 'Eddie: The Sleepwalking Cannibal' 'Turbo' 'Zoolander 2' 'Richard III'
 'Love Me Tender']


**Bert Algorithm**

In [None]:
# Importing necessary libraries
import tensorflow as tf
import transformers
import tqdm
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

# Creating a function for tokenization
def fun_tokenizer(tokenizer, docs):
    """Turn every document into the id sequence produced by `tokenizer`.

    Parameters
    ----------
    tokenizer : object
        Must provide tokenize(text) and convert_tokens_to_ids(tokens).
    docs : iterable of str
        Documents to convert; wrapped in a tqdm progress bar while iterating.

    Returns
    -------
    list
        One entry per document, in input order, holding whatever
        convert_tokens_to_ids returned for that document's tokens.
    """
    progress = tqdm.tqdm(docs, desc="Converting documents to features")
    return [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for text in progress
    ]

print("The function is created successfully")

# -------------------------------------
# BERT TOKENIZATION AND SIMILARITY
# -------------------------------------
# Only the BERT tokenizer is used here: each document becomes a sequence of
# vocabulary ids, and the distance metrics below compare those padded id
# sequences position by position (no model embeddings are computed).
bert_tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
bert_features = fun_tokenizer(bert_tokenizer, movie_data['information'])

# Pad every id sequence to 500 entries. padding='post' was missing here, so
# BERT alone fell back to the pre-padding default while the GPT-2, RoBERTa,
# XLM and DistilBERT cells all pad at the end; the explicit argument makes
# the five tokenizer comparisons use the same layout.
bert_trg = sequence.pad_sequences(bert_features, maxlen=500, padding='post')

# Titles as a positional array, aligned with the rows of bert_trg.
movies = movie_data['title'].values

print("---------------Cosine Similarity for Bert Tokenizer------------------")
cosine_sim = cosine_similarity(bert_trg)
cosine_df = pd.DataFrame(cosine_sim)
movie_index = np.where(movies == 'Money Train')[0][0]
similarities = cosine_df.iloc[movie_index].values
# Highest similarity first; slot 0 is skipped as the query movie's self-match.
similar_movie_index = np.argsort(-similarities)[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

print("--------------------Euclidean Distance for Bert Tokenizer--------------")
euclidean_sim = euclidean_distances(bert_trg)
euclidean_df = pd.DataFrame(euclidean_sim)
movie_index = np.where(movies == 'The Love Letter')[0][0]
similarities = euclidean_df.iloc[movie_index].values
# Smallest distance first; slot 0 is skipped as the query movie's self-match.
similar_movie_index = np.argsort(similarities)[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

print("--------------------Manhattan Distance for Bert Tokenizer--------------")
manhattan_sim = manhattan_distances(bert_trg)
manhattan_df = pd.DataFrame(manhattan_sim)
movie_index = np.where(movies == 'Magic Mike')[0][0]
similarities = manhattan_df.iloc[movie_index].values
# Smallest distance first; slot 0 is skipped as the query movie's self-match.
similar_movie_index = np.argsort(similarities)[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)


The function is created successfully


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Converting documents to features: 100%|██████████| 3959/3959 [00:05<00:00, 777.39it/s]


---------------Cosine Similarity for Bert Tokenizer------------------
Index of similar movies:
 [1153 3067 2404 1597 2968 1704 2161 1489]
Name of similar movies:
 ['Escape from Planet Earth' 'Mr. Nice Guy' 'The Love Letter' 'The Jacket'
 'Harry Brown' 'Rain Man' 'Gosford Park' 'Baby Mama']
--------------------Euclidean Distance for Bert Tokenizer--------------
Index of similar movies:
 [1502 1597 2968 2355  590 2902  599 1985]
Name of similar movies:
 ['P.S. I Love You' 'The Jacket' 'Harry Brown' 'Drive' 'Blackhat' '50/50'
 'Money Train' 'Babel']
--------------------Manhattan Distance for Bert Tokenizer--------------
Index of similar movies:
 [3171 1897  680  645 2140 1006  407 1952]
Name of similar movies:
 ['My Big Fat Greek Wedding' 'Diary of a Wimpy Kid: Dog Days'
 'Daddy Day Care' 'We Are Marshall' 'The Invention of Lying'
 'Journey to the Center of the Earth' 'Creepshow' 'Tammy']


**GPT2 Algorithm**

In [None]:
# -------------------------------------
# GPT-2 TOKENIZATION AND SIMILARITY
# -------------------------------------
print("\n============== GPT-2 TOKENIZER ANALYSIS ==============\n")

# Load the pretrained GPT-2 tokenizer and register '[PAD]' as its padding
# token, because GPT-2 ships without one.
gpt2_tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# One id sequence per document.
gpt2_features = fun_tokenizer(gpt2_tokenizer, movie_data['information'])

# Fixed-width (500) id matrix, padded at the end of each sequence.
# NOTE(review): no value= is passed, so pad_sequences fills with its default
# value rather than the '[PAD]' id registered above — confirm this is intended.
gpt2_trg = sequence.pad_sequences(
    gpt2_features,
    maxlen=500,
    padding='post',
)

# Titles as a positional array, aligned with the rows of gpt2_trg.
movies = movie_data['title'].values

# --- Cosine similarity: larger is closer, slot 0 (the self-match) is skipped
print("--------------Cosine Similarity for GPT2 Tokenizer-----------------")
cosine_sim_gpt2 = cosine_similarity(gpt2_trg)
cosine_df_gpt2 = pd.DataFrame(cosine_sim_gpt2)
movie_index = np.flatnonzero(movies == 'Money Train')[0]
similarities = cosine_df_gpt2.iloc[movie_index].to_numpy()
similar_movie_index = (-similarities).argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

# --- Euclidean distance: smaller is closer, slot 0 (the self-match) is skipped
print("--------------------Euclidean Distance for GPT2 Tokenizer--------------")
euclidean_sim_gpt2 = euclidean_distances(gpt2_trg)
euclidean_df_gpt2 = pd.DataFrame(euclidean_sim_gpt2)
movie_index = np.flatnonzero(movies == 'The Love Letter')[0]
similarities = euclidean_df_gpt2.iloc[movie_index].to_numpy()
similar_movie_index = similarities.argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

# --- Manhattan distance: smaller is closer, slot 0 (the self-match) is skipped
print("--------------------Manhattan Distance for GPT2 Tokenizer--------------")
manhattan_sim_gpt2 = manhattan_distances(gpt2_trg)
manhattan_df_gpt2 = pd.DataFrame(manhattan_sim_gpt2)
movie_index = np.flatnonzero(movies == 'Magic Mike')[0]
similarities = manhattan_df_gpt2.iloc[movie_index].to_numpy()
similar_movie_index = similarities.argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)






tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Converting documents to features: 100%|██████████| 3959/3959 [00:03<00:00, 1140.79it/s]


--------------Cosine Similarity for GPT2 Tokenizer-----------------
Index of similar movies:
 [2927 1712 3364 2663 2096    0 2517  107]
Name of similar movies:
 ['Made in Dagenham' 'The Fifth Estate' 'Taxman' 'Frailty'
 'The Children of Huang Shi' 'Avatar' 'Letters from Iwo Jima'
 'Transformers']
--------------------Euclidean Distance for GPT2 Tokenizer--------------
Index of similar movies:
 [1575 1925 2538  491 1341 1886 1948 1488]
Name of similar movies:
 ['Dead Man Down' 'Death at a Funeral' 'Operation Chromite'
 'The Secret Life of Pets' 'Mirrors' 'The Hundred-Foot Journey' 'Jaws 2'
 'Four Brothers']
--------------------Manhattan Distance for GPT2 Tokenizer--------------
Index of similar movies:
 [1335 2509 3212 2607 3078 3446 1763 3258]
Name of similar movies:
 ['Street Fighter' 'Harriet the Spy' 'Rabbit Hole' 'Project Almanac'
 'Casa De Mi Padre' 'They Came Together' 'Blood Ties' "Moms' Night Out"]


**Roberta Algorithm**

In [None]:
# -------------------------------------
# ROBERTA TOKENIZATION AND SIMILARITY
# -------------------------------------

print("\n============== ROBERTA TOKENIZER ANALYSIS ==============\n")

# Pretrained RoBERTa tokenizer; every document becomes a sequence of its ids.
roberta_tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base')
roberta_features = fun_tokenizer(roberta_tokenizer, movie_data['information'])

# Fixed-width (500) id matrix, padded at the end of each sequence.
roberta_trg = sequence.pad_sequences(
    roberta_features,
    maxlen=500,
    padding='post',
)

# `movies` (the title array) is reused from an earlier cell.

# --- Cosine similarity: larger is closer, slot 0 (the self-match) is skipped
print("--------------Cosine Similarity for Roberta Tokenizer-----------------")
cosine_sim_roberta = cosine_similarity(roberta_trg)
cosine_df_roberta = pd.DataFrame(cosine_sim_roberta)
movie_index = np.flatnonzero(movies == 'Money Train')[0]
similarities = cosine_df_roberta.iloc[movie_index].to_numpy()
similar_movie_index = (-similarities).argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

# --- Euclidean distance: smaller is closer, slot 0 (the self-match) is skipped
print("--------------------Euclidean Distance for Roberta Tokenizer--------------")
euclidean_sim_roberta = euclidean_distances(roberta_trg)
euclidean_df_roberta = pd.DataFrame(euclidean_sim_roberta)
movie_index = np.flatnonzero(movies == 'The Love Letter')[0]
similarities = euclidean_df_roberta.iloc[movie_index].to_numpy()
similar_movie_index = similarities.argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

# --- Manhattan distance: smaller is closer, slot 0 (the self-match) is skipped
print("--------------------Manhattan Distance for Roberta Tokenizer--------------")
manhattan_sim_roberta = manhattan_distances(roberta_trg)
manhattan_df_roberta = pd.DataFrame(manhattan_sim_roberta)
movie_index = np.flatnonzero(movies == 'Magic Mike')[0]
similarities = manhattan_df_roberta.iloc[movie_index].to_numpy()
similar_movie_index = similarities.argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)





Converting documents to features: 100%|██████████| 3959/3959 [00:04<00:00, 791.94it/s]


--------------Cosine Similarity for Roberta Tokenizer-----------------
Index of similar movies:
 [3704 1534 2517    0 1480 2289 2782 3093]
Name of similar movies:
 ['Love Me Tender' 'Dumb and Dumberer: When Harry Met Lloyd'
 'Letters from Iwo Jima' 'Avatar' 'Legends of the Fall' "Jennifer's Body"
 'Sphinx' 'Julia']
--------------------Euclidean Distance for Roberta Tokenizer--------------
Index of similar movies:
 [2538 1886  170  590 2028 2503 2046 1948]
Name of similar movies:
 ['Operation Chromite' 'The Hundred-Foot Journey' 'The Revenant' 'Blackhat'
 'Reign Over Me' 'The Shallows' 'The Man Who Knew Too Little' 'Jaws 2']
--------------------Manhattan Distance for Roberta Tokenizer--------------
Index of similar movies:
 [ 353 2607 3049 3212 3286 1835 1763 2359]
Name of similar movies:
 ['Pixels' 'Project Almanac' 'Mallrats' 'Rabbit Hole'
 'The Last Exorcism Part II' 'Old School' 'Blood Ties'
 'When the Game Stands Tall']


**XLM Algorithm**

In [None]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
# -------------------------------------
# XLM TOKENIZATION AND SIMILARITY
# -------------------------------------
print("\n============== XLM TOKENIZER ANALYSIS ==============\n")

# Pretrained XLM tokenizer; every document becomes a sequence of its ids.
xlm_tokenizer = transformers.XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
xlm_features = fun_tokenizer(xlm_tokenizer, movie_data['information'])

# Fixed-width (500) id matrix, padded at the end of each sequence.
xlm_trg = sequence.pad_sequences(
    xlm_features,
    maxlen=500,
    padding='post',
)

# `movies` (the title array) is reused from an earlier cell.

# --- Cosine similarity: larger is closer, slot 0 (the self-match) is skipped
print("--------------Cosine Similarity for XLM Tokenizer-----------------")
cosine_sim_xlm = cosine_similarity(xlm_trg)
cosine_df_xlm = pd.DataFrame(cosine_sim_xlm)
movie_index = np.flatnonzero(movies == 'Money Train')[0]
similarities = cosine_df_xlm.iloc[movie_index].to_numpy()
similar_movie_index = (-similarities).argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

# --- Euclidean distance: smaller is closer, slot 0 (the self-match) is skipped
print("--------------------Euclidean Distance for XLM Tokenizer--------------")
euclidean_sim_xlm = euclidean_distances(xlm_trg)
euclidean_df_xlm = pd.DataFrame(euclidean_sim_xlm)
movie_index = np.flatnonzero(movies == 'The Love Letter')[0]
similarities = euclidean_df_xlm.iloc[movie_index].to_numpy()
similar_movie_index = similarities.argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)

# --- Manhattan distance: smaller is closer, slot 0 (the self-match) is skipped
print("--------------------Manhattan Distance for XLM Tokenizer--------------")
manhattan_sim_xlm = manhattan_distances(xlm_trg)
manhattan_df_xlm = pd.DataFrame(manhattan_sim_xlm)
movie_index = np.flatnonzero(movies == 'Magic Mike')[0]
similarities = manhattan_df_xlm.iloc[movie_index].to_numpy()
similar_movie_index = similarities.argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
similar_movie_name = movies[similar_movie_index]
print("Name of similar movies:\n", similar_movie_name)






Converting documents to features: 100%|██████████| 3959/3959 [00:13<00:00, 297.09it/s]


--------------Cosine Similarity for XLM Tokenizer-----------------
Index of similar movies:
 [3453 3867 3449 1786 1611 1172  894 3379]
Name of similar movies:
 ['Central Station' "Amidst the Devil's Wings" 'Four Single Fathers'
 'The Greatest Game Ever Played' 'The Royal Tenenbaums' 'The Guilt Trip'
 'The Ring Two' 'The Dead Girl']
--------------------Euclidean Distance for XLM Tokenizer--------------
Index of similar movies:
 [1367 3113 1133 1634  547 1897 3172 3288]
Name of similar movies:
 ['Krull' 'The Sting' 'Vantage Point' 'Our Brand Is Crisis' 'The Village'
 'Diary of a Wimpy Kid: Dog Days' 'Spring Breakers' 'Scoop']
--------------------Manhattan Distance for XLM Tokenizer--------------
Index of similar movies:
 [2083 2274 3418 1606 1028 1691 1230 3760]
Name of similar movies:
 ['Bandslam' 'The Unborn' 'Red River' 'The Big Short'
 'Yours, Mine and Ours' "New Year's Eve" 'The Hudsucker Proxy' 'Bambi']


**DistilBert Algorithm**

In [None]:
# -------------------------------------
# DISTILBERT TOKENIZATION AND SIMILARITY
# -------------------------------------
print("\n============== DistilBERT TOKENIZER ANALYSIS ==============\n")

# Tokenizer class for DistilBERT
from transformers import DistilBertTokenizer

# Pretrained DistilBERT tokenizer; every document becomes a sequence of its ids.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_features = fun_tokenizer(distilbert_tokenizer, movie_data['information'])

# Fixed-width (500) id matrix, padded at the end of each sequence.
distilbert_trg = sequence.pad_sequences(
    distilbert_features,
    maxlen=500,
    padding='post',
)

# Titles as a positional array, aligned with the rows of distilbert_trg.
movies = movie_data['title'].values

# 1. Cosine similarity: larger is closer, slot 0 (the self-match) is skipped
print("--------------Cosine Similarity for DistilBERT Tokenizer-----------------")
cosine_sim_distilbert = cosine_similarity(distilbert_trg)
cosine_df_distilbert = pd.DataFrame(cosine_sim_distilbert)
movie_index = np.flatnonzero(movies == 'Money Train')[0]
similarities = cosine_df_distilbert.iloc[movie_index].to_numpy()
similar_movie_index = (-similarities).argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
print("Name of similar movies:\n", movies[similar_movie_index])

# 2. Euclidean distance: smaller is closer, slot 0 (the self-match) is skipped
print("--------------------Euclidean Distance for DistilBERT Tokenizer--------------")
euclidean_sim_distilbert = euclidean_distances(distilbert_trg)
euclidean_df_distilbert = pd.DataFrame(euclidean_sim_distilbert)
movie_index = np.flatnonzero(movies == 'The Love Letter')[0]
similarities = euclidean_df_distilbert.iloc[movie_index].to_numpy()
similar_movie_index = similarities.argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
print("Name of similar movies:\n", movies[similar_movie_index])

# 3. Manhattan distance: smaller is closer, slot 0 (the self-match) is skipped
print("--------------------Manhattan Distance for DistilBERT Tokenizer--------------")
manhattan_sim_distilbert = manhattan_distances(distilbert_trg)
manhattan_df_distilbert = pd.DataFrame(manhattan_sim_distilbert)
movie_index = np.flatnonzero(movies == 'Magic Mike')[0]
similarities = manhattan_df_distilbert.iloc[movie_index].to_numpy()
similar_movie_index = similarities.argsort()[1:9]
print("Index of similar movies:\n", similar_movie_index)
print("Name of similar movies:\n", movies[similar_movie_index])






tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Converting documents to features: 100%|██████████| 3959/3959 [00:05<00:00, 769.52it/s]


--------------Cosine Similarity for DistilBERT Tokenizer-----------------
Index of similar movies:
 [ 876 1133 2833 1210 3257 3745 2404 3453]
Name of similar movies:
 ['Anchorman 2: The Legend Continues' 'Vantage Point' 'Melancholia'
 'Memoirs of an Invisible Man' 'Salvador' 'Snow White: A Deadly Summer'
 'The Love Letter' 'Central Station']
--------------------Euclidean Distance for DistilBERT Tokenizer--------------
Index of similar movies:
 [2161 1133 1716 3273 3091 1145  590  599]
Name of similar movies:
 ['Gosford Park' 'Vantage Point' 'The Face of an Angel'
 'Survival of the Dead' 'The 5th Quarter' 'Shall We Dance?' 'Blackhat'
 'Money Train']
--------------------Manhattan Distance for DistilBERT Tokenizer--------------
Index of similar movies:
 [1897 3488 3171 3829  645 2607 3935 2219]
Name of similar movies:
 ['Diary of a Wimpy Kid: Dog Days' 'The Spectacular Now'
 'My Big Fat Greek Wedding' 'Mutual Friends' 'We Are Marshall'
 'Project Almanac' 'This Is Martin Bonner' 'The Princ