In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
FILENAME = 'moviereviews.csv'
df = pd.read_csv(FILENAME)
print('Movie Review dataset')
print('-----------')
print('Initial number of samples = %d'%(df.shape[0]))
print('Initial number of features = %d\n'%(df.shape[1]))
df.head()

Movie Review dataset
-----------
Initial number of samples = 13
Initial number of features = 2



Unnamed: 0,movie,review
0,The Lord of the Rings The Two Towers,remarkable display of fantasy action powerful ...
1,Inception,implanting stealing idea destroy gripping acti...
2,Spiderman Across the spider verse,mind bending wild action sequences intimate em...
3,The Dark Knight,Best live action portrayal beat organized crim...
4,Three colors red,mesmerising friendship turned love profound un...


In [36]:
movie_titles = df['movie'].values
movie_titles

array(['The Lord of the Rings The Two Towers', 'Inception',
       'Spiderman Across the spider verse', 'The Dark Knight',
       'Three colors red', 'It happened one night',
       'In the Mood for Love', 'Before Sunrise', 'Gone with the wind',
       'Eternal Sunshine of the Spotless Mind',
       'The Shawshank Redemption', 'Raging Bull', 'Lawrence of Arabia'],
      dtype=object)

In [34]:
movie_reviews = df['review'].values
movie_reviews

array(['remarkable display of fantasy action powerful ring hobbit destroy it fight',
       'implanting stealing idea destroy gripping action jaw dropping fight stunning visual violence',
       'mind bending wild action sequences intimate emotional moments amazing action',
       'Best live action portrayal beat organized crime in Gotham enigmatic villain brutality violence',
       'mesmerising friendship turned love profound unconventional bond heartfelt ',
       'Romantic comedy screwball comedy enduring tale of romance comical true love',
       'Neighbors solace bonding affair predicament spell binding infatuation heartwarming',
       'blossoming love know each other chance encounter meeting someone special fleeting romance magical evening',
       'epic romance greatest romantic film ever made touching amazing relationship cherished love',
       'Length people go finding love of life emotional rollercoaster in blossoming of love reignited fascinating journey into heart ',
   

CALCULATION OF MOVIE REVIEW VECTOR

In [38]:
vectorizer = CountVectorizer().fit_transform(movie_reviews)
reviews_matrix = vectorizer.toarray()

In [48]:
vectorizer = CountVectorizer().fit_transform(movie_titles)
titles_matrix = vectorizer.toarray()

Calculate similarity using dot product formula

In [39]:
dot_product_simi = np.dot(reviews_matrix, reviews_matrix.T)

Calculate similarity using angle between vectors formula

In [40]:
norms = np.linalg.norm(reviews_matrix, axis=1)
angle_similarity = np.dot(reviews_matrix, reviews_matrix.T) / np.outer(norms, norms)


In [42]:

# Fill the diagonal with zeros to avoid movies being similar to themselves
np.fill_diagonal(dot_product_simi, 0)
np.fill_diagonal(angle_similarity, 0)

# Find top 3 pairs of movies using dot product similarity
dot_product_top_indices = np.unravel_index(np.argsort(dot_product_simi, axis=None)[::-1], dot_product_simi.shape)
dot_product_top_pairs = [(movie_titles[i], movie_titles[j], dot_product_simi[i, j]) for i, j in zip(*dot_product_top_indices)]

# Find top 3 pairs of movies using angle similarity
angle_top_indices = np.unravel_index(np.argsort(angle_similarity, axis=None)[::-1], angle_similarity.shape)
angle_top_pairs = [(movie_titles[i], movie_titles[j], angle_similarity[i, j]) for i, j in zip(*angle_top_indices)]

# Print top similar movie pairs



Result

In [43]:
print("Top 3 movie pairs using Dot using Angle Similarity:")
for i, pair in enumerate(angle_top_pairs[:3]):
    print(f"{i+1}. {pair[0]} - {pair[1]} (Similarity: {pair[2]})")


Top 3 movie pairs using Dot using Angle Similarity:
1. The Lord of the Rings The Two Towers - Inception (Similarity: 0.26111648393354675)
2. Inception - The Lord of the Rings The Two Towers (Similarity: 0.26111648393354675)
3. Gone with the wind - It happened one night (Similarity: 0.24019223070763074)


Hand Coded

In [None]:
# Using numpy.dot()
start_time = time.time()
dot_product_numpy = np.dot(titles_matrix, reviews_matrix)
numpy_time = time.time() - start_time

# Using hand-coded multiplication and addition
start_time = time.time()
dot_product_hand_coded = sum(x * y for x, y in zip(titles_matrix, reviews_matrix))
hand_coded_time = time.time() - start_time

# Compare results and timing
print(f"Numpy Dot Product: {dot_product_numpy}")
print(f"Hand-Coded Dot Product: {dot_product_hand_coded}")

print(f"Time taken using numpy.dot(): {numpy_time:.6f} seconds")
print(f"Time taken using hand-coding: {hand_coded_time:.6f} seconds")

speedup = hand_coded_time / numpy_time
print(f"Speedup with numpy.dot(): {speedup:.2f}x")
