In [2]:
import pandas as pd
import numpy as np
import time

Here I have imported the required libraries, that is, numpy for vector operations and pandas for importing data

In [3]:
#load the movie_review dataset
DATA_DIR = 'D://ALA//Movie_Review//Data//'
FILENAME = DATA_DIR + 'moviereviews.csv'
df = pd.read_csv(FILENAME)
df.head(5)


Unnamed: 0,movie,review
0,The Lord of the Rings The Two Towers,remarkable display of fantasy action powerful ...
1,Inception,implanting stealing idea destroy gripping acti...
2,Spiderman Across the spider verse,mind bending wild action sequences intimate em...
3,The Dark Knight,Best live action portrayal beat organized crim...
4,Three colors red,mesmerising friendship turned love profound un...


Importing the dataset and displaying the first 5 columns of the dataset

In [4]:
print(df.isna().any())
print(df.isna().sum())

print(df['review'].nunique())

movie     False
review    False
dtype: bool
movie     0
review    0
dtype: int64
13


checks the missing values 
prints the number of missing values 
prints the number of unique values

In [5]:
df.drop_duplicates()

Unnamed: 0,movie,review
0,The Lord of the Rings The Two Towers,remarkable display of fantasy action powerful ...
1,Inception,implanting stealing idea destroy gripping acti...
2,Spiderman Across the spider verse,mind bending wild action sequences intimate em...
3,The Dark Knight,Best live action portrayal beat organized crim...
4,Three colors red,mesmerising friendship turned love profound un...
5,It happened one night,Romantic comedy screwball comedy enduring tale...
6,In the Mood for Love,Neighbors solace bonding affair predicament sp...
7,Before Sunrise,blossoming love know each other chance encount...
8,Gone with the wind,epic romance greatest romantic film ever made ...
9,Eternal Sunshine of the Spotless Mind,Length people go finding love of life emotiona...


drops all the duplicate entries in the dataset.

In [6]:
df.shape
#number of rows and columns in the dataset.

(13, 2)

The dataset has 13 rows and 2 features

In [9]:
from collections import Counter

Used to calculate the number of occurances of a element in a list or array

In [10]:
# Create vectors for each review
review_vectors = []
for review in df['review']:
    words = review.split()
    vector = Counter(words)  # Count the frequency of each word
    review_vectors.append(vector)

# Convert the list of review vectors into a numpy array
numpy_review_vectors = np.array([list(vector.values()) for vector in review_vectors])

print(numpy_review_vectors, '\n')
# Print the vectors
for i, movie in enumerate(df['movie']):
    print(f"Movie: {movie}")
    print(f"Vector: {numpy_review_vectors[i]}")
    print()


[list([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 list([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 list([1, 1, 1, 2, 1, 1, 1, 1, 1])
 list([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 list([1, 1, 1, 1, 1, 1, 1, 1]) list([1, 2, 1, 1, 1, 1, 1, 1, 1, 1])
 list([1, 1, 1, 1, 1, 1, 1, 1, 1])
 list([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 list([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 list([1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 list([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 list([2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 list([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])] 

Movie: The Lord of the Rings The Two Towers
Vector: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Movie: Inception
Vector: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Movie: Spiderman Across the spider verse
Vector: [1, 1, 1, 2, 1, 1, 1, 1, 1]

Movie: The Dark Knight
Vector: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Movie: Three colors red
Vector: [1, 1, 1, 1, 1, 1, 1, 1]

Movie: It happened one night
Vector: [1, 2, 1, 1, 1, 1, 1, 1, 1, 1]

Movie: In t

  numpy_review_vectors = np.array([list(vector.values()) for vector in review_vectors])


In [12]:
movie_review_vectors = {}
for index, row in df.iterrows():
    movie = row['movie']
    review = row['review']

    words = review.split()
    vector = {}
    for word in words:
        vector[word] = vector.get(word, 0) + 1

    movie_review_vectors[movie] = vector

movie_review_vectors[movie]

{'classic': 1,
 'adventure': 1,
 'war': 1,
 'movie': 1,
 'psychological': 1,
 'drama': 1,
 'british': 1,
 'officer': 1,
 'fight': 1,
 'ottoman': 1,
 'success': 1,
 'ego': 1,
 'dangerous': 1}

The loop iterates through the dataset extracting the movie name and the review features. It then creates a dictionary called vectors. A dictionary contains keywords mapped to its values. 
Here, the key is the words in the review column and values are the frequencies of each word.
(This was also thought in the class by using 2 sentences.

In [None]:
def dot_product(v1, v2):
    result = 0
    for word in v1:
        if word in v2:
            result += v1[word] * v2[word]
    return result

In [None]:
similarities = {}
for movie1 in movie_review_vectors:
    for movie2 in movie_review_vectors:
        if movie1 != movie2:
            similarity = dot_product(movie_review_vectors[movie1], movie_review_vectors[movie2])
            similarities[(movie1, movie2)] = similarity

In [None]:
top_similar_pairs = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:3]
top_similar_pairs


[(('It happened one night', 'Eternal Sunshine of the Spotless Mind'), 4),
 (('Eternal Sunshine of the Spotless Mind', 'It happened one night'), 4),
 (('The Lord of the Rings The Two Towers', 'Inception'), 3)]

In [None]:
for pair, similarity in top_similar_pairs:
    print(f"Movies: {pair[0]} and {pair[1]}, Similarity: {similarity}")

Movies: It happened one night and Eternal Sunshine of the Spotless Mind, Similarity: 4
Movies: Eternal Sunshine of the Spotless Mind and It happened one night, Similarity: 4
Movies: The Lord of the Rings The Two Towers and Inception, Similarity: 3


In [None]:
def numpy_dot_product(v1, v2):
    keys = set(v1.keys()) & set(v2.keys())
    np_dot_prod = np.dot([v1[key] for key in keys], [v2[key] for key in keys])
    return np_dot_prod 



In [None]:
n_iterations = 10000


In [None]:
start_time = time.time()
for _ in range(n_iterations):
    for movie1 in movie_review_vectors:
        for movie2 in movie_review_vectors:
            if movie1 != movie2:
                dot_product(movie_review_vectors[movie1], movie_review_vectors[movie2])
end_time = time.time()
hand_coded_time = end_time - start_time

In [None]:
start_time = time.time()
for _ in range(n_iterations):
    for movie1 in movie_review_vectors:
        for movie2 in movie_review_vectors:
            if movie1 != movie2:
                numpy_dot_product(movie_review_vectors[movie1], movie_review_vectors[movie2])
end_time = time.time()
numpy_time = end_time - start_time

In [None]:
print(f"Hand-coded Dot Product Time: {hand_coded_time:.4f} seconds")
print(f"Numpy Dot Product Time: {numpy_time:.4f} seconds")
print(f"Speedup: {hand_coded_time / numpy_time:.2f}x")

Hand-coded Dot Product Time: 2.0727 seconds


NameError: name 'numpy_time' is not defined