# MODEL BUILDING USING COSINE SIMILARITY

In [86]:
import pandas as pd
import numpy as np
import math

In [87]:
# Load the preprocessed dataset; later cells read its 'transformed_text' and 'title' columns.
# NOTE(review): absolute "/content/..." path — presumably a Colab session; confirm before running elsewhere.
new_df = pd.read_csv("/content/new_df.csv")

In [88]:
from sklearn.metrics.pairwise import cosine_similarity

In [89]:
class CountVectorizer_Scratch:
    """Bag-of-words vectorizer using plain whitespace tokenisation.

    The vocabulary maps each kept word to a column index. Words are ranked
    by total corpus frequency (most frequent first; ties keep first-seen
    order) and optionally truncated to ``max_features`` entries.

    Args:
        max_features: Keep only this many of the most frequent words.
            ``None`` keeps every word.
        stop_words: Optional iterable of words to leave out of the
            vocabulary. A single string is treated as one stop word.

    Attributes:
        vocabulary_: dict mapping word -> column index, set by
            ``create_vocabulary`` / ``fit_transform``.
    """

    def __init__(self, max_features=None, stop_words=None):
        self.max_features = max_features
        self.stop_words = stop_words
        # Never populated by this class; kept so code reading `.vocab` still works.
        self.vocab = {}
        self.vocabulary_ = {}

    def _blocked_words(self):
        """Return the configured stop words as a set (empty when unset)."""
        if not self.stop_words:
            return frozenset()
        if isinstance(self.stop_words, str):
            # Avoid splitting a lone string into individual characters.
            return frozenset([self.stop_words])
        return frozenset(self.stop_words)

    def fit_transform(self, documents):
        """Learn the vocabulary from ``documents`` and return their count matrix.

        Args:
            documents: Iterable of strings.

        Returns:
            Integer ndarray of shape (n_documents, n_vocabulary).
        """
        # The corpus is traversed twice (fit, then transform); materialise it
        # so one-shot iterables such as generators are not exhausted early.
        docs = list(documents)
        self.create_vocabulary(docs)
        return self.transform(docs)

    def create_vocabulary(self, documents):
        """Count words across ``documents`` and populate ``vocabulary_``."""
        blocked = self._blocked_words()
        word_count = {}
        for document in documents:
            for word in document.split():
                if word not in blocked:
                    word_count[word] = word_count.get(word, 0) + 1
        # sorted() is stable, so equally frequent words keep first-seen order.
        sorted_words = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
        if self.max_features is not None:
            sorted_words = sorted_words[:self.max_features]
        self.vocabulary_ = {word: index for index, (word, _) in enumerate(sorted_words)}

    def transform(self, documents):
        """Return per-document counts of the words in ``vocabulary_``.

        Words outside the vocabulary are ignored.

        Args:
            documents: Iterable of strings.

        Returns:
            Integer ndarray of shape (n_documents, n_vocabulary); the column
            count is preserved even when ``documents`` is empty.
        """
        docs = list(documents)
        # Fill a preallocated array instead of building nested Python lists.
        counts = np.zeros((len(docs), len(self.vocabulary_)), dtype=int)
        lookup = self.vocabulary_
        for row, document in enumerate(docs):
            for word in document.split():
                column = lookup.get(word)
                if column is not None:
                    counts[row, column] += 1
        return counts

In [90]:
class TfidfVectorizer_Scratch(CountVectorizer_Scratch):
    """TF-IDF weighting on top of the whitespace bag-of-words counts.

    Formulas used:
        tf(t, d) = count(t, d) / (sum of in-vocabulary counts in d)
        idf(t)   = ln(N / (1 + df(t)))
    where N is the number of documents and df(t) the number of documents
    containing t. With this idf, a term present in every document gets the
    slightly negative weight ln(N / (N + 1)).

    Args:
        max_features: Forwarded to the count vectorizer (vocabulary cap).
        stop_words: Optional iterable of words removed from each document
            before counting. A single string is treated as one stop word.
    """

    def __init__(self, max_features=None, stop_words=None):
        # Forward only max_features and handle stop_words locally, so that
        # construction does not depend on the base class taking a second
        # positional argument.
        super().__init__(max_features)
        self.stop_words = stop_words

    def _without_stop_words(self, documents):
        """Return ``documents`` as a list with stop words stripped out."""
        docs = list(documents)
        if not self.stop_words:
            return docs
        if isinstance(self.stop_words, str):
            blocked = {self.stop_words}
        else:
            blocked = set(self.stop_words)
        return [
            " ".join(word for word in doc.split() if word not in blocked)
            for doc in docs
        ]

    def fit_transform(self, documents):
        """Learn the vocabulary and return the TF-IDF matrix.

        Args:
            documents: Iterable of strings.

        Returns:
            List of per-document lists of TF-IDF weights, one entry per
            vocabulary term.
        """
        documents = self._without_stop_words(documents)
        count_matrix = super().fit_transform(documents)
        tf_matrix = self.calculate_tf(count_matrix)
        idf_vector = self.calculate_idf(count_matrix)
        return self.calculate_tfidf(tf_matrix, idf_vector)

    def calculate_tf(self, count_matrix):
        """Normalise each row of counts by that row's total.

        A row whose counts sum to zero (no in-vocabulary words) stays all
        zeros instead of turning into NaN through 0/0.

        Returns:
            List of lists of floats, same shape as ``count_matrix``.
        """
        if len(count_matrix) == 0:
            return []
        counts = np.asarray(count_matrix, dtype=float)
        totals = counts.sum(axis=1, keepdims=True)
        totals[totals == 0] = 1.0  # numerators are all zero there anyway
        return (counts / totals).tolist()

    def calculate_idf(self, count_matrix):
        """Compute ln(N / (1 + df)) for every column of ``count_matrix``.

        Returns:
            List of floats, one per column; empty when there are no rows.
        """
        total_documents = len(count_matrix)
        if total_documents == 0:
            return []
        # Document frequency of every term in one vectorised pass.
        doc_freq = np.count_nonzero(np.asarray(count_matrix) > 0, axis=0)
        return [math.log(total_documents / (1 + int(df))) for df in doc_freq]

    def calculate_tfidf(self, tf_matrix, idf_vector):
        """Scale every TF row element-wise by the IDF vector.

        ``idf_vector`` must have one entry per column of ``tf_matrix``.

        Returns:
            List of lists of floats, same shape as ``tf_matrix``.
        """
        if len(tf_matrix) == 0:
            return []
        weighted = np.asarray(tf_matrix, dtype=float) * np.asarray(idf_vector, dtype=float)
        return weighted.tolist()

In [91]:
def cosine_similarity_matrix(vectors):
    """Return the pairwise cosine-similarity matrix of ``vectors``.

    Each row is scaled to unit length once, and the similarities are then
    obtained from a single matrix product, rather than recomputing both
    norms for each of the n*n pairs in a Python loop.

    Args:
        vectors: Sequence of equal-length numeric vectors (list of lists or
            a 2-D array).

    Returns:
        ndarray of shape (n, n) where entry [i][j] is the cosine similarity
        of vectors i and j. Any pair involving an all-zero vector scores 0.
        Empty input yields an array of shape (0, 0).
    """
    num_vectors = len(vectors)
    if num_vectors == 0:
        return np.zeros((0, 0))

    matrix = np.asarray(vectors, dtype=float)
    if matrix.ndim == 1:
        # A flat sequence of scalars: treat each one as a 1-element vector.
        matrix = matrix.reshape(num_vectors, 1)

    norms = np.sqrt(np.sum(np.square(matrix), axis=1))
    # Dividing a zero row by 1 leaves it zero, so its similarities come out 0.
    safe_norms = np.where(norms == 0, 1.0, norms)
    unit_rows = matrix / safe_norms[:, np.newaxis]
    return unit_rows @ unit_rows.T

def cosine_similarity_scratch(vector1, vector2):
    """Cosine of the angle between two equal-length numeric vectors.

    Returns 0 when either vector has zero Euclidean length; otherwise the
    dot product divided by the product of the two lengths.
    """
    # The dot product is taken first so a shape mismatch surfaces immediately.
    numerator = np.dot(vector1, vector2)
    length_a = math.sqrt(np.square(vector1).sum())
    length_b = math.sqrt(np.square(vector2).sum())

    if not (length_a and length_b):
        return 0

    denominator = length_a * length_b
    return numerator / denominator

In [92]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# cv is the from-scratch count vectorizer, capped at the 5000 most frequent words.
cv = CountVectorizer_Scratch(max_features=5000)
# tfid is scikit-learn's TfidfVectorizer (not TfidfVectorizer_Scratch);
# NOTE(review): it is not referenced by any cell visible here — confirm whether it is still needed.
tfid = TfidfVectorizer(max_features=1000)

In [93]:
# Learn the vocabulary from the 'transformed_text' column and build the document-term count matrix.
vector = cv.fit_transform(new_df['transformed_text'])

In [94]:
# (rows, columns) of the count matrix: one row per document, one column per vocabulary word.
vector.shape

(9742, 5000)

In [95]:
# Pairwise cosine similarity between all rows of the count matrix, computed with
# scikit-learn's cosine_similarity rather than the from-scratch cosine_similarity_matrix.
similarity = cosine_similarity(vector)
print(similarity)

[[1.         0.63245553 0.28867513 ... 0.         0.13363062 0.11785113]
 [0.63245553 1.         0.18257419 ... 0.         0.         0.        ]
 [0.28867513 0.18257419 1.         ... 0.         0.         0.13608276]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.13363062 0.         0.         ... 0.         1.         0.        ]
 [0.11785113 0.         0.13608276 ... 0.         0.         1.        ]]


In [96]:
# (rows, columns) of the similarity matrix; recommend() indexes it by row position.
similarity.shape

(9742, 9742)

In [97]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Reads the module-level ``new_df`` (for the 'title' column) and
    ``similarity`` (a square matrix whose rows follow the row order of
    ``new_df``).

    Args:
        movie: Title to look up; it must equal an entry of
            ``new_df['title']`` exactly. If the title occurs more than once,
            the first occurrence is used.
        top_n: Number of recommendations to print (default 5).

    Raises:
        IndexError: If ``movie`` is not present in ``new_df['title']``.
    """
    # Work with row positions throughout: both `similarity` and `.iloc`
    # are positional, while DataFrame index labels need not be.
    matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in new_df['title']")
    position = int(matches[0])

    scores = np.asarray(similarity[position], dtype=float)
    # A stable sort on the negated scores gives descending order with ties
    # kept in row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the query row explicitly rather than assuming it is ranked first;
    # another row with an identical vector can tie with it at the top score.
    ranked = ranked[ranked != position][:max(top_n, 0)]

    for row in ranked:
        print(new_df['title'].iloc[int(row)])

In [98]:
# Example query; the title must match an entry of new_df['title'] exactly.
recommend("Toy Story (1995)")

Toy Story 2 (1999)
Toy Story 3 (2010)
Moana (2016)
Antz (1998)
Valiant (2005)
