In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv("movies.csv")

In [3]:
## exploring the dataset

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
links = pd.read_csv("links.csv")

In [6]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
ratings = pd.read_csv("ratings.csv")

In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
tags = pd.read_csv("tags.csv")

In [10]:
tags.shape

(3683, 4)

In [11]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [12]:
## data preprocessing

In [13]:
# Only the last expression of a cell is rendered, so the null counts must be
# displayed explicitly or they are computed and silently discarded.
display(movies.isnull().sum())
movies.duplicated().sum()

0

In [14]:
# Only the last expression of a cell is rendered, so the null counts must be
# displayed explicitly or they are computed and silently discarded.
display(links.isnull().sum())
links.duplicated().sum()

0

In [15]:
# Only the last expression of a cell is rendered, so the null counts must be
# displayed explicitly or they are computed and silently discarded.
display(ratings.isnull().sum())
ratings.duplicated().sum()

0

In [16]:
# Only the last expression of a cell is rendered, so the null counts must be
# displayed explicitly or they are computed and silently discarded.
display(tags.isnull().sum())
tags.duplicated().sum()

0

In [17]:
print(movies.shape)
print(tags.shape)
print(links.shape)
print(ratings.shape)

(9742, 3)
(3683, 4)
(9742, 3)
(100836, 4)


In [18]:
## creating a new dataset

In [19]:
# 1. movieId
# 2. title
# 3. genres
# 4. tag
# 5. rating
# 6. imdbId
# 7. tmdbId

In [20]:
tags_new = tags[['tag', 'movieId']]
ratings_new = ratings[['movieId', 'rating']]
tags_new

Unnamed: 0,tag,movieId
0,funny,60756
1,Highly quotable,60756
2,will ferrell,60756
3,Boxing story,89774
4,MMA,89774
...,...,...
3678,for katie,7382
3679,austere,7936
3680,gun fu,3265
3681,heroic bloodshed,3265


In [21]:
new_df = pd.merge(movies, links , on = 'movieId')

In [22]:
# Different users give different ratings to the same movieId, so collapse them
# into a single mean rating per movieId.
average_ratings = ratings_new.groupby('movieId')['rating'].mean().reset_index()

# Left-merge the mean ratings onto new_df by movieId: every movie row is kept,
# and a movie with no ratings gets NaN in `rating`.
# NOTE: this cell reassigns new_df from itself, so re-running it merges the
# rating column a second time; re-run from the cell that first builds new_df.
new_df = pd.merge(new_df, average_ratings, on='movieId', how='left')

In [23]:
# Concatenate every tag attached to the same movieId (by any user) into one
# comma-separated string. Repeated tags are kept as-is (e.g. "pixar,pixar,fun").
tags_concatenated = tags_new.groupby('movieId')['tag'].apply(lambda x: ','.join(x)).reset_index()
# Left merge keeps movies that have no tags; their `tag` value is NaN.
new_df = pd.merge(new_df, tags_concatenated, on='movieId', how='left')


In [24]:
new_df

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,3.920930,"pixar,pixar,fun"
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,3.431818,"fantasy,magic board game,Robin Williams,game"
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,3.259615,"moldy,old"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,2.357143,
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,3.071429,"pregnancy,remake"
...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,5476944,432131.0,4.000000,
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,5914996,445030.0,3.500000,
9739,193585,Flint (2017),Drama,6397426,479308.0,3.500000,
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,8391976,483455.0,3.500000,


In [25]:
## creating new columns that hold the preprocessed title and genres text

In [26]:
import nltk
nltk.download("punkt")
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
new_df["new_genres"] = new_df["genres"].apply(lambda x : " ".join(x.split("|")))

In [28]:
import string

def remove_punctuation(text):
  """Delete ASCII punctuation from every token of ``text``.

  ``text`` is split with ``word_tokenize``; each token then has all
  ``string.punctuation`` characters removed and the tokens are re-joined
  with single spaces.

  A token consisting only of punctuation becomes an empty string but is
  still joined, so the result can contain doubled or trailing spaces.
  """
  # One translation table that maps every punctuation character to "deleted".
  punctuation_eraser = str.maketrans('', '', string.punctuation)
  return ' '.join(
      token.translate(punctuation_eraser) for token in word_tokenize(text)
  )

In [29]:
new_df["new_title"] = new_df["title"].apply(remove_punctuation)

In [30]:
# Join title and genres with an explicit space. Without it the two strings are
# glued together, so a title that does not end in punctuation would have its
# last word fused with the first genre (e.g. "...OneAction").
new_df["text"] = new_df["new_title"] + " " + new_df["new_genres"]

In [31]:
# new_df.head()
new_df

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,rating,tag,new_genres,new_title,text
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,3.920930,"pixar,pixar,fun",Adventure Animation Children Comedy Fantasy,Toy Story 1995,Toy Story 1995 Adventure Animation Children C...
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,3.431818,"fantasy,magic board game,Robin Williams,game",Adventure Children Fantasy,Jumanji 1995,Jumanji 1995 Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,3.259615,"moldy,old",Comedy Romance,Grumpier Old Men 1995,Grumpier Old Men 1995 Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,2.357143,,Comedy Drama Romance,Waiting to Exhale 1995,Waiting to Exhale 1995 Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,3.071429,"pregnancy,remake",Comedy,Father of the Bride Part II 1995,Father of the Bride Part II 1995 Comedy
...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,5476944,432131.0,4.000000,,Action Animation Comedy Fantasy,Black Butler Book of the Atlantic 2017,Black Butler Book of the Atlantic 2017 Actio...
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,5914996,445030.0,3.500000,,Animation Comedy Fantasy,No Game No Life Zero 2017,No Game No Life Zero 2017 Animation Comedy F...
9739,193585,Flint (2017),Drama,6397426,479308.0,3.500000,,Drama,Flint 2017,Flint 2017 Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,8391976,483455.0,3.500000,,Action Animation,Bungo Stray Dogs Dead Apple 2018,Bungo Stray Dogs Dead Apple 2018 Action Anim...


In [32]:
from nltk.corpus import stopwords
nltk.download("stopwords")
import string
# print(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [34]:
## function for preprocessing the combined title + genres `text` column
def preprocessing(text):
  """Lower-case, tokenize, filter and stem ``text``.

  Steps, in order:
    1. lower-case the string and tokenize it with ``nltk.word_tokenize``;
    2. keep only tokens for which ``str.isalnum()`` is true;
    3. drop tokens found in NLTK's English stop-word list;
    4. stem each remaining token with the module-level ``ps`` stemmer.

  Returns:
    The stems joined by single spaces.
  """
  # Build the stop-word set once per call; the previous version called
  # stopwords.words("english") again for every single token.
  english_stopwords = set(stopwords.words("english"))

  stems = []
  for token in nltk.word_tokenize(text.lower()):
    # isalnum() already rejects punctuation-only tokens, so a separate
    # string.punctuation pass can never remove anything further.
    if token.isalnum() and token not in english_stopwords:
      stems.append(ps.stem(token))

  return " ".join(stems)


In [35]:
new_df["transformed_text"] = new_df["text"].apply(preprocessing)

In [36]:
new_df.head() ## final dataset

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,rating,tag,new_genres,new_title,text,transformed_text
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,3.92093,"pixar,pixar,fun",Adventure Animation Children Comedy Fantasy,Toy Story 1995,Toy Story 1995 Adventure Animation Children C...,toy stori 1995 adventur anim children comedi f...
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,3.431818,"fantasy,magic board game,Robin Williams,game",Adventure Children Fantasy,Jumanji 1995,Jumanji 1995 Adventure Children Fantasy,jumanji 1995 adventur children fantasi
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,3.259615,"moldy,old",Comedy Romance,Grumpier Old Men 1995,Grumpier Old Men 1995 Comedy Romance,grumpier old men 1995 comedi romanc
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,2.357143,,Comedy Drama Romance,Waiting to Exhale 1995,Waiting to Exhale 1995 Comedy Drama Romance,wait exhal 1995 comedi drama romanc
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,3.071429,"pregnancy,remake",Comedy,Father of the Bride Part II 1995,Father of the Bride Part II 1995 Comedy,father bride part ii 1995 comedi


# MODEL BUILDING USING COSINE SIMILARITY

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')
tfid = TfidfVectorizer(max_features=1000)

In [39]:
vector = cv.fit_transform(new_df['transformed_text']).toarray()

In [40]:
vector.shape

(9742, 5000)

In [41]:
similarity = cosine_similarity(vector)
print(similarity)

[[1.         0.63245553 0.31622777 ... 0.         0.13363062 0.11785113]
 [0.63245553 1.         0.2        ... 0.         0.         0.        ]
 [0.31622777 0.2        1.         ... 0.         0.         0.1490712 ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.13363062 0.         0.         ... 0.         1.         0.        ]
 [0.11785113 0.         0.1490712  ... 0.         0.         1.        ]]


In [42]:
similarity.shape

(9742, 9742)

In [43]:
def recommend(movie, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Looks ``movie`` up by exact match in ``new_df['title']`` and ranks all
    rows by the corresponding row of the module-level ``similarity`` matrix.

    Args:
        movie: Title exactly as stored in ``new_df['title']``.
        top_n: Number of recommendations to print (default 5).

    Raises:
        IndexError: If no row of ``new_df`` has that title.
    """
    # Work with the row *position*, since `similarity` is indexed positionally.
    matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in new_df['title']")
    index = int(matches[0])

    ranked = sorted(enumerate(similarity[index]), reverse=True, key=lambda x: x[1])

    # Exclude the query movie by position instead of assuming it is ranked
    # first: the sort is stable, so another row that ties with it at the top
    # score can come before it, and a blind [1:6] slice would then list the
    # query movie itself as a recommendation.
    neighbours = [position for position, _ in ranked if position != index]
    for position in neighbours[:top_n]:
        print(new_df['title'].iloc[position])


In [44]:
recommend("Toy Story (1995)")

Toy Story 2 (1999)
Toy Story 3 (2010)
Antz (1998)
Moana (2016)
Balto (1995)


# SCRATCH

In [45]:
# word_tokenize
# remove stopwords
# remove punctuations
# stemming function
# bagofwords
# tfidf implementation
# implement cosine_similarity from scratch-

In [46]:
import string

def word_tokenize_scratch(text):
    """Split ``text`` into alphanumeric words and single symbol tokens.

    Each maximal run of alphanumeric characters becomes one token. Every
    other character becomes its own one-character token, except whitespace
    (spaces, tabs, newlines, ...), which only separates tokens and is never
    emitted.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance.
    """
    tokens = []
    start = None  # index where the current alphanumeric run began, if any

    for i, char in enumerate(text):
        if char.isalnum():
            if start is None:
                start = i
            continue

        # A non-alphanumeric character closes any word in progress.
        if start is not None:
            tokens.append(text[start:i])
            start = None

        # Drop every kind of whitespace, not only ' ', so tabs and newlines
        # do not leak into the token list.
        if not char.isspace():
            tokens.append(char)

    # Flush a word that runs to the end of the text.
    if start is not None:
        tokens.append(text[start:])

    return tokens


text = 'Black Butler: Book of the Atlantic (2017)'
tokens = word_tokenize_scratch(text)
print(tokens)


['Black', 'Butler', ':', 'Book', 'of', 'the', 'Atlantic', '(', '2017', ')']


In [47]:
# from nltk.tokenize import word_tokenize
# txt='Black Butler: Book of the Atlantic (2017)'
# words = word_tokenize(txt)
# words

In [48]:
def remove_punctuation_scratch(text):
  """Return ``text`` with ASCII punctuation removed.

  The text is tokenized with ``word_tokenize_scratch``, every
  ``string.punctuation`` character is deleted from each token, and the
  non-empty tokens are joined with single spaces.
  """
  # Translation table that deletes every punctuation character.
  table = str.maketrans('', '', string.punctuation)

  stripped_words = (word.translate(table) for word in word_tokenize_scratch(text))

  # A punctuation-only token translates to ''. Skipping those avoids the
  # doubled and trailing spaces that joining empty strings would produce.
  return ' '.join(word for word in stripped_words if word)

In [49]:
# Built once at definition time instead of on every call. Naming it separately
# also avoids shadowing the `stopwords` corpus module imported from nltk.
_SCRATCH_STOPWORDS = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
    'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were',
    'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',
    'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
    'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
    'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
    'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm',
    'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
    'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
    "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
    "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
    'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
    "won't", 'wouldn', "wouldn't",
])


def remove_stopwords_scratch(tokens):
    """Return ``tokens`` without English stop words, preserving order.

    Matching is exact and case-sensitive against a lower-case word list, so
    callers are expected to lower-case their text first.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list holding the tokens that are not stop words.
    """
    return [word for word in tokens if word not in _SCRATCH_STOPWORDS]



In [50]:
def remove_punctuation_scrth(tokens):
    """Drop tokens that consist solely of ASCII punctuation characters.

    Empty-string tokens are dropped as well. Tokens that merely contain
    punctuation (e.g. ``"don't"``) are kept unchanged.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with the punctuation-only tokens removed.
    """
    # Test character by character. `token not in string.punctuation` is a
    # substring test, which lets tokens such as '...' or '!!' slip through
    # because they are not contiguous substrings of string.punctuation.
    punctuation_chars = set(string.punctuation)
    return [
        token for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

In [51]:
def porter_stemmer(word):
    """Strip a plural-style suffix from ``word``.

    Only four suffix rules are applied, and the first one that matches wins:
    ``sses -> ss``, ``ies -> i``, ``ss -> ss`` (unchanged) and ``s -> ''``.
    Any other word is returned unchanged. This is a small subset of the
    Porter algorithm, not a full implementation.
    """
    # (suffix, number of trailing characters to cut); order matters because
    # the longer suffixes must be tried before the bare 's'.
    suffix_rules = (('sses', 2), ('ies', 2), ('ss', 0), ('s', 1))
    for suffix, cut in suffix_rules:
        if word.endswith(suffix):
            return word[:len(word) - cut]
    return word

In [52]:
## text preprocessing built on the hand-written tokenizer and filters
def preprocessing_scratch(text):
  """Lower-case, tokenize, filter and stem ``text`` with the scratch helpers.

  Pipeline: ``word_tokenize_scratch`` on the lower-cased text, then
  ``remove_stopwords_scratch``, then ``remove_punctuation_scrth``, and
  finally stemming of each remaining token with the module-level ``ps``.

  Returns:
    The stems joined by single spaces.
  """
  lowered = text.lower()
  kept_tokens = remove_punctuation_scrth(
      remove_stopwords_scratch(word_tokenize_scratch(lowered))
  )
  return " ".join(ps.stem(token) for token in kept_tokens)

In [53]:
a = 'Jumanji 1995 Adventure Children Fantasy'
print(preprocessing_scratch(a))

jumanji 1995 adventur children fantasi


COUNT VECTORIZER

In [54]:
import numpy as np
import math

class CountVectorizer_Scratch:
    """Minimal bag-of-words count vectorizer over whitespace-split tokens.

    The vocabulary is the ``max_features`` most frequent words of the fitted
    documents (all words when ``max_features`` is None). Ties keep the order
    in which the words were first seen. Column ``j`` of the output matrix
    counts the word mapped to ``j`` in ``vocabulary_``.

    Args:
        max_features: Maximum vocabulary size, or None for no limit.
        stop_words: Optional collection of words to leave out of the
            vocabulary. Must be a collection of strings; a built-in list
            name such as ``'english'`` is not supported.
    """

    def __init__(self, max_features=None, stop_words=None):
        if isinstance(stop_words, str):
            # A bare string would be treated as a collection of characters.
            raise TypeError("stop_words must be a collection of words, not a single string")
        self.max_features = max_features
        self.stop_words = stop_words
        # Not read anywhere in this class; kept so existing attribute access
        # keeps working.
        self.vocab = {}
        self.vocabulary_ = {}

    def fit_transform(self, documents):
        """Learn the vocabulary from ``documents`` and return their count matrix."""
        # Materialise once so the two passes below see the same items even if
        # `documents` is a one-shot iterable.
        documents = list(documents)
        self.create_vocabulary(documents)
        return self.transform(documents)

    def create_vocabulary(self, documents):
        """Populate ``vocabulary_`` (word -> column index) from ``documents``."""
        excluded = set(self.stop_words) if self.stop_words is not None else set()

        word_count = {}
        for document in documents:
            for word in document.split():
                if word not in excluded:
                    word_count[word] = word_count.get(word, 0) + 1

        # Stable sort: among equally frequent words, first-seen order is kept.
        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
        if self.max_features is not None:
            sorted_words = sorted_words[:self.max_features]
        self.vocabulary_ = {word: index for index, (word, _) in enumerate(sorted_words)}

    def transform(self, documents):
        """Return an integer array of shape (n_documents, vocabulary size).

        Words that are not in ``vocabulary_`` are ignored.
        """
        documents = list(documents)
        # Allocate the matrix up front; this also gives the correct
        # (0, n_features) shape when `documents` is empty.
        counts = np.zeros((len(documents), len(self.vocabulary_)), dtype=int)
        for row, document in enumerate(documents):
            for word in document.split():
                column = self.vocabulary_.get(word)
                if column is not None:
                    counts[row, column] += 1
        return counts



TF-IDF VECTORIZER

In [None]:
class TfidfVectorizer_Scratch(CountVectorizer_Scratch):
    """TF-IDF weighting on top of ``CountVectorizer_Scratch`` counts.

    * tf(t, d)  = count of t in d / total vocabulary-word count of d
    * idf(t)    = log(N / (1 + df(t))), with N documents and df(t) the number
      of documents containing t
    * tfidf     = tf * idf

    With this idf formula a term present in every document gets a negative
    weight and a term missing from exactly one document gets weight 0.

    Only ``fit_transform`` is overridden; the inherited ``transform`` still
    returns raw counts.

    Args:
        max_features: Maximum vocabulary size, or None for no limit.
        stop_words: Optional collection of words removed from each document
            before counting. Must be a collection of strings.
    """

    def __init__(self, max_features=None, stop_words=None):
        if isinstance(stop_words, str):
            # A bare string would be treated as a collection of characters.
            raise TypeError("stop_words must be a collection of words, not a single string")
        # Pass only max_features by position: forwarding stop_words as a second
        # positional argument fails when the parent __init__ accepts just one.
        super().__init__(max_features)
        self.stop_words = stop_words

    def fit_transform(self, documents):
        """Fit the vocabulary and return the TF-IDF matrix as a list of rows."""
        documents = self._drop_stop_words(documents)
        count_matrix = super().fit_transform(documents)
        tf_matrix = self.calculate_tf(count_matrix)
        idf_vector = self.calculate_idf(count_matrix)
        return self.calculate_tfidf(tf_matrix, idf_vector)

    def _drop_stop_words(self, documents):
        """Return ``documents`` with every configured stop word removed."""
        if self.stop_words is None:
            return documents
        excluded = set(self.stop_words)
        return [
            " ".join(word for word in document.split() if word not in excluded)
            for document in documents
        ]

    def calculate_tf(self, count_matrix):
        """Convert each count row into relative term frequencies."""
        tf_matrix = []
        for row in count_matrix:
            total_words = sum(row)
            if total_words == 0:
                # A document with no vocabulary words has no term frequencies;
                # dividing by zero here would yield NaN or raise.
                tf_matrix.append([0.0] * len(row))
            else:
                tf_matrix.append([count / total_words for count in row])
        return tf_matrix

    def calculate_idf(self, count_matrix):
        """Return one ``log(N / (1 + df))`` value per vocabulary column."""
        total_documents = len(count_matrix)
        if total_documents == 0:
            return []

        # Single pass over the matrix instead of one full scan per column.
        document_frequency = [0] * len(count_matrix[0])
        for row in count_matrix:
            for j, count in enumerate(row):
                if count > 0:
                    document_frequency[j] += 1

        return [math.log(total_documents / (1 + df)) for df in document_frequency]

    def calculate_tfidf(self, tf_matrix, idf_vector):
        """Multiply every tf row element-wise by the idf vector."""
        return [
            [tf * idf for tf, idf in zip(tf_vector, idf_vector)]
            for tf_vector in tf_matrix
        ]

COSINE SIMILARITY MATRIX

In [55]:
def cosine_similarity_matrix(vectors):
    """Return the pairwise cosine-similarity matrix of ``vectors``.

    Args:
        vectors: Sequence of equal-length numeric vectors (nested lists or a
            2-D array); one row per item.

    Returns:
        A float array of shape (n, n) whose entry [i][j] is the cosine
        similarity between vectors i and j. Any pair involving an all-zero
        vector gets similarity 0, including a zero vector with itself.
    """
    num_vectors = len(vectors)
    if num_vectors == 0:
        return np.zeros((0, 0))

    # Float copy, one row per vector, so the product below is a single
    # matrix multiplication instead of n*n Python-level function calls.
    matrix = np.asarray(vectors, dtype=float).reshape(num_vectors, -1)

    # Compute each row norm once rather than once per pair.
    norms = np.sqrt(np.sum(np.square(matrix), axis=1))
    # Zero vectors have dot product 0 with everything; dividing by 1 instead
    # of 0 leaves those entries at 0 without producing NaN.
    safe_norms = np.where(norms == 0, 1.0, norms)

    similarity_matrix = matrix @ matrix.T
    # Divide in place to avoid allocating a second n x n array.
    similarity_matrix /= safe_norms[:, None]
    similarity_matrix /= safe_norms[None, :]
    return similarity_matrix

def cosine_similarity_scratch(vector1, vector2):
    """Cosine similarity of two equal-length numeric vectors.

    Computed as the dot product divided by the product of the Euclidean
    norms. Returns 0 when either vector has zero magnitude.
    """
    numerator = np.dot(vector1, vector2)
    norm_a, norm_b = (
        math.sqrt(np.sum(np.square(vec))) for vec in (vector1, vector2)
    )

    # Either norm being zero makes the ratio undefined; report no similarity.
    if not (norm_a and norm_b):
        return 0

    return numerator / (norm_a * norm_b)


In [56]:
# Example Run

# cv = CountVectorizer_Scratch(max_features=5000)
# vector = cv.fit_transform(new_df['transformed_text'])

# similarity_matrix = cosine_similarity_matrix(vector)

# def recommend_scratch(movie):
#     index = new_df[new_df['title'] == movie].index[0]
#     distances = sorted(enumerate(similarity_matrix[index]), reverse=True, key=lambda x: x[1])
#     for i in distances[1:6]:
#         print(new_df.iloc[i[0]]['title'])

# recommend_scratch("Toy Story (1995)")

BAG OF WORDS

In [59]:
class BagOfWords:
    """Bag-of-words vectorizer over whitespace-separated tokens.

    ``vocab`` maps each word to a column index assigned in order of first
    appearance. It is only ever extended, so fitting again adds new words
    after the existing ones.
    """

    def __init__(self):
        self.vocab = {}

    def fit_transform(self, documents):
        """Extend the vocabulary with ``documents`` and return their count vectors."""
        self.create_vocabulary(documents)
        return self.transform(documents)

    def create_vocabulary(self, documents):
        """Give every unseen word the next free column index."""
        for document in documents:
            for word in document.split():
                # setdefault leaves an existing index untouched.
                self.vocab.setdefault(word, len(self.vocab))

    def transform(self, documents):
        """Return one list of per-word counts for each document.

        Words missing from ``vocab`` are ignored.
        """
        width = len(self.vocab)
        document_vectors = []
        for document in documents:
            counts = [0] * width
            for word in document.split():
                slot = self.vocab.get(word)
                if slot is not None:
                    counts[slot] += 1
            document_vectors.append(counts)
        return document_vectors


documents = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document"
]

bow = BagOfWords()
bow_matrix = bow.fit_transform(documents)
print(bow_matrix)


[[1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 2, 1, 0, 0, 0], [1, 1, 1, 0, 0, 0, 1, 1, 1], [1, 1, 1, 1, 1, 0, 0, 0, 0]]
