# Recommendation Model Training

In [11]:
import numpy as np
import pandas as pd

In [12]:
movies = pd.read_csv('datasets/movieInfo.csv')
movies.head()

Unnamed: 0,imdbID,Title,Genres,Plot,Directors,Writers,Actors,Language,Country,Runtime
tt0000013,The Photographical Congress Arrives in Lyon,"Documentary, Short",The photographers who need to participate in t...,LouisLumière,"AugusteLumière, P.J.C.Janssen",,,France,movie,['1']
tt0000005,Blacksmith Scene,"Short, Comedy",Three men hammer on an anvil and pass a bottle...,WilliamK.L.Dickson,"CharlesKayser, JohnOtt",,,United States,short,['1']
tt0000001,Carmencita,"Documentary, Short",Performing on what looks like a small wooden s...,WilliamK.L.Dickson,Carmencita,,,United States,short,['1']
tt0000003,Poor Pierrot,"Animation, Comedy, Short, Romance","One night, Arlequin come to see his lover Colo...",ÉmileReynaud,,,,France,short,['4']
tt0000007,Corbett and Courtney Before the Kinetograph,"Short, Sport",James J. Corbett and Peter Courtney meet in a ...,"WilliamK.L.Dickson, WilliamHeise","JamesJ.Corbett, PeterCourtney",,,United States,short,['1']


In [13]:
movies.shape

(2215, 10)

## [spaCy](https://spacy.io/)
`spaCy` is a library for advanced **Natural Language Processing** in Python and Cython. It's built on the very latest research, and was designed from day one to be used in real products.

`spaCy` comes with [pretrained pipelines](https://spacy.io/models) and currently supports tokenization and training for 70+ languages. It features state-of-the-art speed and neural network models for tagging, parsing, named entity recognition, text classification and more, multi-task learning with pretrained transformers like BERT, as well as a production-ready [training system](https://spacy.io/usage/training_) and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE)

In [8]:
try:
    !pip show spacy
    !python -m spacy info en_core_web_sm
except:
    !pip install spacy
    !python -m spacy download en_core_web_sm

Name: spacy
Version: 3.7.2
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: C:\Users\anujj\AppData\Local\Programs\Python\Python311\Lib\site-packages
Requires: catalogue, cymem, jinja2, langcodes, murmurhash, numpy, packaging, preshed, pydantic, requests, setuptools, smart-open, spacy-legacy, spacy-loggers, srsly, thinc, tqdm, typer, wasabi, weasel
Required-by: en-core-web-sm
[1m

lang                en                            
name                core_web_sm                   
version             3.7.0                         
description         English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.
author              Explosion                     
email               contact@explosion.ai          
url                 https://explosion.ai          
license             MIT                        

In [9]:
import spacy

# Load the English language model
nlp = spacy.load('en_core_web_sm')

def extract_and_combine_keywords(plot, title):
    doc_plot = nlp(plot)
    doc_title = nlp(title)

    keywords_plot = [token.text for token in doc_plot if token.is_alpha and not token.is_stop]
    keywords_title = [token.text for token in doc_title if token.is_alpha and not token.is_stop]

    combined_keywords = keywords_plot + keywords_title
    return ", ".join(combined_keywords)  # Join keywords into comma-separated string

movies['Keywords'] = movies.apply(lambda row: extract_and_combine_keywords(row['Plot'], row['Title']), axis=1)

In [10]:
movies['Tags'] = movies[['Keywords', 'Genres', 'Directors', 'Writers', 'Actors', 'Language', 'Country']].apply(lambda x: ' '.join(map(str, x)), axis=1)
movies.head()

Unnamed: 0,imdbID,Title,Genres,Plot,Directors,Writers,Actors,Language,Country,Runtime,Keywords,Tags
tt0000013,The Photographical Congress Arrives in Lyon,"Documentary, Short",The photographers who need to participate in t...,LouisLumière,"AugusteLumière, P.J.C.Janssen",,,France,movie,['1'],"LouisLumière, Documentary, Short","LouisLumière, Documentary, Short The photograp..."
tt0000005,Blacksmith Scene,"Short, Comedy",Three men hammer on an anvil and pass a bottle...,WilliamK.L.Dickson,"CharlesKayser, JohnOtt",,,United States,short,['1'],"Short, Comedy","Short, Comedy Three men hammer on an anvil and..."
tt0000001,Carmencita,"Documentary, Short",Performing on what looks like a small wooden s...,WilliamK.L.Dickson,Carmencita,,,United States,short,['1'],"Documentary, Short","Documentary, Short Performing on what looks li..."
tt0000003,Poor Pierrot,"Animation, Comedy, Short, Romance","One night, Arlequin come to see his lover Colo...",ÉmileReynaud,,,,France,short,['4'],"ÉmileReynaud, Animation, Comedy, Short, Romance","ÉmileReynaud, Animation, Comedy, Short, Romanc..."
tt0000007,Corbett and Courtney Before the Kinetograph,"Short, Sport",James J. Corbett and Peter Courtney meet in a ...,"WilliamK.L.Dickson, WilliamHeise","JamesJ.Corbett, PeterCourtney",,,United States,short,['1'],"WilliamHeise, Short, Sport","WilliamHeise, Short, Sport James J. Corbett an..."


# [scikit-learn](https://scikit-learn.org)
`scikit-learn` is a Python module for **MACHINE LEARNING** built on top of SciPy and is distributed under the 3-Clause BSD license.

The project was started in 2007 by David Cournapeau as a Google Summer of Code project, and since then many volunteers have contributed. See the [About us](https://scikit-learn.org/dev/about.html#authors) page for a list of core contributors.

In [4]:
try:
    !pip show sklearn
except ImportError:
    !pip install scikit-learn

Name: sklearn
Version: 0.0.post9
Summary: deprecated sklearn package, use scikit-learn instead
Home-page: 
Author: 
Author-email: 
License: 
Location: C:\Users\anujj\AppData\Local\Programs\Python\Python311\Lib\site-packages
Requires: 
Required-by: 


## [sklearn.feature_extraction.text.CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
`CountVectorizer` converts a collection of text documents to a matrix of token counts.

This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will be equal to the vocabulary size found by analyzing the data.

## [sklearn.metrics.pairwise.cosine_similarity](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html)
`cosine_similarity` computes the L2-normalized dot product of vectors. That is, if **x** and **y** are row vectors, their cosine similarity **k** is defined as:
 $$k(x, y) = \frac{x y^\top}{\|x\| \|y\|}$$

 
This is called cosine similarity, because Euclidean (L2) normalization projects the vectors onto the unit sphere, and their dot product is then the cosine of the angle between the points denoted by the vectors.

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Create a sparse matrix
cv = CountVectorizer(max_features=1000, stop_words='english')
vector = cv.fit_transform(movies['Tags'])

n = vector.shape[0]
chunk_size = 1000       #Define the chunk size
N = 10                  # Number of top similar vectors to store
similarityVector = {}   # Calculate cosine similarity in chunks and populate the top N similar vectors

In [16]:
# A generator to calculate cosine similarity
def calculate_cosine_similarity(vector):
    n = vector.shape[0]
    for i in range(0, n, chunk_size):
        end = min(i + chunk_size, n)
        similarity_chunk = cosine_similarity(vector[i:end], vector)
        yield similarity_chunk

In [17]:
for i, similarity_chunk in enumerate(calculate_cosine_similarity(vector)):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, n)
    for j in range(start, end):
        similarity_vector = similarity_chunk[j - start]
        # Get the indices of the top N most similar vectors
        top_n_indices = np.argpartition(similarity_vector, -N)[-N:]
        # Store the indices and their corresponding similarities
        similarityVector[j] = [(index, similarity_vector[index]) for index in top_n_indices]

In [22]:
def recommend(similarityVector, movies, movie):
    if movie in movies['Title'].values:
        index = movies[movies['Title'] == movie].index[0]
        distances = similarityVector[index]
        distances.sort(key=lambda x: x[1], reverse=True)
        recommended_movies = [movies.iloc[i[0]]['Title'] for i in distances]
        return recommended_movies
    else:
        return f"Movie '{movie}' not found in the database."

In [27]:
import pickle
import os

os.makedirs('models', exist_ok=True)

with open('models/movieList.pkl', 'wb') as movieFile:
    pickle.dump(movies, movieFile)

with open('models/similarity.pkl', 'wb') as similarityFile:
    pickle.dump(similarityVector, similarityFile)

In [28]:
recommend(similarityVector, movies, 'Carmencita')

['Carmencita',
 'A Colour Box',
 'Edison Kinetoscopic Record of a Sneeze',
 'The X-Ray Fiend',
 'Diagonal Symphony',
 'New York Subway',
 "Autour d'une cabine",
 'Rain',
 "Cordeliers' Square in Lyon",
 'Blacksmith Scene']