# Movie Recomendation systems

It is a content based recomendation system that suggest or recommend moview to the user based on short description, keywords, title and actor names

### Import libraries

In [1]:
import re
import ast
import nltk
import string
import numpy as np
import pandas as pd
from textblob import TextBlob
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords and wordnet if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\palma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\palma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import os
os.chdir('../')

In [6]:
%pwd

'e:\\Movie recommendation system'

### Load dataset

In [13]:
# load dataset

df_credits = pd.read_csv('artifacts/raw/tmdb_5000_credits.csv')
df_movies = pd.read_csv('artifacts/raw/tmdb_5000_movies.csv')

df = df_movies.merge(df_credits, left_on='id', right_on='movie_id')

In [7]:
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title_x,vote_average,vote_count,movie_id,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [7]:
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'movie_id', 'title_y', 'cast', 'crew'],
      dtype='object')

Important columns

1. id
2. title
3. overview
4. genres
5. cast
6. crew
7. keywords

In [14]:
df = df[[
"id",
"original_title",
"overview",
"genres",
"cast",
"crew",
"keywords",
'vote_average',
'release_date'
]]

In [14]:
df.head(1)

Unnamed: 0,id,original_title,overview,genres,cast,crew,keywords,vote_average,release_date
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",7.2,2009.0


In [15]:
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').dt.year

### Data cleaning and formating

In [16]:
# format genres column
def convert_to_list(row):
    l = []
    for i in ast.literal_eval(row):
        l.append(i['name'])
    return ' '.join(l)
df['genres'] = df['genres'].apply(convert_to_list)

In [17]:
# format cast column
def convert_to_list_cast(row):
    l= []
    flag = 0
    for i in ast.literal_eval(row):
        if flag == 5:
            break
        flag+=1
        l.append(i['character'])
        l.append(i['name'])
    return ' '.join(l)
df['cast'] = df['cast'].apply(convert_to_list_cast)

In [18]:
# ast.literal_eval(df['crew'][0])
# format crew column

def convert_to_list_crew(row):
    l = []
    for i in ast.literal_eval(row):
        if i['job'] == 'Director':
            l.append(i['name'])
            break
    return ' '.join(l)
df['crew'] = df['crew'].apply(convert_to_list_crew)

In [19]:
# kew=yword column formating
df['keywords'] = df['keywords'].apply(convert_to_list)

In [20]:
df.rename({'original_title':'title'}, axis=1, inplace=True)

In [21]:
df['tags'] = df['title'] + df['overview'] + df['genres'] + df['cast'] + df['crew'] + df['keywords']

In [23]:
new_df = df[['id', 'title', 'tags', 'overview', 'genres', 'crew', 'cast', 'vote_average', 'release_date']].dropna()

In [30]:
new_df.drop(columns=['cast'], inplace=True)

In [31]:
new_df.to_csv('artifacts/raw/movies.csv', index=False)

### Basic text preprocessing

In [26]:
class BasicTextPreprocessing:  
    def __init__(self):  
        pass 

    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):  
        
        def keep_text_only(input_string):   
            result = re.findall(r'[a-zA-Z\s]+', input_string)   
            return ''.join(result).strip()    
        
        def remove_urls(text):    
            if isinstance(text, str):  # Check if the text is a string  
                url_pattern = r'\b(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:\/[^\s]*)?\b'    
                cleaned_text = re.sub(url_pattern, '', text)   
                return ' '.join(cleaned_text.split())  
            return text  # Return as is for non-string inputs  
            
        def remove_punctuation(row):   
            return row.translate(str.maketrans('', '', string.punctuation)) 

        def spell_correction(row):
            l = []
            for i in row.split():
                l.append(str(TextBlob(i).correct()))
            return ' '.join(l)
            
        def remove_stopwords(text):
            stop_words = set(stopwords.words('english'))
            return ' '.join([i for i in text.split() if i.lower() not in stop_words])

        def lemmatization(row):  
            lemmatizer = WordNetLemmatizer()
            l = [lemmatizer.lemmatize(i) for i in row.split()]  
            return ' '.join(l)
            
        X = X.str.lower()  
        X = X.apply(keep_text_only)  
        X = X.apply(remove_urls)  
        X = X.apply(remove_punctuation)    
        X = X.apply(remove_stopwords)  
        X = X.apply(lemmatization)
        return X.values
        
class CosineSimilarity:
    def __init__(self):
        return None
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return cosine_similarity(X)
        
# Creating pipeline
tokenizer = Pipeline(steps=[
    ('basic preprocessing', BasicTextPreprocessing()),
    ('count vectorization', CountVectorizer(max_features=100)),
    ('tf-idf', TfidfTransformer())
    # ('cosign similarity', CosineSimilarity())
])

# Assuming df is defined AttributeErrorand 'tags' is a column
similarity = tokenizer.fit_transform(new_df['tags']).toarray() # Fixed for Series input

def recommend(text):
    a = tokenizer.transform(pd.Series([text]))
    distances = np.dot(similarity, a.T).ravel()/(np.linalg.norm(similarity, axis=1)*(np.linalg.norm(a)))
    top_movie_index = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    return top_movie_index


ValueError: dimension mismatch

In [22]:
import os

# Function to get file size
def get_file_size(file_path):
    file_size = os.path.getsize(file_path)
    return file_size

# Example usage
file_path = 'artifacts/processed/data/transformed_data.npy'
file_size = get_file_size(file_path)
print((file_size/1024)/1024, 'MB')

183.1055908203125 MB


In [None]:
def recommend(text):
    a = tokenizer.transform(pd.Series([text]))
    distances = np.dot(similarity, a.T).ravel()/(np.linalg.norm(similarity, axis=1)*(np.linalg.norm(a)))
    top_movie_index = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    return top_movie_index
recommend('a horror movie with love story and romance')

In [137]:
text = 'a horror love story movie'
vector = tokenizer.transform(pd.Series([text]))
a = vector.toarray()

In [140]:
similarity.shape, a.shape

((4800, 10000), (1, 10000))

In [156]:
distances = np.dot(similarity, a.T).ravel()/(np.linalg.norm(similarity, axis=1)*(np.linalg.norm(a)))

In [25]:
df[df['title'] == 'The Dark Knight']['overview'].index[0]

65

In [34]:
a = np.array([1, 2, 3, 4,5])
np.where(a != 3)[0]

array([0, 1, 3, 4], dtype=int64)

In [1]:
import requests

In [15]:
import requests

url = "https://api.themoviedb.org/3/movie/63?api_key=f5f0e091654432696b191938d11e63df"

headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJmNWYwZTA5MTY1NDQzMjY5NmIxOTE5MzhkMTFlNjNkZiIsIm5iZiI6MTcyNjY5ODEwOS4yNDczNTcsInN1YiI6IjY2ZWI0ZmE5NWMwNTE5YTIzNGQzYWRhYyIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.EzOK_WjEgmeL1GSbSUwH_9DLb8xvl-I_Ezp42LJyFw4"
}

response = requests.get(url, headers=headers)

response.json()

{'adult': False,
 'backdrop_path': '/1IWaKG7AWiYMhADxhGtnElDJAGI.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 53, 'name': 'Thriller'},
  {'id': 9648, 'name': 'Mystery'}],
 'homepage': '',
 'id': 63,
 'imdb_id': 'tt0114746',
 'origin_country': ['US'],
 'original_language': 'en',
 'original_title': 'Twelve Monkeys',
 'overview': "In the year 2035, convict James Cole reluctantly volunteers to be sent back in time to discover the origin of a deadly virus that wiped out nearly all of the earth's population and forced the survivors into underground communities. But when Cole is mistakenly sent to 1990 instead of 1996, he's arrested and locked up in a mental hospital. There he meets psychiatrist Dr. Kathryn Railly, and patient Jeffrey Goines, the son of a famous virus expert, who may hold the key to the mysterious rogue group, the Army of the 12 Monkeys, thought to be responsible for unleashing the killer disease.",
 '

In [16]:
import ast
from IPython.display import Image, display

image_url = 'https://image.tmdb.org/t/p/w500'+ response.json()['poster_path']

image = Image(url=image_url)
display(image)

In [32]:
import requests
def fetch_movie_image(movie_id):
    images_url = []
    for id in movie_id:
        url = f'https://api.themoviedb.org/3/movie/{id}?api_key=f5f0e091654432696b191938d11e63df'

        headers = {
            "accept": "application/json",
            "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJmNWYwZTA5MTY1NDQzMjY5NmIxOTE5MzhkMTFlNjNkZiIsIm5iZiI6MTcyNjY5ODEwOS4yNDczNTcsInN1YiI6IjY2ZWI0ZmE5NWMwNTE5YTIzNGQzYWRhYyIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.EzOK_WjEgmeL1GSbSUwH_9DLb8xvl-I_Ezp42LJyFw4"
        }

        response = requests.get(url, headers=headers)
        print(response)
        image_url = 'https://image.tmdb.org/t/p/w500'+ response.json()['poster_path']
        print(image_url)
        images_url.append(image_url)
    return images_url
url = fetch_movie_image([49026, 272, 102899, 855])

<Response [200]>
https://image.tmdb.org/t/p/w500/hr0L2aueqlP2BYUblTTjmtn0hw4.jpg
<Response [200]>
https://image.tmdb.org/t/p/w500/4MpN4kIEqUjW8OPtOQJXlTdHiJV.jpg
<Response [200]>
https://image.tmdb.org/t/p/w500/rQRnQfUl3kfp78nCWq8Ks04vnq1.jpg
<Response [200]>
https://image.tmdb.org/t/p/w500/7fU5dSqKRL4XHeEUz62rCKBfYok.jpg


In [31]:
url

['https://image.tmdb.org/t/p/w500/zj8ongFhtWNsVlfjOGo8pSr7PQg.jpg',
 'https://image.tmdb.org/t/p/w500/pFEtVPW88pWflYV84UFL0h1iJr3.jpg']

In [33]:
temp_df = new_df.iloc[:5]

In [35]:
for i in temp_df:
    print(i)

id
title
tags
overview
genres
crew
vote_average
release_date


In [1]:
import os
os.chdir('../')

In [2]:
pwd

'e:\\Github repositories\\movie-recommendation-system'

In [3]:
import numpy as np

In [15]:
data = np.load('artifacts/processed/data/transformed_data.npy')

In [16]:
data_size_mb = data.nbytes / (1024 * 1024)
print(f"Size of data: {data_size_mb} MB")

Size of data: 45.76683044433594 MB


In [11]:
data.dtype

dtype('float32')

In [12]:
import numpy as np

print("Smallest positive subnormal number:", np.finfo(np.float16).tiny)
print("Smallest positive normal number:", np.finfo(np.float16).min)
print("Largest positive number:", np.finfo(np.float16).max)
print("Smallest negative number:", -np.finfo(np.float16).max)

Smallest positive subnormal number: 6.104e-05
Smallest positive normal number: -65500.0
Largest positive number: 65500.0
Smallest negative number: -65500.0
