# Movie Recommendation System

It is a content-based recommendation system that recommends movies to the user based on each movie's short description, keywords, title, and actor names.

### Import libraries

In [1]:
import re
import ast
import nltk
import string
import numpy as np
import pandas as pd
from textblob import TextBlob
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords and wordnet if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\palma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\palma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import os
# Step up one directory so the relative 'artifacts/...' paths used below resolve
# (presumably the notebook is stored in a subfolder of the project root — confirm).
# NOTE(review): the path is relative, so re-running this cell moves up one more level each time.
os.chdir('../')

In [5]:
%pwd

'e:\\Movie recommendation system'

### Load dataset

In [12]:
# Read the two raw TMDB CSV exports.
df_credits = pd.read_csv('artifacts/raw/tmdb_5000_credits.csv')
df_movies = pd.read_csv('artifacts/raw/tmdb_5000_movies.csv')

# Join them on the movie identifier: movies.id <-> credits.movie_id.
df = pd.merge(
    df_movies,
    df_credits,
    left_on='id',
    right_on='movie_id',
)

In [7]:
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'movie_id', 'title_y', 'cast', 'crew'],
      dtype='object')

Important columns

1. id
2. title
3. overview
4. genres
5. cast
6. crew
7. keywords

In [13]:
# Keep only the columns that feed the recommender.
# .copy() makes the result an independent frame: the following cells assign
# new values into its columns, and doing that on a bare selection of the
# merged frame makes pandas emit SettingWithCopyWarning.
df = df[[
    "id",
    "original_title",
    "overview",
    "genres",
    "cast",
    "crew",
    "keywords",
]].copy()

### Data cleaning and formatting

In [14]:
# format genres column
def convert_to_list(row):
    """Parse a stringified list of dicts and join their 'name' values with spaces."""
    return ' '.join([entry['name'] for entry in ast.literal_eval(row)])
# Replace each stringified genre list with a space-separated string of the genre names.
df['genres'] = df['genres'].apply(convert_to_list)

In [15]:
# format cast column
def convert_to_list_cast(row):
    """Flatten the first five cast entries into one space-separated string.

    ``row`` is a string holding a Python-literal list of dicts. For each of
    the first five entries the 'character' value is emitted, followed by the
    'name' value.
    """
    parts = []
    for position, member in enumerate(ast.literal_eval(row)):
        if position == 5:
            # Only the first five entries are used.
            break
        parts.extend((member['character'], member['name']))
    return ' '.join(parts)
# Replace each stringified cast list with the character/actor names of its first five entries.
df['cast'] = df['cast'].apply(convert_to_list_cast)

In [111]:
# ast.literal_eval(df['crew'][0])
# format crew column

def convert_to_list_crew(row):
    """Return the 'name' of the first crew entry whose 'job' is 'Director'.

    ``row`` is a string holding a Python-literal list of dicts. An empty
    string is returned when no entry has that job.
    """
    directors = (
        member['name']
        for member in ast.literal_eval(row)
        if member['job'] == 'Director'
    )
    for director in directors:
        # Only the first match is used.
        return ' '.join([director])
    return ''
# Replace each stringified crew list with the name of its first 'Director' entry ('' if there is none).
df['crew'] = df['crew'].apply(convert_to_list_crew)

In [16]:
# keywords column formatting: reuses the genres helper, which joins each entry's 'name' value
df['keywords'] = df['keywords'].apply(convert_to_list)

In [17]:
# Expose the 'original_title' column under the shorter name 'title'.
df.rename(columns={'original_title': 'title'}, inplace=True)

In [18]:
# Build the text each movie is matched on.
# The parts are joined with a single space: plain `+` glued the last word of
# one column to the first word of the next, producing tokens that match
# nothing in a query. A missing value in any part still makes the whole tag
# NaN, exactly as before.
df['tags'] = (
    df['title'] + ' '
    + df['overview'] + ' '
    + df['genres'] + ' '
    + df['cast'] + ' '
    + df['crew'] + ' '
    + df['keywords']
)

In [19]:
# Final table: id, title and tags, without any row that has a missing value in one of them.
new_df = df.loc[:, ['id', 'title', 'tags']].dropna(axis=0, how='any')

In [20]:
# Persist the id/title/tags table as CSV, without the index column.
# NOTE(review): this derived file is written next to the raw inputs under artifacts/raw — confirm that is intended.
new_df.to_csv('artifacts/raw/movies.csv', index=False)

### Basic text preprocessing

In [138]:
class BasicTextPreprocessing:
    """Pipeline step that normalises raw text before vectorisation.

    Exposes ``fit``/``transform`` so it can be placed in a scikit-learn
    ``Pipeline``. The step is stateless: ``fit`` learns nothing.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Clean every text in ``X``.

        Steps, in order: lower-case, drop URLs, drop punctuation, keep only
        letters and whitespace, drop English stop words, lemmatize.

        Args:
            X: pandas Series of strings, or any iterable of strings (it is
                wrapped in a Series).
            y: Ignored.

        Returns:
            ``Series.values`` of the cleaned texts.
        """
        # Built once per call; they were previously rebuilt for every row.
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()

        def keep_text_only(input_string):
            # Keep runs of ASCII letters/whitespace and glue them back together.
            result = re.findall(r'[a-zA-Z\s]+', input_string)
            return ''.join(result).strip()

        def remove_urls(text):
            if isinstance(text, str):  # Check if the text is a string
                url_pattern = r'\b(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:\/[^\s]*)?\b'
                cleaned_text = re.sub(url_pattern, '', text)
                # Collapse the whitespace left behind by the removed matches.
                return ' '.join(cleaned_text.split())
            return text  # Return as is for non-string inputs

        def remove_punctuation(row):
            return row.translate(str.maketrans('', '', string.punctuation))

        def spell_correction(row):
            # Optional helper: defined but not applied in the chain below.
            l = []
            for i in row.split():
                l.append(str(TextBlob(i).correct()))
            return ' '.join(l)

        def remove_stopwords(text):
            return ' '.join([i for i in text.split() if i.lower() not in stop_words])

        def lemmatization(row):
            l = [lemmatizer.lemmatize(i) for i in row.split()]
            return ' '.join(l)

        if not isinstance(X, pd.Series):
            # Allow plain lists/arrays; the steps below use the Series API.
            X = pd.Series(X, dtype=object)

        X = X.str.lower()
        # URL removal has to run before keep_text_only: that step strips the
        # '.', '/', ':' and digits the URL pattern depends on, so in the old
        # order the URL step could never match anything.
        X = X.apply(remove_urls)
        X = X.apply(remove_punctuation)
        X = X.apply(keep_text_only)
        X = X.apply(remove_stopwords)
        X = X.apply(lemmatization)
        return X.values
        
class CosineSimilarity:
    """Pipeline-style step that maps a feature matrix to ``cosine_similarity(X)``."""

    def __init__(self):
        # Stateless: nothing to configure.
        pass

    def fit(self, X, y=None):
        """No fitting required; hand back the instance itself."""
        return self

    def transform(self, X, y=None):
        """Return the result of ``cosine_similarity`` applied to ``X``."""
        pairwise = cosine_similarity(X)
        return pairwise
        
# Creating pipeline: text cleaning -> term counts (at most 10000 features) -> TF-IDF weighting
tokenizer = Pipeline(steps=[
    ('basic preprocessing', BasicTextPreprocessing()),
    ('count vectorization', CountVectorizer(max_features=10000)),
    ('tf-idf', TfidfTransformer())
    # ('cosign similarity', CosineSimilarity())
    # (the cosine-similarity step above is disabled, so the pipeline output is the TF-IDF matrix)
])

# new_df['tags'] is passed as a pandas Series because the preprocessing step uses the Series .str/.apply API.
# NOTE: despite its name, `similarity` holds the dense TF-IDF matrix (one row per movie),
# not pairwise similarities.
similarity = tokenizer.fit_transform(new_df['tags']).toarray() # Fixed for Series input

def recommend(text, top_n=5):
    """Rank the movies by cosine similarity to a free-text query.

    Args:
        text: Query string; it is run through the fitted ``tokenizer`` pipeline.
        top_n: Number of results to return (default 5).

    Returns:
        List of ``(row_index, score)`` tuples, best match first, where
        ``row_index`` is the row position in the module-level ``similarity``
        matrix. Empty list when the query vector is all zeros (none of its
        terms are in the fitted vocabulary).
    """
    # The pipeline returns a sparse row; np.linalg.norm and the dot product
    # below need a dense 1-D array.
    query = tokenizer.transform(pd.Series([text])).toarray().ravel()
    query_norm = np.linalg.norm(query)
    if query_norm == 0:
        # Nothing to compare against; avoids dividing by zero.
        return []

    dots = similarity @ query
    denominators = np.linalg.norm(similarity, axis=1) * query_norm
    # Rows with a zero norm get score 0 instead of NaN.
    scores = np.divide(
        dots,
        denominators,
        out=np.zeros_like(dots, dtype=float),
        where=denominators != 0,
    )

    # Start at the best match: the previous [1:6] slice skipped it, which only
    # makes sense when the query is itself one of the indexed rows.
    # Stable sort keeps ties in ascending row order.
    order = np.argsort(-scores, kind='stable')[:top_n]
    return [(int(i), float(scores[i])) for i in order]
recommend('a horror movie with love story and romance')

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
def recommend(text, top_n=5):
    """Rank the movies by cosine similarity to a free-text query.

    Args:
        text: Query string; it is run through the fitted ``tokenizer`` pipeline.
        top_n: Number of results to return (default 5).

    Returns:
        List of ``(row_index, score)`` tuples, best match first, where
        ``row_index`` is the row position in the module-level ``similarity``
        matrix. Empty list when the query vector is all zeros (none of its
        terms are in the fitted vocabulary).
    """
    # The pipeline returns a sparse row; np.linalg.norm and the dot product
    # below need a dense 1-D array.
    query = tokenizer.transform(pd.Series([text])).toarray().ravel()
    query_norm = np.linalg.norm(query)
    if query_norm == 0:
        # Nothing to compare against; avoids dividing by zero.
        return []

    dots = similarity @ query
    denominators = np.linalg.norm(similarity, axis=1) * query_norm
    # Rows with a zero norm get score 0 instead of NaN.
    scores = np.divide(
        dots,
        denominators,
        out=np.zeros_like(dots, dtype=float),
        where=denominators != 0,
    )

    # Start at the best match: the previous [1:6] slice skipped it, which only
    # makes sense when the query is itself one of the indexed rows.
    # Stable sort keeps ties in ascending row order.
    order = np.argsort(-scores, kind='stable')[:top_n]
    return [(int(i), float(scores[i])) for i in order]
recommend('a horror movie with love story and romance')

In [137]:
# Vectorise one free-text query with the fitted pipeline.
text = 'a horror love story movie'
vector = tokenizer.transform(pd.Series([text]))
# Dense copy of the query vector, used with np.dot / np.linalg.norm in the cells below.
a = vector.toarray()

In [140]:
# Sanity check: both arrays need the same number of columns for the dot product below.
similarity.shape, a.shape

((4800, 10000), (1, 10000))

In [156]:
# Cosine similarity between the query vector and every row of `similarity`:
# dot product divided by the product of the two vector norms.
# Despite the name, larger values mean a closer match.
# NOTE(review): a zero-norm row or query makes the denominator 0 — presumably yields NaN; confirm.
distances = np.dot(similarity, a.T).ravel()/(np.linalg.norm(similarity, axis=1)*(np.linalg.norm(a)))