## Dataset Information
Content based filtering movie recommendation project. Dataset is collected from first top 250 movies on IMDB.

## Import Modules

In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Loading the Dataset

In [2]:
# Specify the file path
adress = r'IMDB-Top250movies2.csv'

# Read the file using pandas
df = pd.read_csv(adress)

# Print first five rows
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",...,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",...,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",...,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",...,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",...,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         250 non-null    int64  
 1   Title              250 non-null    object 
 2   Year               250 non-null    int64  
 3   Rated              246 non-null    object 
 4   Released           247 non-null    object 
 5   Runtime            250 non-null    object 
 6   Genre              250 non-null    object 
 7   Director           250 non-null    object 
 8   Writer             250 non-null    object 
 9   Actors             250 non-null    object 
 10  Plot               250 non-null    object 
 11  Language           249 non-null    object 
 12  Country            250 non-null    object 
 13  Awards             248 non-null    object 
 14  Poster             250 non-null    object 
 15  Ratings.Source     250 non-null    object 
 16  Ratings.Value      250 non

## Data Preprocessing

In [4]:
# convert to lower case
df['clean_plot'] = df['Plot'].str.lower()

# replace any charachters other than alphabet with a space
df['clean_plot'] = df['clean_plot'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

# replace two or more than two spaces with one space
df['clean_plot'] = df['clean_plot'].apply(lambda x: re.sub('\s+', ' ', x))

df['clean_plot']

0      two imprisoned men bond over a number of years...
1      the aging patriarch of an organized crime dyna...
2      the early life and career of vito corleone in ...
3      when the menace known as the joker emerges fro...
4      a jury holdout attempts to prevent a miscarria...
                             ...                        
245    blacksmith will turner teams up with eccentric...
246    a former child star torments her paraplegic si...
247    travis henderson an aimless drifter who has be...
248    desperate measures are taken by a man who trie...
249    a stranger in the city asks questions no one h...
Name: clean_plot, Length: 250, dtype: object

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Tokenizing
df['clean_plot'] = df['clean_plot'].apply(lambda x: nltk.word_tokenize(x))

df['clean_plot']

0      [two, imprisoned, men, bond, over, a, number, ...
1      [the, aging, patriarch, of, an, organized, cri...
2      [the, early, life, and, career, of, vito, corl...
3      [when, the, menace, known, as, the, joker, eme...
4      [a, jury, holdout, attempts, to, prevent, a, m...
                             ...                        
245    [blacksmith, will, turner, teams, up, with, ec...
246    [a, former, child, star, torments, her, parapl...
247    [travis, henderson, an, aimless, drifter, who,...
248    [desperate, measures, are, taken, by, a, man, ...
249    [a, stranger, in, the, city, asks, questions, ...
Name: clean_plot, Length: 250, dtype: object

In [7]:
# remove stop words
stop_words = nltk.corpus.stopwords.words('english')
plot = []
for sentence in df['clean_plot']:
    temp = []
    for word in sentence:
        if word not in stop_words or len(word) > 3:
            temp.append(word)
    plot.append(temp)

In [8]:
df['clean_plot'] = plot

In [9]:
df['clean_plot']

0      [two, imprisoned, men, bond, over, number, yea...
1      [aging, patriarch, organized, crime, dynasty, ...
2      [early, life, career, vito, corleone, new, yor...
3      [when, menace, known, joker, emerges, from, my...
4      [jury, holdout, attempts, prevent, miscarriage...
                             ...                        
245    [blacksmith, will, turner, teams, with, eccent...
246    [former, child, star, torments, paraplegic, si...
247    [travis, henderson, aimless, drifter, been, mi...
248    [desperate, measures, taken, man, tries, save,...
249    [stranger, city, asks, questions, one, asked, ...
Name: clean_plot, Length: 250, dtype: object

In [10]:
# split each string in the 'Genre' column by the ',' character.
df['Genre'] = df['Genre'].apply(lambda x: x.split(','))

# split top 4 string in the 'Actors' column by the ',' character.
df['Actors'] = df['Actors'].apply(lambda x: x.split(',')[:4])

# split each string in the 'Director' column by the ',' character.
df['Director'] = df['Director'].apply(lambda x: x.split(',')[:4])

In [11]:
# funtion to convert to lower case and replacing the space
def clean(sentence):
    temp = []
    for word in sentence:
        temp.append(word.lower().replace(' ',''))
    return temp

In [12]:
# convert to lower case and replacing the space in each columns
df['Genre'] = [clean(x) for x in df['Genre']]
df['Actors'] = [clean(x) for x in df['Actors']]
df['Director'] = [clean(x) for x in df['Director']]

In [13]:
# combining all the columns data
columns = ['clean_plot', 'Genre', 'Actors', 'Director']
l = []
for i in range(len(df)):
    words = ''
    for col in columns:
        words += ' '.join(df[col][i]) + ' '
    l.append(words)

In [14]:
df['clean_input'] = l
df = df[['Title', 'clean_input']]

In [15]:
df.head()

Unnamed: 0,Title,clean_input
0,The Shawshank Redemption,two imprisoned men bond over number years find...
1,The Godfather,aging patriarch organized crime dynasty transf...
2,The Godfather: Part II,early life career vito corleone new york portr...
3,The Dark Knight,when menace known joker emerges from mysteriou...
4,12 Angry Men,jury holdout attempts prevent miscarriage just...


### Feature Extraction

In [16]:
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(df['clean_input'])

In [17]:
# create cosine similarity matirx
cosine_sim = cosine_similarity(features, features)

## Movie Recommendation

In [18]:
index = pd.Series(df['Title'])

In [19]:
def recommend_movies(title):
    movies = []
    idx = index[index == title].index[0]
    score = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top10 = list(score.iloc[1:11].index)

    for i in top10:
        movies.append(df['Title'][i])
    return movies
    

In [20]:
recommend_movies('The Dark Knight Rises')

['The Dark Knight',
 'Batman Begins',
 'Inception',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Star Wars: Episode IV - A New Hope',
 'Drishyam',
 'Django Unchained',
 'Star Wars: Episode VI - Return of the Jedi',
 'Die Hard',
 'Mad Max: Fury Road']

In [21]:
recommend_movies('The Godfather')

['The Godfather: Part II',
 'Apocalypse Now',
 'On the Waterfront',
 'Scarface',
 'Casino',
 'Wild Strawberries',
 'Rashomon',
 'All About Eve',
 'Heat',
 'The 400 Blows']