## Dataset Information

   Data on movies from IMDb (includes some television as well). The movie IDs used to gather much of this data come from one or two Kaggle projects. A workflow takes the original, cobbled-together spreadsheets to the final product, with 27 variables and over 5000 observations. \
   Content-based filtering

## Import Modules

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
pd.set_option('display.max_columns', None)

In [None]:
# Load the movie metadata CSV (the path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv("IMDB_Top250Engmovies2_OMDB_Detailed.csv")
df.head()

In [None]:
# Number of rows loaded from the CSV.
len(df)

In [None]:
# Inspect the raw plot text of the row labelled 0.
df['Plot'][0]

## Data Preprocessing

In [None]:
# Lowercase the plot text and replace every run of non-letter characters
# (digits, punctuation, whitespace, ...) with a single space.
# This single vectorised pass yields the same strings as substituting each
# non-letter with a space and then collapsing whitespace runs, and it avoids
# the non-raw '\s+' pattern, which is an invalid string escape that newer
# Python versions warn about.
# Missing plots are treated as empty text so one absent value does not
# abort the whole preprocessing step.
df['clean_plot'] = (
    df['Plot']
    .fillna('')
    .str.lower()
    .str.replace(r'[^a-zA-Z]+', ' ', regex=True)
)
df['clean_plot']

In [None]:
# Split every cleaned plot string into a list of word tokens with NLTK.
df['clean_plot'] = df['clean_plot'].map(nltk.word_tokenize)
df['clean_plot']

In [None]:
# Remove English stopwords and tokens shorter than three characters.
stop_words = nltk.corpus.stopwords.words('english')
# Membership is tested once per token, so look words up in a set (O(1))
# instead of scanning the stopword list each time. `stop_words` itself is
# left as the original list.
stop_word_set = set(stop_words)
plot = [
    [word for word in sentence if word not in stop_word_set and len(word) >= 3]
    for sentence in df['clean_plot']
]
plot

In [None]:
# Replace the tokenised plots with their stopword-filtered versions.
df['clean_plot'] = plot

In [None]:
# Display the clean_plot column.
df['clean_plot']

In [None]:
# Preview the DataFrame, now including the clean_plot column.
df.head()

In [None]:
# Turn the comma-separated Genre / Actors / Director strings into lists;
# only the first four actors of each movie are kept.
df['Genre'] = df['Genre'].map(lambda genres: genres.split(','))
df['Actors'] = df['Actors'].map(lambda cast: cast.split(',')[:4])
df['Director'] = df['Director'].map(lambda directors: directors.split(','))

In [None]:
# Inspect the Actors entry of the row labelled 0.
df['Actors'][0]

In [None]:
def clean(sentence):
    """Return the items of *sentence* lowercased with all spaces removed.

    Removing the spaces collapses multi-word values (for example a full
    name) into a single token.
    """
    return [word.lower().replace(' ', '') for word in sentence]

In [None]:
# Normalise every entry of the list-valued columns with clean().
df['Genre'] = list(map(clean, df['Genre']))
df['Actors'] = list(map(clean, df['Actors']))
df['Director'] = list(map(clean, df['Director']))

In [None]:
# Inspect the Actors entry of the row labelled 0 again.
df['Actors'][0]

In [None]:
# Combine the token lists of all feature columns into one space-separated
# string per movie.
columns = ['clean_plot', 'Genre', 'Actors', 'Director']
# Walk the rows positionally via zip() rather than looking values up with
# df[col][i]: that lookup is label-based and only works while the index is
# exactly 0..n-1. Each column's tokens are joined and followed by a single
# space, so the resulting strings match the previous output.
l = [
    ''.join(' '.join(tokens) + ' ' for tokens in row)
    for row in zip(*(df[col] for col in columns))
]
l

In [None]:
# Attach the combined text as clean_input and keep only the two columns
# used from here on: the title and the combined text.
df = df.assign(clean_input=l)[['Title', 'clean_input']]
df.head()

## Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Fit a TF-IDF vectorizer with default settings on the combined text and
# transform that text into the feature matrix (one row per input string).
# NOTE(review): CountVectorizer is imported but not used in the visible code.
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(df['clean_input'])

In [None]:
# Build the pairwise cosine-similarity matrix between all feature rows.
from sklearn.metrics.pairwise import cosine_similarity
# Called with a single argument, the rows are compared against themselves.
cosine_sim = cosine_similarity(features)
print(cosine_sim)

## Movie Recommendation

In [None]:
# Series of movie titles, used to find a movie's row from its title.
index = pd.Series(df['Title'])
index.head()

In [None]:
def recommend_movies(title, top_n=10):
    """Return the titles of the movies most similar to *title*.

    Similarity is read from the global ``cosine_sim`` matrix; the row of the
    requested movie is located through the global ``index`` Series and the
    result titles are taken from ``df['Title']``. All lookups are positional,
    so row ``k`` of ``cosine_sim`` is assumed to describe row ``k`` of ``df``.

    Args:
        title: Exact movie title to look up. If it occurs more than once,
            the first occurrence is used.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A list of up to ``top_n`` titles, most similar first.

    Raises:
        IndexError: If ``title`` is not present in ``index``. The exception
            type is the one the bare ``[0]`` lookup used to raise, so existing
            handlers keep working; only the message is more informative.
    """
    positions = np.flatnonzero((index == title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"movie title {title!r} not found")
    idx = int(positions[0])

    score = pd.Series(cosine_sim[idx])
    # Drop the queried movie explicitly instead of assuming it sorts first:
    # with duplicate entries or floating-point rounding another movie can tie
    # with (or exceed) the self-similarity, and skipping "position 0" would
    # then discard the wrong row and recommend the movie to itself.
    # A stable sort keeps the order of tied scores deterministic.
    ranked = score.drop(idx).sort_values(ascending=False, kind='mergesort')

    titles = df['Title']
    return [titles.iloc[i] for i in ranked.index[:top_n]]

In [None]:
# Example: recommendations for 'The Dark Knight Rises'.
recommend_movies('The Dark Knight Rises')

In [None]:
# Index label of the first entry titled 'The Dark Knight Rises'.
index[index == 'The Dark Knight Rises'].index[0]

In [None]:
# Similarity scores of row 3 against every row, highest first.
# NOTE(review): 3 is presumably the label returned by the lookup in the
# previous cell -- confirm against the actual data.
pd.Series(cosine_sim[3]).sort_values(ascending=False)

In [None]:
# Example: recommendations for 'The Shawshank Redemption'.
recommend_movies('The Shawshank Redemption')

In [None]:
# Example: recommendations for 'The Avengers'.
recommend_movies('The Avengers')

In [None]:
# Number of rows in the reduced DataFrame.
len(df)