# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Data Loading and Understanding

In [None]:
df = pd.read_csv('movies.csv')

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# filtering the required columns for recommendations
required_columns = ["genres","keywords","overview","title"]

df = df[required_columns]

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# check for missing values
df.info()

In [None]:
# dropping rows with missing values
# reset_index(drop=True) renumbers the remaining rows 0..n-1, so that row
# labels and row positions coincide for the rest of the notebook
df = df.dropna().reset_index(drop=True)

In [None]:
df.info()

In [None]:
df['combined'] = df['genres'] + ' ' + df['keywords'] + ' ' + df['overview']
# we will use the combined text in the recommender system

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
# Take an independent copy: a 'cleaned_text' column is assigned into `data`
# further down, and writing into a slice of `df` would raise pandas'
# SettingWithCopyWarning (chained assignment).
data = df[['title', 'combined']].copy()

In [None]:
data.head()

In [None]:
data.shape

In [None]:
# wordcloud from movie content
combined_text = ' '.join(df['combined'])
wordcloud = WordCloud(width=800, height = 400, background_color='white').generate(combined_text)

In [None]:
# word cloud to visualize the most common words in the movie content
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most common words in movie content')
plt.show()

In [None]:
# download nltk data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


In [None]:
stop_words = set(stopwords.words('english'))


In [None]:
def process_text(text):
    """Normalise free text for vectorizing: letters only, lowercase, no stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.

    Notes
    -----
    Uses the module-level ``stop_words`` set and NLTK's ``word_tokenize``.
    """
    # Replace every non-letter with a space (rather than deleting it) so that
    # neighbouring words are not fused together: "sci-fi" -> "sci fi" instead
    # of "scifi", and "don't" -> "don t" (both pieces are in NLTK's English
    # stop-word list) instead of "dont", which would slip past the filter.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # convert to lowercase so the stop-word comparison is case-insensitive
    text = text.lower()
    # tokenize the text
    tokens = word_tokenize(text)
    # remove stop words and rebuild a single string
    return ' '.join(word for word in tokens if word not in stop_words)

In [None]:
# applying the text processing function to the combined column
data['cleaned_text'] = data['combined'].apply(process_text)

In [None]:
data.head()

In [None]:
# vectorizing with TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_text'])

In [None]:
# compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
# recommendation function
def recommend_movies(movie_name, cosine_sim=cosine_sim, df=data, top_n=10):
    """Return the titles of the ``top_n`` movies most similar to ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Title to look up; compared case-insensitively against ``df['title']``.
        If several rows match, the first one is used.
    cosine_sim : 2-D array
        Pairwise similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row of ``df``.
    df : pandas.DataFrame
        Frame with a ``title`` column.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series or str
        Titles ordered from most to least similar, or the string
        ``"Movie Not Found"`` when no title matches.

    Notes
    -----
    The defaults for ``cosine_sim`` and ``df`` are captured when this
    definition is executed; re-run it after rebuilding either object.
    """
    # finding the row(s) whose title matches, ignoring case
    title_matches = (df['title'].str.lower() == movie_name.lower()).to_numpy()
    positions = np.flatnonzero(title_matches)
    if positions.size == 0:
        return "Movie Not Found"
    # Use the row *position*, not the index label: cosine_sim is addressed
    # positionally, so this stays correct even if df's index is not 0..n-1.
    query_pos = int(positions[0])

    # similarity of the query movie to every movie
    scores = np.asarray(cosine_sim[query_pos]).ravel()
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly instead of assuming it ranks first;
    # a duplicate entry or an all-zero vector would otherwise let the movie
    # be recommended to itself.
    ranked = ranked[ranked != query_pos][:max(top_n, 0)]

    # return the top n most similar movies
    return df['title'].iloc[ranked]

In [None]:
data["title"]

In [None]:
row_index = data[data['title'] == 'The Dark Knight Rises'].index
print(row_index)

In [None]:
movie_name = data['title'][9]
print(movie_name)

In [None]:
# Example usage of the recommendation function
print(f"Movies recommended for '{movie_name}':")
recommended_movies = recommend_movies(movie_name)
print(recommended_movies)