In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
import string

# Load the data
data = pd.read_csv("netflixData.csv")

# Data cleaning and preprocessing
data = data[["Title", "Description", "Genres", "Content Type"]]
data.dropna(inplace=True)

# Using NLTK's SnowballStemmer and stopwords
stemmer = nltk.SnowballStemmer("english")
stopwords_set = set(nltk.corpus.stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopwords_set]
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    return " ".join(text)

data["Genres"] = data["Genres"].apply(clean_text)
data["Description"] = data["Description"].apply(clean_text)

# Creating the TF-IDF matrix for the cleaned "Genres" column
tfidf_vectorizer = text.TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(data["Genres"])

# Calculating cosine similarity
similarity_matrix = cosine_similarity(tfidf_matrix)

# Creating a function for movie recommendation
def netflix_recommendation(title, similarity=similarity_matrix):
    if title not in data["Title"].values:
        return "Title not found in the dataset."

    indices = pd.Series(data.index, index=data['Title']).drop_duplicates()
    idx = indices[title]
    similarity_scores = list(enumerate(similarity[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[1:11]  # Excluding the title itself
    movie_indices = [i[0] for i in similarity_scores]
    return data['Title'].iloc[movie_indices]

# Test the recommendation function
print(netflix_recommendation("Greenleaf"))


1420                Dynasty
1535                   Evil
1851                Godless
1901              Greenleaf
1925                  Gypsy
1942                Halston
1943    Halt and Catch Fire
2008              Heartland
2088              Hollywood
2566             Knightfall
Name: Title, dtype: object
