In [2]:
import pandas as pd

In [3]:
import sklearn

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Load the Netflix data

In [7]:
# Read the catalogue CSV; the path is relative, so it is resolved against the
# notebook's current working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column and literal 'NAN'
# strings, so the file may carry a saved index column and text-encoded missing
# values rather than real NaN — confirm against the raw file.
df = pd.read_csv('netflix_data.1.0.csv')

In [8]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,duration,year_added,month_added,day_added
0,S1,MOVIE,DICK JOHNSON IS DEAD,KIRSTEN JOHNSON,NAN,UNITED STATES,2020,13,90.0,2021.0,9.0,25.0
1,S2,TV SHOW,BLOOD & WATER,NAN,"AMA QAMATA, KHOSI NGEMA, GAIL MABALANE, THABAN...",SOUTH AFRICA,2021,18,2.0,2021.0,9.0,24.0
2,S3,TV SHOW,GANGLANDS,JULIEN LECLERCQ,"SAMI BOUAJILA, TRACY GOTOAS, SAMUEL JOUY, NABI...",NAN,2021,18,1.0,2021.0,9.0,24.0
3,S4,TV SHOW,JAILBIRDS NEW ORLEANS,NAN,NAN,NAN,2021,18,1.0,2021.0,9.0,24.0
4,S5,TV SHOW,KOTA FACTORY,NAN,"MAYUR MORE, JITENDRA KUMAR, RANJAN RAJ, ALAM K...",INDIA,2021,18,2.0,2021.0,9.0,24.0


In [9]:
# Fill missing titles with an empty string and missing ratings with the mean rating

In [10]:
# Replace missing (NaN) titles with '' — this column is later passed to the
# TF-IDF vectorizer.
# NOTE(review): fillna only replaces real NaN values; literal 'NAN' strings like
# those visible in other columns of the preview would be left as-is — verify
# whether the title column contains any.
df['title'] = df['title'].fillna('')

In [11]:
# Impute missing ratings with the mean of the ratings that are present.
mean_rating = df['rating'].mean()
df['rating'] = df['rating'].fillna(mean_rating)

In [12]:
# Create a TF-IDF Vectorizer for the title column

In [13]:
# TF-IDF vectorizer configured to drop scikit-learn's built-in English stop words.
# NOTE(review): a title made up only of stop words would presumably end up as an
# all-zero vector (similarity 0 to every title) — check whether such titles exist.
tfidf = TfidfVectorizer(stop_words='english')

In [14]:
# Learn the vocabulary from the titles and build the TF-IDF matrix
# (one row per entry of df['title'], in the same order).
tfidf_matrix = tfidf.fit_transform(df['title'])

In [15]:
# Compute the cosine similarity matrix

In [16]:
# Cosine similarity between every pair of title vectors. Called with a single
# argument, cosine_similarity compares the matrix against itself.
# NOTE: the result has one row and one column per title, so its size grows
# quadratically with the number of titles.
cosine_sim = cosine_similarity(tfidf_matrix)


In [17]:
# Function to get recommendations based on title and rating
def get_recommendations(title, rating_threshold=0, top_n=10):
    """Return the titles most similar to ``title`` that meet a minimum rating.

    The query is matched against ``df['title']`` as a case-insensitive,
    literal substring (not a regular expression); the first matching row is
    used as the reference. All rows are ranked by their cosine similarity to
    that reference, rows whose rating is below ``rating_threshold`` are
    dropped, and the best ``top_n`` titles are returned. The reference row is
    part of the ranking, so it normally appears first in the result.

    Reads the notebook-level ``df`` (needs 'title' and 'rating' columns) and
    ``cosine_sim`` (row ``i`` holds the similarities of the ``i``-th row of
    ``df`` to every row, as a NumPy array).

    Parameters
    ----------
    title : str
        Text to look for inside the stored titles.
    rating_threshold : number, default 0
        Minimum rating (inclusive) a title must have to be returned.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar, labelled with their
        index labels from ``df``.

    Raises
    ------
    IndexError
        If no stored title contains ``title``.
    """
    # regex=False: titles can contain characters such as ( ) + ? [ that would
    # otherwise be interpreted as regex syntax and mis-match or raise.
    # na=False: a missing title simply counts as "no match".
    is_match = df['title'].str.contains(title, case=False, regex=False, na=False)
    match_positions = is_match.to_numpy(dtype=bool).nonzero()[0]
    if match_positions.size == 0:
        raise IndexError(f"no title in the dataset contains {title!r}")

    # Use the row *position*, not the index label: cosine_sim and .iloc are
    # both positional, so this stays correct even if df has a non-default index.
    ref_pos = match_positions[0]
    scores = cosine_sim[ref_pos]

    # Descending by similarity; the stable sort keeps tied rows in their
    # original order.
    ranked_positions = (-scores).argsort(kind='stable')
    ranked = df.iloc[ranked_positions]

    # Filter on rating after ranking so the similarity order is preserved.
    meets_rating = ranked['rating'] >= rating_threshold
    return ranked.loc[meets_rating, 'title'].head(top_n)

In [18]:
# Example usage

In [19]:
# Titles similar to 'LADY IN THE WATER', keeping only rows whose rating is >= 10.
recommendations = get_recommendations('LADY IN THE WATER', rating_threshold=10)

In [20]:
# Print the ranked titles; the left-hand column is each title's row label in df.
print(recommendations)

7262                 LADY IN THE WATER
2769    THERE'S SOMETHING IN THE WATER
3115                    THE FIRST LADY
4034                            LADY J
1                        BLOOD & WATER
6587                        DEEP WATER
8369                     THE IRON LADY
8316               THE FUTURE OF WATER
2430                         LADY BIRD
5605               THE MEMORY OF WATER
Name: title, dtype: object


In [21]:
# Titles similar to 'LADY BIRD', keeping only rows whose rating is >= 9.
recommendations = get_recommendations('LADY BIRD', rating_threshold=9)
print(recommendations)

2430            LADY BIRD
3115       THE FIRST LADY
4034               LADY J
7561        NATIONAL BIRD
7262    LADY IN THE WATER
4268             BIRD BOX
8369        THE IRON LADY
3264      EARTHQUAKE BIRD
5498        A YELLOW BIRD
4120     HIGH FLYING BIRD
Name: title, dtype: object


In [23]:
# Titles similar to 'THE MEMORY OF WATER', keeping only rows whose rating is >= 9.
recommendations = get_recommendations('THE MEMORY OF WATER', rating_threshold=9)
print(recommendations)

5605               THE MEMORY OF WATER
4179                       MEMORY LOVE
2769    THERE'S SOMETHING IN THE WATER
7441                      MEMORY GAMES
1                        BLOOD & WATER
7262                 LADY IN THE WATER
6587                        DEEP WATER
8316               THE FUTURE OF WATER
8552                 THE WATER DIVINER
6956                HELL OR HIGH WATER
Name: title, dtype: object
