Step 1: Data Collection and Storage

In [None]:
import pandas as pd

# Load the Netflix dataset.
# NOTE(review): latin1 is requested explicitly; presumably the CSV is not
# valid UTF-8 — confirm against the source file.
netflix_data = pd.read_csv('netflix_titles.csv', encoding='latin1')
# Display the first few rows to understand the structure
print(netflix_data.head())

# Keep only the columns used for recommendation, drop rows that lack either
# text feature, and renumber the surviving rows 0..n-1.
# Resetting the index matters: the similarity matrix built later is addressed
# by row position, so row labels must equal row positions after rows are
# dropped. Chaining (rather than inplace=True on a column selection) also
# keeps this cell idempotent and avoids mutating a derived frame in place.
netflix_data = (
    netflix_data[['title', 'listed_in', 'description']]
    .dropna(subset=['description', 'listed_in'])
    .reset_index(drop=True)
)


  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  ... Unnamed: 16  \
0  September 25, 2021          2020  PG-13     90 min  ...         NaN   
1  September 24, 2021          2021  TV-MA  2 Season

Step 2: Recommendation Algorithms

Content-Based Filtering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build one text field per title by joining its genre list and its synopsis
genre_text = netflix_data['listed_in']
synopsis_text = netflix_data['description']
netflix_data['combined_features'] = genre_text + " " + synopsis_text

# Vectorise the combined text with TF-IDF, ignoring English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(netflix_data['combined_features'])

# Pairwise cosine similarity of every title against every title
# (omitting the second argument compares the matrix with itself)
cosine_sim = cosine_similarity(tfidf_matrix)

# Function to get recommendations based on content similarity
def get_recommendations(title, cosine_sim=cosine_sim, num_recommendations=10):
    """Return the titles whose content is most similar to ``title``.

    Args:
        title: Exact title to look up in ``netflix_data['title']``.
        cosine_sim: Square similarity matrix in which row/column ``i``
            corresponds to the ``i``-th row (by position) of ``netflix_data``.
        num_recommendations: Maximum number of titles to return (default 10).

    Returns:
        pandas.Series of titles ordered from most to least similar. The
        queried row itself is never included.

    Raises:
        IndexError: If no row of ``netflix_data`` has exactly this title.
    """
    # Locate the row by POSITION, not by index label: cosine_sim is addressed
    # positionally, and labels differ from positions whenever rows were
    # dropped without resetting the index.
    matches = (netflix_data['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Title not found in dataset: {title!r}")
    idx = int(matches[0])

    # Pair every other row position with its similarity to the queried row.
    # The queried row is excluded explicitly instead of assuming it sorts
    # first, which fails when another row has identical features.
    sim_scores = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]

    # Highest similarity first (stable sort keeps dataset order on ties)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # A negative request is treated as "return nothing"
    limit = max(num_recommendations, 0)
    sim_indices = [pos for pos, _ in sim_scores[:limit]]

    return netflix_data['title'].iloc[sim_indices]

# Demo: titles closest in content to 'Breaking Bad'
recommendations = get_recommendations(title='Breaking Bad')
print(recommendations)

2606                              Extracurricular
4118                                  Iron Ladies
5352    Have You Ever Fallen in Love, Miss Jiang?
4143                                       Sparta
1559                    The Mess You Leave Behind
3855                                   The Writer
2643                                     Love 101
517                             Good Morning Call
1067                               The Underclass
4664                             Age of Rebellion
Name: title, dtype: object


Collaborative Filtering

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

# Placeholder example for ratings data (toy values, not real user ratings)
ratings_data = pd.DataFrame({
    'user_id': [1, 2, 3, 1, 2],
    'title': ['Breaking Bad', 'Stranger Things', 'Narcos', 'Black Mirror', 'Narcos'],
    'rating': [5, 4, 4, 3, 5]
})

# Keep only ratings whose title exists in the Netflix catalogue.
# The catalogue titles are de-duplicated first so a title that appears more
# than once in netflix_data cannot multiply a user's rating rows.
merged_data = pd.merge(
    ratings_data,
    netflix_data[['title']].drop_duplicates(),
    on='title',
)

# Load data into Surprise format (ratings are on a 1-5 scale)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(merged_data[['user_id', 'title', 'rating']], reader)

# Train/test split; seeded so the split is the same on every run
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Use SVD (or any other algorithm from Surprise); seeded for reproducible factors
algo = SVD(random_state=42)
algo.fit(trainset)

# Function to recommend movies to a user based on collaborative filtering
def get_collaborative_recommendations(user_id, num_recommendations=5):
    """Return the titles with the highest predicted rating for ``user_id``.

    Every unique title in ``netflix_data`` that the user has not rated in
    ``merged_data`` is scored with ``algo.predict``; the best-scoring titles
    are returned.

    Args:
        user_id: Identifier of the user, as it appears in ``merged_data['user_id']``.
        num_recommendations: Maximum number of titles to return (default 5).

    Returns:
        list of titles, highest predicted rating first. Titles with equal
        predicted ratings keep their order of appearance in ``netflix_data``.
    """
    # Titles the user has already rated, held in a set so the membership test
    # below is O(1) per title instead of a scan over an array.
    rated_titles = set(merged_data.loc[merged_data['user_id'] == user_id, 'title'])

    # Score every catalogue title the user has not rated yet
    # (``est`` is the estimated rating on the Prediction returned by Surprise).
    predictions = [
        (title, algo.predict(user_id, title).est)
        for title in netflix_data['title'].unique()
        if title not in rated_titles
    ]

    # Sort by predicted rating in descending order (stable on ties)
    predictions.sort(key=lambda pair: pair[1], reverse=True)

    # A negative request is treated as "return nothing" rather than slicing
    # from the end of the list.
    limit = max(num_recommendations, 0)
    return [title for title, _ in predictions[:limit]]

# Demo: collaborative-filtering picks for user 1
user_recommendations = get_collaborative_recommendations(1)
print("Collaborative Filtering Recommendations for User 1:")
print(user_recommendations)

Collaborative Filtering Recommendations for User 1:
['Narcos', 'Dick Johnson Is Dead', 'Blood & Water', 'Ganglands', 'Jailbirds New Orleans']


Hybrid Filtering (combination of content-based and collaborative filtering)

In [None]:
def hybrid_recommendation(user_id, title, num_recommendations=5, alpha=0.5):
    """Blend collaborative and content-based recommendations into one ranking.

    Each source list is converted to rank scores in (0, 1] (first place scores
    1, last place scores 1/len). A title's final score is
    ``alpha * collaborative_score + (1 - alpha) * content_score``; a title
    missing from a list contributes 0 for that list.

    Args:
        user_id: User passed to ``get_collaborative_recommendations``.
        title: Reference title passed to ``get_recommendations``.
        num_recommendations: Maximum number of titles to return (default 5).
        alpha: Weight of the collaborative list; ``1 - alpha`` weights the
            content-based list. 0.5 weights both equally.

    Returns:
        list of titles, best blended score first. Ties keep the order in which
        titles were first seen (collaborative list, then content-based list),
        so the result is deterministic.
    """
    # Collaborative filtering recommendations (based on user preferences)
    collab_recs = list(get_collaborative_recommendations(user_id, num_recommendations))

    # Content-based recommendations (based on title similarity)
    content_recs = list(get_recommendations(title))

    # Weighted sum of per-list rank scores. A dict preserves insertion order,
    # and the stable sort below therefore breaks ties deterministically —
    # unlike slicing a set, whose iteration order is arbitrary.
    scores = {}
    for weight, ranked in ((alpha, collab_recs), (1 - alpha, content_recs)):
        total = len(ranked)
        for rank, rec_title in enumerate(ranked):
            scores[rec_title] = scores.get(rec_title, 0.0) + weight * (total - rank) / total

    ranked_titles = sorted(scores, key=scores.get, reverse=True)

    # A negative request is treated as "return nothing"
    return ranked_titles[:max(num_recommendations, 0)]

# Demo: blended recommendations for user 1, seeded with 'Stranger Things'
print("Hybrid Recommendations for User 1 and 'Stranger Things':")
print(hybrid_recommendation(1, 'Stranger Things'))

Hybrid Recommendations for User 1 and 'Stranger Things':
['Narcos', 'Freaks', 'Manifest', 'Ganglands', 'Dick Johnson Is Dead']


Step 3: Model Evaluation

In [None]:
from surprise.accuracy import mae, rmse

# Score the held-out test set with the trained model. This must be computed
# here: no earlier cell defines a module-level `predictions`, so the cell
# would otherwise fail on a fresh kernel.
predictions = algo.test(testset)

# Calculate RMSE and MAE for the predictions (each call prints its metric;
# the MAE value is also the cell's displayed result)
rmse(predictions)
mae(predictions)

RMSE: 0.3994
MAE:  0.3994


0.39943282854131024

Step 4: Real-time Recommendations (Flask API)

In [None]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/recommend', methods=['POST'])
def recommend():
    """Return content-based recommendations for the title in the JSON body.

    Expects a JSON object of the form ``{"title": "<exact title>"}``.

    Responses:
        200: ``{"recommended_titles": [...]}``.
        400: body is not a JSON object or has no non-empty string ``title``.
        404: the title is not present in the dataset.
    """
    # silent=True yields None instead of raising on a missing or invalid JSON
    # body, so malformed requests get a JSON error rather than a stack trace.
    payload = request.get_json(silent=True)
    title = payload.get('title') if isinstance(payload, dict) else None
    if not isinstance(title, str) or not title:
        return jsonify({'error': "Request body must be JSON with a non-empty 'title' string"}), 400

    try:
        recommendations = get_recommendations(title)
    except IndexError:
        # get_recommendations raises IndexError when no row matches the title
        return jsonify({'error': f"Title not found: {title}"}), 404

    return jsonify({'recommended_titles': recommendations.tolist()})

if __name__ == '__main__':
    # Debug mode is off on purpose: it enables the interactive Werkzeug
    # debugger (which can execute arbitrary code) and the auto-reloader,
    # which restarts the process and does not work from a notebook kernel.
    app.run(debug=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


Step 5: Scalability and Performance

In [None]:
import sqlite3

# Connect to an SQLite database for long-term storage
conn = sqlite3.connect('netflix_recommendation.db')
try:
    # Store Netflix data (replaces the table if it already exists)
    netflix_data.to_sql('netflix_titles', conn, if_exists='replace', index=False)

    # Example: Query titles from the database
    query = "SELECT title FROM netflix_titles WHERE listed_in LIKE '%Action%'"
    action_titles = pd.read_sql(query, conn)
finally:
    # Always release the database handle, even if the write or query fails
    conn.close()
print(action_titles)


                              title
0                         Ganglands
1                  Bangkok Breaking
2                            Jaguar
3            Resurrection: Ertugrul
4                    The Stronghold
...                             ...
1023        XXX: State of the Union
1024          Yamla Pagla Deewana 2
1025  You Don't Mess with the Zohan
1026                    Young Tiger
1027             Parasyte: The Grey

[1028 rows x 1 columns]


Step 6: Security and Privacy

In [None]:
from cryptography.fernet import Fernet

# Create a fresh Fernet key and a cipher bound to it.
# The key exists only in this kernel's memory; it is not saved anywhere here.
key = Fernet.generate_key()
cipher_suite = Fernet(key)

# Round-trip demo: encrypt a sample payload, then decrypt it again
plaintext = b"Sensitive User Data"
encrypted_data = cipher_suite.encrypt(plaintext)
decrypted_data = cipher_suite.decrypt(encrypted_data)

print(decrypted_data)


b'Sensitive User Data'
