In [4]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os

# Download NLTK data for sentiment analysis
nltk.download('vader_lexicon')

# Set data path
data_path = r'C:\Users\jahna\Movie_Recommendations\data\movielens-20m-dataset'

# Load datasets
movies = pd.read_csv(os.path.join(data_path, 'movie.csv'))
ratings = pd.read_csv(os.path.join(data_path, 'rating.csv'))
tags = pd.read_csv(os.path.join(data_path, 'tag.csv'))
genome_scores = pd.read_csv(os.path.join(data_path, 'genome_scores.csv'))
genome_tags = pd.read_csv(os.path.join(data_path, 'genome_tags.csv'))

# Sample a subset for faster processing (optional, remove for full dataset)
ratings = ratings.sample(n=1000000, random_state=42)  # Sample 1M ratings
movies = movies[movies['movieId'].isin(ratings['movieId'].unique())]
tags = tags[tags['movieId'].isin(ratings['movieId'].unique())]
genome_scores = genome_scores[genome_scores['movieId'].isin(ratings['movieId'].unique())]

# Clean data
# Remove movies with missing titles
movies = movies.dropna(subset=['title'])

# Process genres
movies['genres'] = movies['genres'].replace('|', ' ', regex=True)
movies['genres'] = movies['genres'].replace('(no genres listed)', '')

# Process tags
# Aggregate user tags per movie
movie_tags = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
movies = movies.merge(movie_tags, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')

# Combine genres and tags for content-based filtering
movies['content'] = movies['genres'] + ' ' + movies['tag']

# Merge ratings with movie titles
ratings = ratings.merge(movies[['movieId', 'title', 'content']], left_on='movieId', right_on='movieId', how='left')

# Display data
print("Ratings shape:", ratings.shape)
print("Movies shape:", movies.shape)
print("Tags shape:", tags.shape)
print("Genome scores shape:", genome_scores.shape)
print(ratings.head())

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jahna\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Ratings shape: (1000000, 6)
Movies shape: (15374, 5)
Tags shape: (446487, 4)
Genome scores shape: (11489808, 3)
   userId  movieId  rating            timestamp  \
0  122270     8360     3.5  2012-04-22 01:07:04   
1   49018       32     2.0  2001-09-11 07:50:36   
2   89527   109374     3.5  2015-01-06 09:26:40   
3  106704     1060     3.0  2000-01-22 21:27:57   
4   47791     1732     2.0  2006-01-19 15:48:23   

                                       title  \
0                             Shrek 2 (2004)   
1  Twelve Monkeys (a.k.a. 12 Monkeys) (1995)   
2           Grand Budapest Hotel, The (2014)   
3                            Swingers (1996)   
4                   Big Lebowski, The (1998)   

                                             content  
0   A d v e n t u r e | A n i m a t i o n | C h i...  
1   M y s t e r y | S c i - F i | T h r i l l e r...  
2   C o m e d y | D r a m a  amazing storytelling...  
3   C o m e d y | D r a m a  funny Vince Vaughn f...  
4   C o m e d y |

In [5]:
# Prepare data for Surprise
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train SVD model
svd = SVD(n_factors=100, random_state=42)
svd.fit(trainset)

# Function to get collaborative filtering predictions
def get_collaborative_recommendations(user_id, n=5):
    # Get all movie IDs
    movie_ids = movies['movieId'].unique()
    # Predict ratings for all movies
    predictions = [svd.predict(user_id, movie_id) for movie_id in movie_ids]
    # Sort by predicted rating
    predictions.sort(key=lambda x: x.est, reverse=True)
    # Get top N movie IDs
    top_movie_ids = [pred.iid for pred in predictions[:n]]
    # Get movie titles
    top_movies = movies[movies['movieId'].isin(top_movie_ids)][['movieId', 'title', 'content']]
    return top_movies

# Test collaborative filtering
user_id = ratings['userId'].iloc[0]  # Example user
collab_recs = get_collaborative_recommendations(user_id)
print(f"Collaborative Filtering Recommendations for User {user_id}:")
print(collab_recs)

Collaborative Filtering Recommendations for User 122270:
      movieId                             title  \
312       318  Shawshank Redemption, The (1994)   
1029     1069           Murder, My Sweet (1944)   
1101     1147         When We Were Kings (1996)   
1171     1221    Godfather: Part II, The (1974)   
2426     2571                Matrix, The (1999)   

                                                content  
312    C r i m e | D r a m a  friendship masterplan ...  
1029   C r i m e | F i l m - N o i r | T h r i l l e...  
1101   D o c u m e n t a r y  character based on rea...  
1171   C r i m e | D r a m a  complex characters maf...  
2426   A c t i o n | S c i - F i | T h r i l l e r  ...  


In [6]:
# Create TF-IDF matrix for content (genres + tags)
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(movies['content'])

# Compute cosine similarity between movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get content-based recommendations
def get_content_recommendations(title, n=5):
    # Get movie index
    idx = movies[movies['title'] == title].index
    if len(idx) == 0:
        return pd.DataFrame()
    idx = idx[0]
    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort by similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get top N similar movies
    sim_scores = sim_scores[1:n+1]  # Exclude the movie itself
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices][['movieId', 'title', 'content']]

# Test content-based filtering
movie_title = movies['title'].iloc[0]  # Example movie
content_recs = get_content_recommendations(movie_title)
print(f"Content-Based Recommendations for {movie_title}:")
print(content_recs)

Content-Based Recommendations for Toy Story (1995):
      movieId                  title  \
2954     3114     Toy Story 2 (1999)   
2214     2355   Bug's Life, A (1998)   
4650     4886  Monsters, Inc. (2001)   
4964     5218         Ice Age (2002)   
6024     6377    Finding Nemo (2003)   

                                                content  
2954   A d v e n t u r e | A n i m a t i o n | C h i...  
2214   A d v e n t u r e | A n i m a t i o n | C h i...  
4650   A d v e n t u r e | A n i m a t i o n | C h i...  
4964   A d v e n t u r e | A n i m a t i o n | C h i...  
6024   A d v e n t u r e | A n i m a t i o n | C h i...  


In [7]:
# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Compute sentiment for tags
tags['sentiment'] = tags['tag'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

# Aggregate sentiment per movie
movie_sentiment = tags.groupby('movieId')['sentiment'].mean().reset_index()

# Function to filter recommendations by sentiment
def filter_by_sentiment(recommendations, min_sentiment=0.1):
    # Merge recommendations with sentiment
    recs_with_sentiment = recommendations.merge(movie_sentiment, on='movieId', how='left')
    # Fill missing sentiment with neutral (0)
    recs_with_sentiment['sentiment'] = recs_with_sentiment['sentiment'].fillna(0)
    # Filter by positive sentiment
    return recs_with_sentiment[recs_with_sentiment['sentiment'] >= min_sentiment]

# Test sentiment filtering on collaborative recommendations
collab_recs_sentiment = filter_by_sentiment(collab_recs)
print("Collaborative Recommendations with Positive Sentiment:")
print(collab_recs_sentiment)

Collaborative Recommendations with Positive Sentiment:
Empty DataFrame
Columns: [movieId, title, content, sentiment]
Index: []


In [8]:
# Function for hybrid recommendations
def get_hybrid_recommendations(user_id, movie_title, n=5, use_sentiment=True):
    # Get collaborative recommendations
    collab_recs = get_collaborative_recommendations(user_id, n=10)
    # Get content-based recommendations
    content_recs = get_content_recommendations(movie_title, n=10)
    # Combine recommendations
    combined = pd.concat([collab_recs, content_recs]).drop_duplicates(subset=['movieId'])
    # Apply sentiment filtering if enabled
    if use_sentiment:
        combined = filter_by_sentiment(combined)
    # Return top N
    return combined.head(n)

# Test hybrid recommendations
user_id = ratings['userId'].iloc[0]
movie_title = movies['title'].iloc[0]
hybrid_recs = get_hybrid_recommendations(user_id, movie_title)
print(f"Hybrid Recommendations for User {user_id} and Movie {movie_title}:")
print(hybrid_recs)

Hybrid Recommendations for User 122270 and Movie Toy Story (1995):
    movieId                           title  \
1       898  Philadelphia Story, The (1940)   
12     4886           Monsters, Inc. (2001)   

                                              content  sentiment  
1    C o m e d y | D r a m a | R o m a n c e  scre...   0.193295  
12   A d v e n t u r e | A n i m a t i o n | C h i...   0.136745  


In [2]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp311-cp311-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------- ----- 1.3/1.5 MB 9.5 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 8.0 MB/s eta 0:00:00
Downloading regex-2024.11.6-cp311-cp311-win_amd64.whl (274 kB)
Downloading click-8.2.1-py3-none-any.whl (102 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk

   ---------------------------------------- 0/4 [tqdm]
   ---------------------------------------- 0/4 [tqdm]
   ---------- --------------------

In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---- ----------------------------------- 1.3/11.6 MB 9.6 MB/s eta 0:00:02
   ------------ --------------------------- 3.7/11.6 MB 9.9 MB/s eta 0:00:01
   ------------------ --------------------- 5.5/11.6 MB 9.1 MB/s eta 0:00:01
   --------------------- ------------------ 6.3/11.6 MB 7.6 MB/s eta 0:00:01
   ----------------------- ---------------- 6.8/11.6 MB 6.9 MB/s eta 0:00:01
   -------------------------- ------------- 7.6/11.6 MB 6.2 MB/s eta 0:00:01


In [2]:
!pip install numpy scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   ------------- -------------------------- 3.7/11.1 MB 14.6 MB/s eta 0:00:01
   ----------------- ---------------------- 5.0/11.1 MB 10.8 MB/s eta 0:00:01
   -------------------- ------------------- 5.8/11.1 MB 8.8 MB/s eta 0:00:01
   ----------------------- ---------------- 6.6/11.1 MB 7.2 MB/s eta 0:00:01
   ------------------------- -----------

In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml): started
  Building wheel for scikit-surprise (pyproject.toml): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-win_amd64.whl size=1293198 sha256=07e152d397391b18eeef4e4125259d39b7db333e922207ca758547d5eb3472cf
  Stored in directory: c:\users\jahna\appdata\local\pip\cache\wheels\2a\8f\6e\7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surpris

In [4]:
!pip install pandas numpy scikit-learn scikit-surprise

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml): started
  Building wheel for scikit-surprise (pyproject.toml): finished with status 'error'
Failed to build scikit-surprise


  error: subprocess-exited-with-error
  
  Building wheel for scikit-surprise (pyproject.toml) did not run successfully.
  exit code: 1
  
  [155 lines of output]
  !!
  
          ********************************************************************************
          Please use a simple string containing a SPDX expression for `project.license`. You can also use `project.license-files`. (Both options available on setuptools>=77.0.0).
  
          By 2026-Feb-18, you need to update your project and remove deprecated calls
          or your builds will no longer be supported.
  
          See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.
          ********************************************************************************
  
  !!
    corresp(dist, value, root_dir)
  !!
  
          ********************************************************************************
          Please consider removing the following classifiers in favor of a 

In [2]:
!pip list


Package                   Version
------------------------- --------------
anyio                     4.9.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 3.0.0
async-lru                 2.0.5
attrs                     25.3.0
babel                     2.17.0
beautifulsoup4            4.13.4
bleach                    6.2.0
certifi                   2025.4.26
cffi                      1.17.1
charset-normalizer        3.4.2
colorama                  0.4.6
comm                      0.2.2
debugpy                   1.8.14
decorator                 5.2.1
defusedxml                0.7.1
executing                 2.2.0
fastjsonschema            2.21.1
fqdn                      1.5.1
h11                       0.16.0
httpcore                  1.0.9
httpx                     0.28.1
idna                      3.10
ipykernel                 6.29.5
ipython                   9.2.0
ipython_pygments_lexers   1.1.1
ipywidgets     

In [4]:
!pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml): started
  Building wheel for scikit-surprise (pyproject.toml): finished with status 'error'
Failed to build scikit-surprise


  error: subprocess-exited-with-error
  
  Building wheel for scikit-surprise (pyproject.toml) did not run successfully.
  exit code: 1
  
  [155 lines of output]
  !!
  
          ********************************************************************************
          Please use a simple string containing a SPDX expression for `project.license`. You can also use `project.license-files`. (Both options available on setuptools>=77.0.0).
  
          By 2026-Feb-18, you need to update your project and remove deprecated calls
          or your builds will no longer be supported.
  
          See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.
          ********************************************************************************
  
  !!
    corresp(dist, value, root_dir)
  !!
  
          ********************************************************************************
          Please consider removing the following classifiers in favor of a 

In [4]:
!pip install "numpy<2" --force-reinstall


Collecting numpy<2
  Downloading numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp311-cp311-win_amd64.whl (15.8 MB)
   ---------------------------------------- 0.0/15.8 MB ? eta -:--:--
   ----- ---------------------------------- 2.1/15.8 MB 16.8 MB/s eta 0:00:01
   ----------- ---------------------------- 4.7/15.8 MB 15.0 MB/s eta 0:00:01
   ------------- -------------------------- 5.5/15.8 MB 10.2 MB/s eta 0:00:02
   --------------- ------------------------ 6.0/15.8 MB 8.4 MB/s eta 0:00:02
   ----------------- ---------------------- 6.8/15.8 MB 7.6 MB/s eta 0:00:02
   ------------------- -------------------- 7.9/15.8 MB 6.7 MB/s eta 0:00:02
   --------------------- ------------------ 8.4/15.8 MB 6.2 MB/s eta 0:00:02
   ----------------------- ---------------- 9.2/15.8 MB 5.8 MB/s eta 0:00:02
   ------------------------- -------------- 10.0/15.8 MB 5.5 MB/s eta 0:00:02
   --------------------------- ------------ 10.7/15.8 MB 5.4 MB/s eta 0:00:01
   --

  You can safely remove it manually.
  You can safely remove it manually.


In [5]:
!pip install scikit-surprise --force-reinstall

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4-cp311-cp311-win_amd64.whl
Collecting joblib>=1.2.0 (from scikit-surprise)
  Using cached joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting numpy>=1.19.5 (from scikit-surprise)
  Using cached numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting scipy>=1.6.0 (from scikit-surprise)
  Using cached scipy-1.15.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Using cached joblib-1.5.0-py3-none-any.whl (307 kB)
Using cached numpy-2.2.6-cp311-cp311-win_amd64.whl (12.9 MB)
Using cached scipy-1.15.3-cp311-cp311-win_amd64.whl (41.2 MB)
Installing collected packages: numpy, joblib, scipy, scikit-surprise

  Attempting uninstall: numpy

    Found existing installation: numpy 1.26.4

   ---------------------------------------- 0/4 [numpy]
    Uninstalling numpy-1.26.4:
   ---------------------------------------- 0/4 [numpy]
   ---------------------------------------- 0/4 [numpy]
   --------------------------------------

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset
print("All imports successful!")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\jahna\Movie_Recommendations\venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\jahna\Movie_Recommendations\venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\jahna\Movie_Recommendations\venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.sta

ImportError: numpy.core.multiarray failed to import (auto-generated because you didn't call 'numpy.import_array()' after cimporting numpy; use '<void>numpy._import_array' to disable if you are certain you don't need it).

In [2]:
!pip uninstall numpy scikit-surprise -y
!pip install "numpy<2"

Found existing installation: numpy 2.2.6
Uninstalling numpy-2.2.6:
  Successfully uninstalled numpy-2.2.6
Found existing installation: scikit-surprise 1.1.4
Uninstalling scikit-surprise-1.1.4:
  Successfully uninstalled scikit-surprise-1.1.4


You can safely remove it manually.
You can safely remove it manually.
You can safely remove it manually.


Collecting numpy<2
  Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl (15.8 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.4


In [3]:
!pip install scikit-surprise --no-deps

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4-cp311-cp311-win_amd64.whl
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [4]:
!pip install scikit-surprise



In [9]:
!python -c "import numpy as np; print(np.__version__); from surprise import Dataset; print('Success!')"


1.26.4
Success!
