<a href="https://colab.research.google.com/github/ANIL-BUSSA/LLINTERN/blob/main/MovieRecomendationSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install scikit-surprise pandas

# Import libraries
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load the MovieLens-100k dataset
url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(url, sep='\t', names=columns)

# Load the movie titles
url_movies = "http://files.grouplens.org/datasets/movielens/ml-100k/u.item"
columns_movies = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown',
                  'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
                  'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
                  'Sci-Fi', 'Thriller', 'War', 'Western']
df_movies = pd.read_csv(url_movies, sep='|', names=columns_movies, encoding='latin-1')

# Define the format of the dataset
reader = Reader(rating_scale=(1, 5))

# Load the dataset into Surprise
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

# Split the dataset into training and test set
trainset, testset = train_test_split(data, test_size=0.2)

# Use the SVD algorithm
model = SVD()

# Train the model
model.fit(trainset)

# Predict ratings for the test set
predictions = model.test(testset)

# Compute the RMSE (Root Mean Squared Error)
rmse = accuracy.rmse(predictions)

# Function to get top N recommendations for a given user including movie titles
def get_top_n_recommendations(user_id, model, n=10):
    # Get a list of all item ids
    all_items = df['item_id'].unique()

    # Predict ratings for all items for the given user
    user_predictions = [model.predict(user_id, item_id) for item_id in all_items]

    # Sort predictions by estimated rating
    user_predictions.sort(key=lambda x: x.est, reverse=True)

    # Get the top N items
    top_n_items = [pred.iid for pred in user_predictions[:n]]

    # Get the movie titles for the top N items
    top_n_movies = df_movies[df_movies['movie_id'].isin(top_n_items)][['movie_id', 'title']]

    return top_n_movies

# Get top 10 recommendations for a user (e.g., user_id = 1)
user_id = 1
top_n_recommendations = get_top_n_recommendations(user_id, model, n=20)
print(f"Top 10 movie recommendations for user {user_id}:")
print(top_n_recommendations)


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357236 sha256=c897d8b769a5dac67b2dfa1088a2aed5c6e3272fc111d6eee5b1395987601522
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4
RMSE: 0.9321
Top 10 movie recommendations for user 1:
     movie_id        

In [2]:
!pip install scikit-surprise pandas




In [3]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load the MovieLens-100k dataset
url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(url, sep='\t', names=columns)

# Load the movie titles
url_movies = "http://files.grouplens.org/datasets/movielens/ml-100k/u.item"
columns_movies = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown',
                  'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
                  'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
                  'Sci-Fi', 'Thriller', 'War', 'Western']
df_movies = pd.read_csv(url_movies, sep='|', names=columns_movies, encoding='latin-1')

# Check the data
df.head(), df_movies.head()


(   user_id  item_id  rating  timestamp
 0      196      242       3  881250949
 1      186      302       3  891717742
 2       22      377       1  878887116
 3      244       51       2  880606923
 4      166      346       1  886397596,
    movie_id              title release_date  video_release_date  \
 0         1   Toy Story (1995)  01-Jan-1995                 NaN   
 1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
 2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
 3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
 4         5     Copycat (1995)  01-Jan-1995                 NaN   
 
                                             IMDb_URL  unknown  Action  \
 0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
 1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
 2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
 3  http://us.imdb.com/M/title-exact?Get%20Shorty%...

In [4]:
# Define the format of the dataset
reader = Reader(rating_scale=(1, 5))

# Load the dataset into Surprise
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)


In [5]:
# Split the dataset into training and test set
trainset, testset = train_test_split(data, test_size=0.2)


In [6]:
# Use the SVD algorithm
model = SVD()

# Train the model
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f27c0bc47c0>

In [7]:
# Predict ratings for the test set
predictions = model.test(testset)

# Compute the RMSE (Root Mean Squared Error)
rmse = accuracy.rmse(predictions)


RMSE: 0.9389


In [8]:
# Function to get top N recommendations for a given user including movie titles
def get_top_n_recommendations(user_id, model, n=10):
    # Get a list of all item ids
    all_items = df['item_id'].unique()

    # Predict ratings for all items for the given user
    user_predictions = [model.predict(user_id, item_id) for item_id in all_items]

    # Sort predictions by estimated rating
    user_predictions.sort(key=lambda x: x.est, reverse=True)

    # Get the top N items
    top_n_items = [pred.iid for pred in user_predictions[:n]]

    # Get the movie titles for the top N items
    top_n_movies = df_movies[df_movies['movie_id'].isin(top_n_items)][['movie_id', 'title']]

    return top_n_movies

# Get top 10 recommendations for a user (e.g., user_id = 1)
user_id = 1
top_n_recommendations = get_top_n_recommendations(user_id, model, n=15)
print(f"Top 10 movie recommendations for user {user_id}:")
print(top_n_recommendations)


Top 10 movie recommendations for user 1:
     movie_id                                   title
11         12              Usual Suspects, The (1995)
49         50                        Star Wars (1977)
63         64        Shawshank Redemption, The (1994)
99        100                            Fargo (1996)
131       132                Wizard of Oz, The (1939)
168       169              Wrong Trousers, The (1993)
171       172         Empire Strikes Back, The (1980)
180       181               Return of the Jedi (1983)
222       223                      Sling Blade (1996)
245       246                      Chasing Amy (1997)
284       285                   Secrets & Lies (1996)
356       357  One Flew Over the Cuckoo's Nest (1975)
426       427            To Kill a Mockingbird (1962)
482       483                       Casablanca (1942)
646       647                              Ran (1985)
