In [3]:
pip install scikit-surprise



In [53]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

In [59]:
# Reading .dat files with '::' separator and Latin-1 encoding
movies = pd.read_csv('movies.csv', sep='::', engine='python',
                     names=['MovieID','title','Genres'],
                     encoding='ISO-8859-1')
ratings = pd.read_csv('ratings.csv', sep='::', engine='python',
                      names=['UserID','MovieID','Rating','Timestamp'],
                      encoding='ISO-8859-1')

In [60]:
movies = pd.read_csv('/content/movies.csv', sep=';', encoding='ISO-8859-1')
ratings = pd.read_csv('/content/ratings.csv', sep=';', encoding='ISO-8859-1')
users = pd.read_csv('/content/users.csv', sep=';', encoding='ISO-8859-1')

In [41]:
print("Movies columns:", movies.columns)
print("Ratings columns:", ratings.columns)

Movies columns: Index(['movieId', 'title', 'genres', ',,'], dtype='object')
Ratings columns: Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


Isse ratings aur movies data ek saath aa jaye ga — har rating ke sath us movie ka naam aur genre bh

In [43]:
# Convert both movieId columns to string (or int, dono same ho to chalega)
ratings['movieId'] = ratings['movieId'].astype(str)
movies['movieId'] = movies['movieId'].astype(str)

# Now merge
data = pd.merge(ratings, movies, on='movieId')
print(data.head())



   userId movieId  rating  timestamp                                   title  \
0       1    1193       5  978300760  One Flew Over the Cuckoo's Nest (1975)   
1       1     661       3  978302109        James and the Giant Peach (1996)   
2       1     914       3  978301968                     My Fair Lady (1964)   
3       1    3408       4  978300275                  Erin Brockovich (2000)   
4       1    2355       5  978824291                    Bug's Life, A (1998)   

                         genres  ,,  
0                         Drama  ,,  
1  Animation|Children's|Musical  ,,  
2               Musical|Romance  ,,  
3                         Drama  ,,  
4   Animation|Children's|Comedy   ,  


CONVERT TIMESTAMP TO DATATIME

In [44]:
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
print(data[['timestamp']].head())

            timestamp
0 2000-12-31 22:12:40
1 2000-12-31 22:35:09
2 2000-12-31 22:32:48
3 2000-12-31 22:04:35
4 2001-01-06 23:38:11


In [51]:


# Create a Reader object to parse the rating data
reader = Reader(rating_scale=(1, 5))

# Load data
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Train the SVD model
model = SVD()
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model
accuracy.rmse(predictions)

# Predict a specific rating (for example, for user 1 and movie 50)
predicted_rating = model.predict(1, 50)  # User 1, Movie 50
print(predicted_rating)

RMSE: 0.8737
user: 1          item: 50         r_ui = None   est = 3.70   {'was_impossible': False}


MAKE SPECIFIC PREDICTION FOR USER AND **MOVIE**

In [61]:
# Predict a rating for user 1 and movie 50
predicted_rating = model.predict(1, 50)  # User 1, Movie 50
print(f"Predicted Rating for User 1 and Movie 50: {predicted_rating.est:.2f}")

Predicted Rating for User 1 and Movie 50: 3.70


**GENERATE TOP-N MOVIE RECOMMADATION FOR A USER**

In [62]:
# Function to get top N recommendations
def get_top_n(predictions, n=10):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Get the top 10 recommendations for user 1
top_n = get_top_n(predictions, n=10)

# Print the top 10 recommended movie IDs for user 1
print("Top 10 recommended movie IDs for user 1:")
for movie_id, rating in top_n[1]:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating:.2f}")

Top 10 recommended movie IDs for user 1:
Movie ID: 1197, Predicted Rating: 4.53
Movie ID: 2028, Predicted Rating: 4.53
Movie ID: 2355, Predicted Rating: 4.23
Movie ID: 1907, Predicted Rating: 4.07


In [63]:
# Assuming you have a 'movies' DataFrame with 'movieId' and 'title'
movie_titles = {movieId: title for movieId, title in zip(movies['movieId'], movies['title'])}

# Print top 10 movie titles for user 1
print("Top 10 recommended movies for user 1:")
for movie_id, rating in top_n[1]:
    movie_title = movie_titles.get(movie_id, "Unknown Movie")
    print(f"Movie: {movie_title}, Predicted Rating: {rating:.2f}")

Top 10 recommended movies for user 1:
Movie: Princess Bride, The (1987), Predicted Rating: 4.53
Movie: Saving Private Ryan (1998), Predicted Rating: 4.53
Movie: Bug's Life, A (1998), Predicted Rating: 4.23
Movie: Mulan (1998), Predicted Rating: 4.07


In [64]:
# Save recommendations to a DataFrame
recommendations = []
for movie_id, rating in top_n[1]:
    movie_title = movie_titles.get(movie_id, "Unknown Movie")
    recommendations.append([1, movie_title, rating])  # Assuming user 1

# Convert to DataFrame
recommendations_df = pd.DataFrame(recommendations, columns=['UserID', 'MovieTitle', 'PredictedRating'])

# Save to CSV
recommendations_df.to_csv('user_1_recommendations.csv', index=False)