### Import necessary libraries

In [None]:
import pandas as pd

# 1. Data Collection and Manipulation

## 1.1 Automatic Download of Dataset

In [None]:
!wget -q http://files.grouplens.org/datasets/movielens/ml-1m.zip

In [None]:
!unzip -q ml-1m.zip

## 1.2 Save to csv files

In [None]:
# Parsing options shared by all three .dat files: '::' field separator (a
# multi-character separator, hence the python engine), no header row, and
# ISO-8859-1 text encoding.
dat_read_options = dict(sep='::', engine='python', header=None, encoding='ISO-8859-1')

# Users: parse ml-1m/users.dat and export it as users.csv
users = pd.read_csv(
    'ml-1m/users.dat',
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    **dat_read_options,
)
users.to_csv('users.csv', index=False)

# Movies: parse ml-1m/movies.dat and export it as movies.csv
movies = pd.read_csv(
    'ml-1m/movies.dat',
    names=['MovieID', 'Title', 'Genres'],
    **dat_read_options,
)
movies.to_csv('movies.csv', index=False)

# Ratings: parse ml-1m/ratings.dat and export it as ratings.csv
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **dat_read_options,
)
ratings.to_csv('ratings.csv', index=False)

## 1.3 Print the first few rows of each dataframe

In [None]:
# Preview the first rows of each dataframe in turn:
#   users   - user ID and demographics
#   movies  - movie ID, title and genres
#   ratings - one row per user rating of a movie
head_previews = [("users", users), ("movies", movies), ("ratings", ratings)]

for position, (label, frame) in enumerate(head_previews):
    if position > 0:
        print("\n")  # Blank lines between consecutive previews
    print(f"First few rows of the {label} dataframe:")
    print(frame.head())

First few rows of the users dataframe:
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455


First few rows of the movies dataframe:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


First few rows of the ratings dataframe:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3   

## 1.4 Print the column names of each dataframe

In [None]:
# List the column index of each dataframe: user fields, movie attributes
# (titles, genres), and the rating fields that link users to movies.
frames_by_name = {"users": users, "movies": movies, "ratings": ratings}
final_name = list(frames_by_name)[-1]

for name, frame in frames_by_name.items():
    print(f"Column names in the {name} dataframe:")
    print(frame.columns)
    if name != final_name:
        print("\n")  # Blank lines between sections, none after the last one

Column names in the users dataframe:
Index(['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], dtype='object')


Column names in the movies dataframe:
Index(['MovieID', 'Title', 'Genres'], dtype='object')


Column names in the ratings dataframe:
Index(['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype='object')


## 1.5 Checking for missing values in each dataframe

In [None]:
# Report the per-column count of missing values for every dataframe.
null_reports = (
    ("users", users),
    ("movies", movies),
    ("ratings", ratings),
)

for index, (name, frame) in enumerate(null_reports, start=1):
    print("Missing values in the {} dataframe:".format(name))
    print(frame.isnull().sum())
    if index < len(null_reports):
        print("\n")  # Separate the reports; nothing is printed after the last one

Missing values in the users dataframe:
UserID        0
Gender        0
Age           0
Occupation    0
Zip-code      0
dtype: int64


Missing values in the movies dataframe:
MovieID    0
Title      0
Genres     0
dtype: int64


Missing values in the ratings dataframe:
UserID       0
MovieID      0
Rating       0
Timestamp    0
dtype: int64


## 1.6 Convert the 'Timestamp' column in the ratings dataframe from Unix time to datetime format.

In [None]:
# Interpret the stored integers as seconds since the Unix epoch and overwrite
# the column with the resulting datetimes, which are easier to read and to use
# in later analysis.
rating_times = pd.to_datetime(ratings['Timestamp'], unit='s')
ratings['Timestamp'] = rating_times

# Show the leading rows to confirm that 'Timestamp' now holds datetimes.
print("First few rows of the ratings dataframe after converting 'Timestamp':")
print(ratings.head())

First few rows of the ratings dataframe after converting 'Timestamp':
   UserID  MovieID  Rating           Timestamp
0       1     1193       5 2000-12-31 22:12:40
1       1      661       3 2000-12-31 22:35:09
2       1      914       3 2000-12-31 22:32:48
3       1     3408       4 2000-12-31 22:04:35
4       1     2355       5 2001-01-06 23:38:11


## 1.7 Merge the ratings dataframe with the movies dataframe on the MovieID column, then with the users dataframe on the UserID column.


In [None]:
# Attach movie details (joined on MovieID) and then user demographics (joined
# on UserID) to every rating.
# - how='inner' is pandas' default, spelled out here to make it visible that
#   ratings without a matching movie or user are dropped.
# - validate='many_to_one' makes pandas raise a MergeError if MovieID / UserID
#   is duplicated in the lookup table, instead of silently multiplying rating
#   rows.
merged_data = (
    ratings
    .merge(movies, on='MovieID', how='inner', validate='many_to_one')
    .merge(users, on='UserID', how='inner', validate='many_to_one')
)

# Display the column names of the merged dataframe to verify both merges and
# to see which fields are available.
print("Columns in the merged dataframe:")
print(merged_data.columns)

Columns in the merged dataframe:
Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'Title', 'Genres', 'Gender',
       'Age', 'Occupation', 'Zip-code'],
      dtype='object')


## 1.8 Checking for Duplicates

In [None]:
# Flag rows that repeat an earlier (UserID, MovieID, Rating, Timestamp)
# combination; only these four columns are compared, not the whole row.
dup_bool = merged_data.duplicated(['UserID', 'MovieID', 'Rating', 'Timestamp'])
# Series.sum() counts the True flags in vectorised code, whereas the builtin
# sum() would iterate over every row in Python.
dups = int(dup_bool.sum())
print("There are {} duplicate rating entries in the data..".format(dups))

There are 0 duplicate rating entries in the data..


## 3.4 Matrix Factorization

### 3.4.1 Matrix factorization using SVD




In [None]:
from surprise import Dataset, Reader, KNNWithMeans, SVD, accuracy
from surprise.model_selection import train_test_split



In [None]:
# Surprise reads (user, item, rating) triplets from the dataframe, in that
# column order; the Reader declares the rating scale as 1 to 5.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(merged_data[['UserID', 'MovieID', 'Rating']], reader)

# Hold out 25% of the ratings for testing. The fixed random_state makes the
# split, and therefore every metric computed from it, the same on each run.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [None]:
# Train SVD matrix factorization on the training ratings and score the
# held-out test set. random_state fixes the random initialisation of the
# factors so the fitted model and its predictions are reproducible.
algo_svd = SVD(random_state=42)
algo_svd.fit(trainset)
predictions_svd = algo_svd.test(testset)

In [None]:
# Show only a small sample: the full list holds one Prediction per test rating
# and would flood the cell output.
predictions_svd[:10]

[Prediction(uid=4104, iid=3702, r_ui=3.0, est=3.063573624849159, details={'was_impossible': False}),
 Prediction(uid=5539, iid=1211, r_ui=4.0, est=3.992980384590431, details={'was_impossible': False}),
 Prediction(uid=2142, iid=454, r_ui=4.0, est=3.4552129784966406, details={'was_impossible': False}),
 Prediction(uid=3871, iid=2174, r_ui=3.0, est=3.440207281588182, details={'was_impossible': False}),
 Prediction(uid=4277, iid=3271, r_ui=4.0, est=4.620037975983849, details={'was_impossible': False}),
 Prediction(uid=4471, iid=1060, r_ui=4.0, est=3.8594822650286496, details={'was_impossible': False}),
 Prediction(uid=3033, iid=595, r_ui=5.0, est=3.5265182203861345, details={'was_impossible': False}),
 Prediction(uid=5100, iid=3397, r_ui=4.0, est=4.357103373356144, details={'was_impossible': False}),
 Prediction(uid=5262, iid=2628, r_ui=3.0, est=2.8401877684797787, details={'was_impossible': False}),
 Prediction(uid=5557, iid=1573, r_ui=1.0, est=3.179006074257469, details={'was_impossible

In [None]:
# accuracy.rmse returns the score and, by default, also prints it; verbose=False
# suppresses that printout so the RMSE is reported exactly once, on one line.
svd_rmse = accuracy.rmse(predictions_svd, verbose=False)
print(f"SVD RMSE: {svd_rmse:.4f}")

SVD RMSE:
RMSE: 0.8776


0.8775899318379707

In [None]:
# User / movie pair to score, given as the ids used in the UserID and MovieID
# columns the dataset was built from.
user_id, movie_id = 1, 1193
prediction_svd = algo_svd.predict(user_id, movie_id)

# Header line followed by the prediction's string form on the next line.
print(
    f"Prediction for User {user_id} on Movie {movie_id} (SVD):",
    prediction_svd,
    sep="\n",
)

Prediction for User 1 on Movie 1193 (SVD):
user: 1          item: 1193       r_ui = None   est = 4.48   {'was_impossible': False}
