In [2]:
import numpy as np
import pandas as pd

In [10]:
movie_data = pd.read_csv('ml-1m/movies.dat', delimiter='::', header=None, engine='python', encoding='latin1')
rating_data = pd.read_csv('ml-1m/ratings.dat', delimiter='::', header=None, engine='python', encoding='latin1')
user_data = pd.read_csv('ml-1m/users.dat', delimiter='::', header=None, engine='python', encoding='latin1')

In [9]:
movie_data.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
rating_data.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [12]:
user_data.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [13]:
rating_data.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
rating_data.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [14]:
user_data.columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
user_data.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [15]:
movie_data.columns = ['MovieID', 'Title', 'Genres']
movie_data.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


CBF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert Genres to feature vectors
tfidf = TfidfVectorizer()
movie_features = tfidf.fit_transform(movie_data['Genres'])

In [17]:
user_profiles = {}

for user_id in user_data['UserID']:
    user_ratings = rating_data[rating_data['UserID'] == user_id]
    user_movies = movie_data[movie_data['MovieID'].isin(user_ratings['MovieID'])]
    
    user_profile = movie_features[user_movies.index].mean(axis=0)
    user_profiles[user_id] = user_profile

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

# Assuming user_profile and movie_features are sparse matrices
user_profile_array = np.asarray(user_profile)  # Convert user profile to numpy array
movie_features_array = np.asarray(movie_features.toarray())  # Convert movie features to numpy array

# Calculate cosine similarity
similarities = cosine_similarity(user_profile_array, movie_features_array)

# Find the top 10 most similar movies
recommended_movie_ids = similarities.argsort().flatten()[-10:]

recommended_movie_ids

array([2023,  773,  590,   47, 1526, 2012, 2011, 2033, 2009,  584])

NCF

In [19]:
from sklearn.model_selection import train_test_split

X = rating_data[['UserID', 'MovieID']]
y = rating_data['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model

# Input layers
user_input = Input(shape=(1,))
movie_input = Input(shape=(1,))

# Embedding layers
user_embedding = Embedding(input_dim=user_data['UserID'].nunique(), output_dim=50)(user_input)
movie_embedding = Embedding(input_dim=movie_data['MovieID'].nunique(), output_dim=50)(movie_input)

# Flatten the embeddings
user_vec = Flatten()(user_embedding)
movie_vec = Flatten()(movie_embedding)

# Concatenate user and movie vectors
concat = Concatenate()([user_vec, movie_vec])

# Add Dense layers
dense_1 = Dense(128, activation='relu')(concat)
dense_2 = Dense(64, activation='relu')(dense_1)
output = Dense(1)(dense_2)

model = Model([user_input, movie_input], output)

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit([X_train['UserID'], X_train['MovieID']], y_train, epochs=5, batch_size=64)

In [None]:
predictions = model.predict([X_test['UserID'], X_test['MovieID']])
