In [1]:
## Loading the DataSets
import pandas as pd

movies = pd.read_csv('data/ml-latest-small/movies.csv')
ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
tags = pd.read_csv('data/ml-latest-small/tags.csv')

In [2]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
import numpy as np
# Extracting the year from the title column
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)

# Extracting uiniqe genres from the genres column
genres = pd.unique(movies.genres.str.split('|',expand=True).values.ravel())
print(f"Unique entry in genres column: {genres.shape[0]}\n" ,genres,"\n")
genres = genres[~np.isin(genres, [None,'(no genres listed)', 'Film-Noir', 'IMAX', 'Musical', 'War', 'Western'])]
print(f"Updated suitable genres: {genres.shape[0]}\n" ,genres,"\n")

# Creating a new column for each genre (One-hot encoding)
for genre in genres:
    movies[genre] = movies['genres'].apply(lambda x: 1 if genre in x else 0)

# Dropping the genres and title column
movies.drop(columns=['genres','title'], inplace=True)

movies.head()

Unique entry in genres column: 21
 ['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' None 'Romance'
 'Drama' 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'War'
 'Musical' 'Documentary' 'IMAX' 'Western' 'Film-Noir' '(no genres listed)'] 

Updated suitable genres: 14
 ['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'Documentary'] 



Unnamed: 0,movieId,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1995,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,2,1995,1,0,1,0,1,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,1,0,1,0,0,0,0,0,0,0,0
3,4,1995,0,0,0,1,0,1,1,0,0,0,0,0,0,0
4,5,1995,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [7]:
# x_movies[x_movies.isnull().any(axis=1)] # Checking Null Value (Year Column having some NuLL value)

# Removing the ROW which having NULL value in Year of Movies
ratings = ratings[~ratings['movieId'].isin(movies[movies.isnull().any(axis=1)]['movieId'])]
movies.dropna(subset=['year'],inplace=True)

In [8]:
users = ratings.copy()
users.drop(columns = ['rating','timestamp'],inplace=True)

users = users.merge(ratings.groupby('userId')['rating'].count().rename('count'),on='userId')

merge_df = pd.merge(ratings, movies, on='movieId')
for genre in genres:
    genre_movies = merge_df[merge_df[genre] == 1]
    avg_rating_by_genre = genre_movies.groupby('userId')['rating'].mean().reset_index().rename(columns = {'rating':genre})
    users = users.merge(avg_rating_by_genre, on='userId', how='left')

users.drop(columns=['count'],inplace=True)
users.fillna(users.mean(),inplace=True)
print('Users: ',users.shape)
users.head()

Users:  (100818, 16)


Unnamed: 0,userId,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.685826
1,1,3,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.685826
2,1,6,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.685826
3,1,47,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.685826
4,1,50,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.685826


In [9]:
# Creating Movie Train Data
x_movies = merge_df.drop(columns = ['userId','rating','timestamp'])
print('Movies: ',x_movies.shape)
x_movies.head()

Movies:  (100818, 16)


Unnamed: 0,movieId,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1995,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,3,1995,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,6,1995,0,0,0,0,0,0,0,1,1,1,0,0,0,0
3,47,1995,0,0,0,0,0,0,0,0,0,1,0,1,0,0
4,50,1995,0,0,0,0,0,0,0,0,1,1,0,1,0,0


In [10]:
import torch
from sklearn.preprocessing import StandardScaler

users_numpy = users.values[::,2:]
user_scaler = StandardScaler()
tensor_user = torch.tensor(user_scaler.fit_transform(users_numpy), dtype=torch.float32)

movie_numpy = x_movies.values[::,1:]
movie_scaler = StandardScaler()
tensor_movie = torch.tensor(movie_scaler.fit_transform(movie_numpy),dtype=torch.float32)

y_numpy = ratings['rating'].values.reshape(-1,1)

In [11]:
print('Movie Dataeet Shape: ',tensor_movie.shape)
print('User Dataset Shape: ', tensor_user.shape)
print('Target Rating Dataset Shape: ', y_numpy.shape)

Movie Dataeet Shape:  torch.Size([100818, 15])
User Dataset Shape:  torch.Size([100818, 14])
Target Rating Dataset Shape:  (100818, 1)


In [12]:
from sklearn.model_selection import train_test_split

user_train, user_test = train_test_split(tensor_user,test_size=0.2,shuffle=True,random_state=2)
movie_train, movie_test = train_test_split(tensor_movie,test_size=0.2,shuffle=True,random_state=2)
y_train,y_test = train_test_split(y_numpy,test_size=0.2,shuffle=True,random_state=2)

print(f"Train >> user:{user_train.shape}, movie:{movie_train.shape}, y:{y_train.shape}")
print(f"Test >> user:{user_test.shape}, movie:{movie_test.shape}, y:{y_test.shape}")

Train >> user:torch.Size([80654, 14]), movie:torch.Size([80654, 15]), y:(80654, 1)
Test >> user:torch.Size([20164, 14]), movie:torch.Size([20164, 15]), y:(20164, 1)


In [13]:
from sklearn.preprocessing import MinMaxScaler

y_scaler = MinMaxScaler()
y_train_norm = y_scaler.fit_transform(y_train)
y_test_norm = y_scaler.transform(y_test)

In [14]:
import torch
from torch.utils.data import DataLoader, TensorDataset
batch_size = 512
train_data = TensorDataset(user_train,movie_train,torch.tensor(y_train_norm,dtype=torch.float32))
test_data = TensorDataset(user_test,movie_test,torch.tensor(y_test_norm,dtype=torch.float32))

Train_Loader = DataLoader(train_data,batch_size=batch_size,shuffle=True)
Test_Loader = DataLoader(test_data,batch_size=batch_size,shuffle=False)

In [15]:
import torch
import torch.nn as nn

class CommonNN(nn.Module):
    def __init__(self,feature_count):
        super(CommonNN,self).__init__()

        self.fc1 = nn.Linear(feature_count,64)
        self.fc2 = nn.Linear(64,128)
        self.fc3 = nn.Linear(128,32)

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
    

class RecommenderNN(nn.Module):
    def __init__(self,user_feature_no,movie_feature_no):
        super(RecommenderNN,self).__init__()
        self.UserNN = CommonNN(user_feature_no)
        self.MovieNN = CommonNN(movie_feature_no)
        
    def forward(self,user_train,movie_train):
        return torch.sum(self.UserNN(user_train) * self.MovieNN(movie_train),dim=1).reshape(-1,1)



# for name, param in model.named_parameters():
#     if 'weight' in name:  # Filters out the weights
#         print(f"{name} initialized weights: \n{param.data}")

In [16]:
model = RecommenderNN(user_train.shape[1],movie_train.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001,weight_decay=1e-5)

In [17]:
epochs = 50

for epoch in range(epochs):
    running_loss = 0

    for user_set, movie_set, y_set in Train_Loader:
        optimizer.zero_grad()

        y_predict = model(user_set,movie_set)
        loss = criterion(y_predict,y_set)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {(running_loss/len(Train_Loader)):.4f}")

Epoch 1/50, Loss: 0.0549
Epoch 2/50, Loss: 0.0394
Epoch 3/50, Loss: 0.0383
Epoch 4/50, Loss: 0.0375
Epoch 5/50, Loss: 0.0369
Epoch 6/50, Loss: 0.0362
Epoch 7/50, Loss: 0.0359
Epoch 8/50, Loss: 0.0357
Epoch 9/50, Loss: 0.0354
Epoch 10/50, Loss: 0.0353
Epoch 11/50, Loss: 0.0350
Epoch 12/50, Loss: 0.0347
Epoch 13/50, Loss: 0.0347
Epoch 14/50, Loss: 0.0344
Epoch 15/50, Loss: 0.0344
Epoch 16/50, Loss: 0.0341
Epoch 17/50, Loss: 0.0341
Epoch 18/50, Loss: 0.0338
Epoch 19/50, Loss: 0.0337
Epoch 20/50, Loss: 0.0336
Epoch 21/50, Loss: 0.0334
Epoch 22/50, Loss: 0.0334
Epoch 23/50, Loss: 0.0332
Epoch 24/50, Loss: 0.0332
Epoch 25/50, Loss: 0.0333
Epoch 26/50, Loss: 0.0330
Epoch 27/50, Loss: 0.0329
Epoch 28/50, Loss: 0.0328
Epoch 29/50, Loss: 0.0327
Epoch 30/50, Loss: 0.0325
Epoch 31/50, Loss: 0.0325
Epoch 32/50, Loss: 0.0324
Epoch 33/50, Loss: 0.0323
Epoch 34/50, Loss: 0.0322
Epoch 35/50, Loss: 0.0323
Epoch 36/50, Loss: 0.0321
Epoch 37/50, Loss: 0.0320
Epoch 38/50, Loss: 0.0320
Epoch 39/50, Loss: 0.

In [18]:
model.eval()
with torch.no_grad():
    correct = 0
    loss = 0
    for user_test, movie_test, y_test in Test_Loader:
        out = model(user_test,movie_test)
        loss += criterion(out,y_test).item()
    print(f"Testing Loss: {loss/len(Test_Loader)}")

Testing Loss: 0.03608886953443289


In [45]:
import torch
torch.save(model.state_dict(), 'data/model/rating_v0.pth')


In [28]:
user_model = model.UserNN
movie_model = model.MovieNN
torch.save(user_model.state_dict(), 'data/model/user_v0.pth')
torch.save(movie_model.state_dict(), 'data/model/movie_v0.pth')

In [24]:
import joblib

joblib.dump(user_scaler, 'data/model/user_scaler.pkl')
joblib.dump(movie_scaler, 'data/model/movie_scaler.pkl')
joblib.dump(y_scaler, 'data/model/y_scaler.pkl')

['data/model/y_scaler.pkl']

In [34]:
user_model1 = CommonNN(14)
user_model1.load_state_dict(torch.load('data/model/user_v0.pth'))
user_model1.eval()

  user_model1.load_state_dict(torch.load('data/model/user_v0.pth'))


CommonNN(
  (fc1): Linear(in_features=14, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=32, bias=True)
)