In [125]:
## Loading the DataSets
import pandas as pd

# Read the three CSV tables stored under data/ml-latest-small into DataFrames.
movies, ratings, tags = map(
    pd.read_csv,
    (
        'data/ml-latest-small/movies.csv',
        'data/ml-latest-small/ratings.csv',
        'data/ml-latest-small/tags.csv',
    ),
)

In [126]:
# Preview the first rows of the movies table (columns: movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [127]:
# Preview the first rows of the ratings table (columns: userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [128]:
# Preview the first rows of the tags table (columns: userId, movieId, tag, timestamp).
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [129]:
# Re-inspect the movies table right before features are derived from it.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [130]:
import numpy as np
# Extract the 4-digit release year from the "(YYYY)" part of each title.
# The pattern is a raw string: "\(" and "\d" are regex escapes, and in a plain
# string literal they are invalid Python escape sequences (a warning today).
# Titles with no "(YYYY)" yield NaN; the year stays a string at this stage.
movies['year'] = movies['title'].str.extract(r"\((\d{4})\)", expand=False)

# Collect the unique genre labels from the pipe-separated genres column.
# expand=True pads shorter rows with None, which is why None shows up below.
genres = pd.unique(movies.genres.str.split('|',expand=True).values.ravel())
print(f"Unique entry in genres column: {genres.shape[0]}\n" ,genres,"\n")
# Drop the padding value, the "no genre" placeholder and the genres this
# notebook chooses not to model.
genres = genres[~np.isin(genres, [None,'(no genres listed)', 'Film-Noir', 'IMAX', 'Musical', 'War', 'Western'])]
print(f"Updated suitable genres: {genres.shape[0]}\n" ,genres,"\n")

# One-hot encode the retained genres. Membership is tested against the split
# token list rather than the raw string, so a label can never match as a
# substring of another label.
genre_tokens = movies['genres'].str.split('|')
for genre in genres:
    movies[genre] = genre_tokens.apply(lambda tokens, label=genre: int(label in tokens))

# The raw text columns are no longer needed once the features exist.
movies = movies.drop(columns=['genres','title'])

movies.head()

Unique entry in genres column: 21
 ['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' None 'Romance'
 'Drama' 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'War'
 'Musical' 'Documentary' 'IMAX' 'Western' 'Film-Noir' '(no genres listed)'] 

Updated suitable genres: 14
 ['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'Documentary'] 



Unnamed: 0,movieId,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1995,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,2,1995,1,0,1,0,1,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,1,0,1,0,0,0,0,0,0,0,0
3,4,1995,0,0,0,1,0,1,1,0,0,0,0,0,0,0
4,5,1995,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [152]:
# Show the ratings below 1; in the displayed rows these are all 0.5.
ratings[ratings['rating']<1]

Unnamed: 0,userId,movieId,rating,timestamp
261,3,31,0.5,1306463578
262,3,527,0.5,1306464275
263,3,647,0.5,1306463619
264,3,688,0.5,1306464228
265,3,720,0.5,1306463595
...,...,...,...,...
99394,608,6827,0.5,1117519648
99408,608,7004,0.5,1117506252
100345,610,61818,0.5,1493848943
100415,610,72424,0.5,1493849030


In [131]:
# The year column contains some nulls (titles the year regex did not match).
# First discard every rating that refers to a movie row holding any null value,
# then discard the movies whose year is missing.
incomplete_movie_ids = movies.loc[movies.isnull().any(axis=1), 'movieId']
rating_is_usable = ~ratings['movieId'].isin(incomplete_movie_ids)
ratings = ratings[rating_is_usable]
movies.dropna(subset=['year'],inplace=True)

In [158]:
# Join every rating with the features of the movie it refers to.
merge_df = pd.merge(ratings, movies, on='movieId')

# Per-user mean rating for each retained genre, assembled into one table with
# a row per user. A user who never rated a movie of some genre gets NaN there.
# NOTE(review): the means are computed from *all* ratings, including rows that
# are later assigned to the test split, so the target leaks into the user
# features. Consider computing them from training rows only.
user_genre_means = (
    pd.DataFrame({
        genre: merge_df.loc[merge_df[genre] == 1].groupby('userId')['rating'].mean()
        for genre in genres
    })
    .rename_axis('userId')
    .reset_index()
)

# One feature row per rating. The (userId, movieId) pairs are taken from
# merge_df so the rows are in the same order as anything else derived from
# merge_df, and a single left merge attaches all genre means at once.
users = merge_df[['userId', 'movieId']].merge(user_genre_means, on='userId', how='left')

# Column-wise mean of the genre features. Kept available as a mean-imputation
# alternative (users.fillna(mean_rating)); it is not applied here.
mean_rating = users.loc[:, ~users.columns.isin(['userId', 'movieId'])].mean()
print('Users: ',users.shape)
users.head()

Users:  (100818, 16)


Unnamed: 0,userId,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,
1,1,3,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,
2,1,6,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,
3,1,47,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,
4,1,50,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,


In [159]:
# Count missing values per column. A NaN in a genre column comes from the left
# merge: that user has no rated movie in the genre.
print(users.isna().sum())

userId             0
movieId            0
Adventure        106
Animation       4455
Children        2522
Comedy            21
Fantasy          833
Romance           98
Drama              0
Action            45
Crime            187
Thriller          37
Horror          2801
Mystery          888
Sci-Fi           162
Documentary    31051
dtype: int64


In [160]:
# Replace every missing genre average with 0.
# NOTE(review): 0 is below the lowest rating shown in this notebook (0.5), so
# "never rated this genre" is encoded as an extreme dislike — confirm intended.
users = users.fillna(0)
users.head()

Unnamed: 0,userId,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,0.0
1,1,3,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,0.0
2,1,6,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,0.0
3,1,47,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,0.0
4,1,50,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,0.0


In [161]:
# Movie-side training features: one row per rating, keeping movieId, year and
# the genre flags. The user id, the target and the timestamp are removed, and
# the year (extracted as text) is converted to an integer.
non_movie_columns = ['userId','rating','timestamp']
x_movies = (
    merge_df
    .drop(columns=non_movie_columns)
    .assign(year=lambda frame: frame['year'].astype(int))
)
print('Movies: ',x_movies.shape)
x_movies.head()

Movies:  (100818, 16)


Unnamed: 0,movieId,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1995,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,3,1995,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,6,1995,0,0,0,0,0,0,0,1,1,1,0,0,0,0
3,47,1995,0,0,0,0,0,0,0,0,0,1,0,1,0,0
4,50,1995,0,0,0,0,0,0,0,0,1,1,0,1,0,0


In [194]:
import torch
from sklearn.preprocessing import StandardScaler

# User features: everything after the userId and movieId columns, standardised
# column-wise.
# NOTE(review): both scalers are fitted on the full dataset, before the
# train/test split, so test-set statistics influence the scaling.
users_numpy = users.iloc[:, 2:].to_numpy()
user_scaler = StandardScaler()
tensor_user = torch.tensor(user_scaler.fit_transform(users_numpy), dtype=torch.float32)

# Movie features: everything after movieId, i.e. year followed by genre flags.
movie_numpy = x_movies.iloc[:, 1:].to_numpy().astype(np.float32)
movie_scaler = StandardScaler()
# Only the year (column 0) is standardised; the genre flags stay 0/1.
# The scaler works on 2-D input, hence the reshape / flatten round trip.
movie_numpy[:,0] = movie_scaler.fit_transform(movie_numpy[:,0].reshape(-1,1)).flatten()
tensor_movie = torch.tensor(movie_numpy,dtype=torch.float32)

# Targets are read from merge_df, the same frame the movie features were built
# from, so row i of the targets is guaranteed to belong to row i of the
# features regardless of how the merge ordered its output.
y_numpy = merge_df['rating'].to_numpy().reshape(-1,1)

In [195]:
# Sanity check: the three arrays must have the same number of rows, since
# row i of each describes the same rating.
print('Movie Dataeet Shape: ',tensor_movie.shape)
print('User Dataset Shape: ', tensor_user.shape)
print('Target Rating Dataset Shape: ', y_numpy.shape)

Movie Dataeet Shape:  torch.Size([100818, 15])
User Dataset Shape:  torch.Size([100818, 14])
Target Rating Dataset Shape:  (100818, 1)


In [196]:
from sklearn.model_selection import train_test_split

# Split user features, movie features and targets in ONE call so the three
# stay row-aligned by construction, instead of relying on three independent
# calls happening to share the same random_state and input length.
(user_train, user_test,
 movie_train, movie_test,
 y_train, y_test) = train_test_split(
    tensor_user, tensor_movie, y_numpy,
    test_size=0.2, shuffle=True, random_state=2,
)

print(f"Train >> user:{user_train.shape}, movie:{movie_train.shape}, y:{y_train.shape}")
print(f"Test >> user:{user_test.shape}, movie:{movie_test.shape}, y:{y_test.shape}")

Train >> user:torch.Size([80654, 14]), movie:torch.Size([80654, 15]), y:(80654, 1)
Test >> user:torch.Size([20164, 14]), movie:torch.Size([20164, 15]), y:(20164, 1)


In [214]:
# Disabled alternative: rescale the targets with a fitted MinMaxScaler.
# from sklearn.preprocessing import MinMaxScaler

# y_scaler = MinMaxScaler((0,5))
# y_train_norm = y_scaler.fit_transform(y_train)
# y_test_norm = y_scaler.transform(y_test)

# Active approach: divide the ratings by 5. The ratings shown in this notebook
# range from 0.5 to 5.0, so this presumably maps the targets into (0, 1], the
# same range as the model's sigmoid output — assumes 5 is the maximum rating.
y_train_norm = y_train/5
y_test_norm = y_test/5

In [217]:
import torch
from torch.utils.data import DataLoader, TensorDataset
batch_size = 512

# Normalised targets as float32 tensors, matching the feature tensors' dtype.
y_train_tensor = torch.tensor(y_train_norm,dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_norm,dtype=torch.float32)

# Each dataset item is a (user features, movie features, target) triple.
train_data = TensorDataset(user_train, movie_train, y_train_tensor)
test_data = TensorDataset(user_test, movie_test, y_test_tensor)

# Training batches are reshuffled on each pass; test batches keep their order.
Train_Loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
Test_Loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

  test_data = TensorDataset(user_test,movie_test,torch.tensor(y_test_norm,dtype=torch.float32))


In [218]:
import torch
import torch.nn as nn

class CommonNN(nn.Module):
    """Fully connected tower that maps a feature vector to a 32-value embedding.

    Layer sizes are feature_count -> 64 -> 128 -> 32. ReLU follows the first
    two layers; the last layer is linear (no activation).

    Args:
        feature_count: number of input features per sample.
    """

    def __init__(self,feature_count):
        super().__init__()

        # Attribute names are part of the saved state_dict keys; keep them.
        self.fc1 = nn.Linear(in_features=feature_count, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=32)

    def forward(self,x):
        """Return the 32-value embedding for each row of ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        embedding = self.fc3(hidden)
        return embedding
    

class RecommenderNN(nn.Module):
    """Two-tower rating predictor.

    A user tower and a movie tower (both ``CommonNN``) each produce an
    embedding; the prediction is the sigmoid of their dot product, so the
    output lies strictly between 0 and 1.

    Args:
        user_feature_no: number of input features for the user tower.
        movie_feature_no: number of input features for the movie tower.
    """

    def __init__(self,user_feature_no,movie_feature_no):
        super().__init__()
        self.UserNN = CommonNN(user_feature_no)
        self.MovieNN = CommonNN(movie_feature_no)

    def forward(self,user_train,movie_train):
        """Return one score per row as a column tensor of shape (rows, 1)."""
        user_embedding = self.UserNN(user_train)
        movie_embedding = self.MovieNN(movie_train)
        # Row-wise dot product of the two embeddings, reshaped into a column.
        dot_product = torch.mul(user_embedding, movie_embedding).sum(dim=1)
        score = dot_product.reshape(-1,1)
        return torch.sigmoid(score)



# for name, param in model.named_parameters():
#     if 'weight' in name:  # Filters out the weights
#         print(f"{name} initialized weights: \n{param.data}")

In [219]:
# Size each tower from the number of feature columns in the training tensors.
n_user_features = user_train.shape[1]
n_movie_features = movie_train.shape[1]
model = RecommenderNN(n_user_features, n_movie_features)

# Mean squared error between predicted and normalised ratings.
criterion = nn.MSELoss()

# Adam with a small weight decay applied to all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001, weight_decay=1e-5)

In [220]:
epochs = 50

# Put the model in training mode explicitly: the evaluation cell switches it
# to eval mode, and re-running this cell afterwards would otherwise keep
# training in that mode.
model.train()

for epoch in range(epochs):
    running_loss = 0.0  # sum of the per-batch losses in this epoch

    for user_set, movie_set, y_set in Train_Loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step

        y_predict = model(user_set,movie_set)
        loss = criterion(y_predict,y_set)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {(running_loss/len(Train_Loader)):.4f}")

Epoch 1/50, Loss: 0.0361
Epoch 2/50, Loss: 0.0321
Epoch 3/50, Loss: 0.0311
Epoch 4/50, Loss: 0.0305
Epoch 5/50, Loss: 0.0301
Epoch 6/50, Loss: 0.0297
Epoch 7/50, Loss: 0.0295
Epoch 8/50, Loss: 0.0293
Epoch 9/50, Loss: 0.0291
Epoch 10/50, Loss: 0.0289
Epoch 11/50, Loss: 0.0286
Epoch 12/50, Loss: 0.0285
Epoch 13/50, Loss: 0.0284
Epoch 14/50, Loss: 0.0283
Epoch 15/50, Loss: 0.0282
Epoch 16/50, Loss: 0.0281
Epoch 17/50, Loss: 0.0280
Epoch 18/50, Loss: 0.0279
Epoch 19/50, Loss: 0.0278
Epoch 20/50, Loss: 0.0277
Epoch 21/50, Loss: 0.0276
Epoch 22/50, Loss: 0.0275
Epoch 23/50, Loss: 0.0275
Epoch 24/50, Loss: 0.0274
Epoch 25/50, Loss: 0.0273
Epoch 26/50, Loss: 0.0272
Epoch 27/50, Loss: 0.0272
Epoch 28/50, Loss: 0.0271
Epoch 29/50, Loss: 0.0271
Epoch 30/50, Loss: 0.0270
Epoch 31/50, Loss: 0.0269
Epoch 32/50, Loss: 0.0269
Epoch 33/50, Loss: 0.0269
Epoch 34/50, Loss: 0.0268
Epoch 35/50, Loss: 0.0267
Epoch 36/50, Loss: 0.0267
Epoch 37/50, Loss: 0.0266
Epoch 38/50, Loss: 0.0266
Epoch 39/50, Loss: 0.

In [247]:
# Evaluate on the held-out loader without tracking gradients.
# The loop variables deliberately do NOT reuse the names user_test /
# movie_test / y_test: doing so would overwrite the full test split with the
# last batch, and any cell re-run afterwards (target normalisation, dataset
# construction) would silently work on that single batch.
model.eval()
test_loss = 0.0  # sum of the per-batch losses
with torch.no_grad():
    for user_batch, movie_batch, y_batch in Test_Loader:
        batch_pred = model(user_batch,movie_batch)
        test_loss += criterion(batch_pred,y_batch).item()
# Mean of the per-batch losses, the same averaging as the training log.
print(f"Testing Loss: {test_loss/len(Test_Loader)}")

Testing Loss: 0.030721087008714676


In [248]:
import torch
# Save the weights of the complete two-tower model.
# The target folder is created first so the save does not fail on a fresh
# checkout where data/model does not exist yet.
import os
os.makedirs('data/model', exist_ok=True)
torch.save(model.state_dict(), 'data/model/rating_v0.pth')


In [249]:
# Save each tower's weights separately so a user or movie embedding can be
# computed without loading the other tower.
user_model, movie_model = model.UserNN, model.MovieNN
for tower, weights_path in (
    (user_model, 'data/model/user_v0.pth'),
    (movie_model, 'data/model/movie_v0.pth'),
):
    torch.save(tower.state_dict(), weights_path)

In [250]:
import joblib

# Persist the fitted feature scalers next to the model weights so the same
# standardisation can be applied to new inputs.
joblib.dump(user_scaler, 'data/model/user_scaler.pkl')
joblib.dump(movie_scaler, 'data/model/movie_scaler.pkl')
# No target scaler is saved: the MinMaxScaler path is disabled and the ratings
# are simply divided by 5.
# joblib.dump(y_scaler, 'data/model/y_scaler.pkl')

['data/model/movie_scaler.pkl']

In [226]:
import os


# Load the saved movie scaler back from disk; this rebinds movie_scaler to the
# deserialised object.
movie_scaler_path = os.path.join("data","model",'movie_scaler.pkl')
movie_scaler = joblib.load(movie_scaler_path)

In [236]:
# Check the reloaded scaler on a single year (2015). The value is reshaped to
# one row by one column before transform, and the result is flattened to 1-D.
movie_scaler.transform(np.array(2015).reshape(-1,1)).flatten()

array([1.43127259])

In [244]:
# Scratch float32 array used to try writing a scaled year into position 0.
ar = np.array([1,2],dtype=np.float32)


In [245]:
# Overwrite element 0 with the standardised value for 2015. [0] pulls the
# scalar out of the one-element result; it is stored in the array's float32 dtype.
ar[0] = movie_scaler.transform(np.array(2015).reshape(-1,1)).flatten()[0]

In [246]:
# Display the array to confirm position 0 now holds the scaled year.
ar

array([1.4312726, 2.       ], dtype=float32)