In [26]:
## Loading the datasets
from pathlib import Path

import pandas as pd

# Single place to change if the CSV files move; the three reads below share it.
DATA_DIR = Path('data/ml-latest-small')

# Each table is read as-is; later cells join them on `movieId` / `userId`.
movies = pd.read_csv(DATA_DIR / 'movies.csv')
ratings = pd.read_csv(DATA_DIR / 'ratings.csv')
tags = pd.read_csv(DATA_DIR / 'tags.csv')

In [3]:
# Preview the first rows of the movies table (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Preview the first rows of the tags table (userId, movieId, tag, timestamp).
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
# Preview the first rows of `movies` again before the feature-engineering cell.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [45]:
import numpy as np

# NOTE: this cell removes `title` and `genres` from `movies` at the end, so it
# can only be run once per load; reload the CSVs before re-running it.

# Extract the four-digit year from titles such as "Toy Story (1995)".
# The pattern is a raw string so that "\(" and "\d" reach the regex engine
# untouched instead of being parsed as (invalid) Python string escapes.
# Values are strings; titles without a "(YYYY)" group get NaN.
movies['year'] = movies['title'].str.extract(r"\((\d{4})\)", expand=False)

# Extracting unique genres from the pipe-separated genres column.
# split(expand=True) pads shorter rows with None, which is why None is listed.
genres = pd.unique(movies['genres'].str.split('|', expand=True).values.ravel())
print(f"Unique entry in genres column: {genres.shape[0]}\n" ,genres,"\n")

# Drop the padding value, the "no genre" placeholder and the genres that are
# deliberately left out of the feature set.
excluded_genres = [None, '(no genres listed)', 'Film-Noir', 'IMAX', 'Musical', 'War', 'Western']
genres = genres[~np.isin(genres, excluded_genres)]
print(f"Updated suitable genres: {genres.shape[0]}\n" ,genres,"\n")

# Creating a new 0/1 column for each genre (one-hot encoding).
# Membership is tested against the split label list, not the raw string, so a
# genre can never match merely because it is a substring of another label.
genre_labels = movies['genres'].str.split('|')
for genre in genres:
    movies[genre] = genre_labels.apply(lambda labels, wanted=genre: int(wanted in labels))

# Dropping the genres and title columns now that they are encoded.
movies = movies.drop(columns=['genres', 'title'])

movies.head()

Unique entry in genres column: 21
 ['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' None 'Romance'
 'Drama' 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'War'
 'Musical' 'Documentary' 'IMAX' 'Western' 'Film-Noir' '(no genres listed)'] 

Updated suitable genres: 14
 ['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'Documentary'] 



Unnamed: 0,movieId,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1995,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,2,1995,1,0,1,0,1,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,1,0,1,0,0,0,0,0,0,0,0
3,4,1995,0,0,0,1,0,1,1,0,0,0,0,0,0,0
4,5,1995,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [48]:
# Attach the encoded movie features to every rating row.
# Later cells pair rows of `users`, `x_movies` and `ratings` purely by position,
# so the join must neither duplicate nor drop rating rows:
#   - validate='many_to_one' raises if `movies` has a repeated movieId;
#   - the assert catches ratings whose movieId is missing from `movies`.
merge_df = pd.merge(ratings, movies, on='movieId', validate='many_to_one')
assert len(merge_df) == len(ratings), "ratings/movies join changed the number of rating rows"

# One column per genre, indexed by userId: the mean rating that user gave to
# movies of that genre. Users with no rating in a genre are NaN for it.
user_genre_means = pd.DataFrame({
    genre: merge_df.loc[merge_df[genre] == 1].groupby('userId')['rating'].mean()
    for genre in genres
})

# One row per rating (same order as `ratings`), carrying the rater's genre means.
users = ratings[['userId', 'movieId']].join(user_genre_means, on='userId')

# Fill a user's missing genre mean with the mean of that column over all rows.
users = users.fillna(users.mean())
print('Users: ',users.shape)
users.head()

Users:  (100836, 16)


Unnamed: 0,userId,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.685863
1,1,3,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.685863
2,1,6,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.685863
3,1,47,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.685863
4,1,50,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.685863


In [52]:
# Creating Movie Train Data
x_movies = merge_df.drop(columns = ['userId','rating','timestamp'])
print('Movies: ',x_movies.shape)
x_movies.head()

Movies:  (100836, 16)


Unnamed: 0,movieId,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1995,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,3,1995,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,6,1995,0,0,0,0,0,0,0,1,1,1,0,0,0,0
3,47,1995,0,0,0,0,0,0,0,0,0,1,0,1,0,0
4,50,1995,0,0,0,0,0,0,0,0,1,1,0,1,0,0


In [70]:
import torch
from sklearn.preprocessing import StandardScaler

# The three tensors built here are matched row-by-row, so their sources must
# have the same number of rows.
assert len(users) == len(x_movies) == len(ratings), "user, movie and rating rows are not aligned"

# NOTE(review): both scalers are fit on every row, i.e. before the train/test
# split performed in a later cell, so test rows influence the scaling statistics.

# User features: every column except the identifiers, selected by name so a
# change in column order cannot silently shift the slice.
users_numpy = users.drop(columns=['userId', 'movieId']).values
user_scaler = StandardScaler()
tensor_user = torch.tensor(user_scaler.fit_transform(users_numpy), dtype=torch.float32)

# Movie features: year plus the one-hot genre columns (movieId is an identifier).
movie_numpy = x_movies.drop(columns=['movieId']).values
movie_scaler = StandardScaler()
tensor_movie = torch.tensor(movie_scaler.fit_transform(movie_numpy),dtype=torch.float32)

# StandardScaler passes NaN through (e.g. a movie whose title had no year).
# After standardisation 0.0 is the column mean, so this is mean imputation and
# keeps NaN out of the model inputs.
tensor_user = torch.nan_to_num(tensor_user, nan=0.0)
tensor_movie = torch.nan_to_num(tensor_movie, nan=0.0)

# Target ratings as a column vector with one row per rating.
tensor_y = torch.tensor(ratings['rating'].values.reshape(-1,1),dtype=torch.float32)

In [72]:
# Sanity check: all three tensors should report the same number of rows.
print('Movie Dataset Shape: ',tensor_movie.shape)
print('User Dataset Shape: ', tensor_user.shape)
print('Target Rating Dataset Shape: ', tensor_y.shape)

Movie Dataeet Shape:  torch.Size([100836, 15])
User Dataset Shape:  torch.Size([100836, 14])
Target Rating Dataset Shape:  torch.Size([100836, 1])


In [77]:
from sklearn.model_selection import train_test_split

# Split all three tensors in ONE call so they are guaranteed to share the same
# shuffled row indices; separate calls stay aligned only while their arguments
# happen to match. The return order is train/test for each input, in input order.
(user_train, user_test,
 movie_train, movie_test,
 y_train, y_test) = train_test_split(
    tensor_user, tensor_movie, tensor_y,
    test_size=0.2, shuffle=True, random_state=2,
)

print(f"Train >> user:{user_train.shape}, movie:{movie_train.shape}, y:{y_train.shape}")
print(f"Test >> user:{user_test.shape}, movie:{movie_test.shape}, y:{y_test.shape}")

Train >> user:torch.Size([80668, 14]), movie:torch.Size([80668, 15]), y:torch.Size([80668, 1])
Test >> user:torch.Size([20168, 14]), movie:torch.Size([20168, 15]), y:torch.Size([20168, 1])


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the rating min/max from the training targets only, then apply that
# same scaling to both splits (MinMaxScaler's default output range is [0, 1]).
y_scaler = MinMaxScaler()
y_scaler.fit(y_train)
y_train_norm, y_test_norm = (y_scaler.transform(split) for split in (y_train, y_test))