In [122]:
## Loading the DataSets
import pandas as pd

# Single place to change if the dataset folder moves; the three CSVs below
# are all read from this directory (path is relative to the notebook's cwd).
DATA_DIR = 'data/ml-latest-small'

movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
tags = pd.read_csv(f'{DATA_DIR}/tags.csv')

In [123]:
# Preview the first five rows of the movies table.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [124]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [125]:
# Preview the first five rows of the tags table.
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [126]:
# Same preview as the earlier movies cell: the frame as loaded, before the
# following cell replaces `title`/`genres` with engineered feature columns.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [127]:
import numpy as np
# Movie feature engineering: pull the release year out of the title and
# one-hot encode a curated subset of genres. Produces:
#   genres - numpy array of the genre names that were kept (used by later cells)
#   movies - movieId, year, then one 0/1 column per kept genre

# Entries removed from the genre list before encoding. None is the padding
# that str.split(expand=True) emits for rows with fewer genres than the widest row.
EXCLUDED_GENRES = [None, '(no genres listed)', 'Film-Noir', 'IMAX', 'Musical', 'War', 'Western']

# Snapshot the frame while it still has its text columns. On a re-run `movies`
# has already been replaced by the feature frame below, so the snapshot from
# the first run is reused instead of failing on the missing columns.
if {'title', 'genres'}.issubset(movies.columns):
    movies_raw = movies.copy()

# Extracting the year from the title column.
# Raw string: "\(" and "\d" are regex escapes, not Python string escapes.
# The first "(dddd)" group in the title is used; titles without one yield NaN.
# The year is kept as text here, exactly as captured by the regex.
year = movies_raw['title'].str.extract(r"\((\d{4})\)", expand=False)

# Extracting unique genres from the genres column (order of first appearance)
all_genres = pd.unique(movies_raw['genres'].str.split('|', expand=True).values.ravel())
print(f"Unique entry in genres column: {all_genres.shape[0]}\n" ,all_genres,"\n")
genres = all_genres[~np.isin(all_genres, EXCLUDED_GENRES)]
print(f"Updated suitable genres: {genres.shape[0]}\n" ,genres,"\n")

# Creating a new column for each genre (one-hot encoding).
# get_dummies matches whole '|'-separated tokens, so one genre name can never
# be credited because it happens to be a substring of another; reindex keeps
# only the selected genres, in the order of `genres`.
genre_flags = (
    movies_raw['genres']
    .str.get_dummies(sep='|')
    .reindex(columns=genres, fill_value=0)
)

# Dropping the genres and title columns and attaching the engineered features.
# Built as a new frame (no inplace mutation) so `movies_raw` stays intact.
movies = (
    movies_raw
    .drop(columns=['genres', 'title'])
    .assign(year=year)
    .join(genre_flags)
)

movies.head()

Unique entry in genres column: 21
 ['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' None 'Romance'
 'Drama' 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'War'
 'Musical' 'Documentary' 'IMAX' 'Western' 'Film-Noir' '(no genres listed)'] 

Updated suitable genres: 14
 ['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'Documentary'] 



Unnamed: 0,movieId,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,1995,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,2,1995,1,0,1,0,1,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,1,0,1,0,0,0,0,0,0,0,0
3,4,1995,0,0,0,1,0,1,1,0,0,0,0,0,0,0
4,5,1995,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [268]:
# Per-user taste profile: one row per userId, one column per kept genre,
# holding that user's mean rating over the movies flagged with the genre.

# Ratings joined with the one-hot genre columns of `movies`.
merge_df = pd.merge(ratings, movies, on='movieId')

# Every user that appears in `ratings`, in ascending userId order.
all_user_ids = ratings.groupby('userId')['rating'].count().index

# Mean rating per user for each genre, restricted to movies carrying that genre.
genre_means = {
    genre: merge_df.loc[merge_df[genre] == 1].groupby('userId')['rating'].mean()
    for genre in genres
}

# Align all genres on the full user list; a user who never rated a genre gets NaN.
users = (
    pd.DataFrame(genre_means)
    .reindex(all_user_ids)
    .rename_axis('userId')
    .reset_index()
)

# Replace the missing user/genre averages with that genre's mean across users.
users = users.fillna(users.mean())

users.head()

Unnamed: 0,userId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,4.388235,4.689655,4.547619,4.277108,4.297872,4.307692,4.529412,4.322222,4.355556,4.145455,3.470588,4.166667,4.225,3.779962
1,2,4.166667,3.639936,3.489831,4.0,3.570513,4.5,3.882353,3.954545,3.8,3.7,3.0,4.0,3.875,4.333333
2,3,2.727273,0.5,0.5,1.0,3.375,0.5,0.75,3.571429,0.5,4.142857,4.6875,5.0,4.2,3.779962
3,4,3.655172,4.0,3.8,3.509615,3.684211,3.37931,3.483333,3.32,3.814815,3.552632,4.25,3.478261,2.833333,4.0
4,5,3.25,4.333333,4.111111,3.466667,4.142857,3.090909,3.8,3.111111,3.833333,3.555556,3.0,4.0,2.5,3.779962


In [265]:
import torch
from sklearn.preprocessing import StandardScaler

# Standardize the user and movie feature tables and convert them to float32
# tensors. Identifier columns are dropped by name, so the result does not
# depend on the id happening to be the first column.

users_numpy = users.drop(columns='userId').to_numpy(dtype='float64')
user_scaler = StandardScaler()
tensor_user = torch.tensor(user_scaler.fit_transform(users_numpy), dtype=torch.float32)

# `year` comes from a regex extract, so it is text and is NaN for any title
# without a "(dddd)" token. StandardScaler ignores NaN when fitting but keeps
# it in the transformed output, which would put NaNs into the tensor.
# Coerce every feature to numeric and fill gaps with the column median first.
movie_features = movies.drop(columns='movieId').apply(pd.to_numeric, errors='coerce')
movie_features = movie_features.fillna(movie_features.median())

movie_numpy = movie_features.to_numpy(dtype='float64')
movie_scaler = StandardScaler()
tensor_movie = torch.tensor(movie_scaler.fit_transform(movie_numpy),dtype=torch.float32)

# Guard for the cells that consume this tensor: fail here, not mid-training.
assert not torch.isnan(tensor_movie).any(), "tensor_movie contains NaN values"


In [267]:
# Report the dimensions of both feature tensors (rows x feature columns).
for label, tensor in (
    ('Movie Dataeet Shape: ', tensor_movie),
    ('User Dataset Shape: ', tensor_user),
):
    print(label, tensor.shape)

Movie Dataeet Shape:  torch.Size([9742, 15])
User Dataset Shape:  torch.Size([610, 14])
