# Data preprocessing

As preprocessing, it was decided to do the following:
1. Encode some features of the original dataset to bring them to one of the following data types: boolean, integer, float, or categorical. For this purpose, OneHotEncoding was applied on the gender and occupation attributes. In addition, age was normalized to the range [0;1].
2. Remove some features that are difficult to work with or may not potentially carry useful information, such as movie URL, zip code, and timestamp.
3. Create an additional 10 features based on ratings left by similar users or received for similar movies.

In [2]:
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

--2023-12-03 13:53:04--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2023-12-03 13:53:05 (14.8 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base  

In [3]:
# !pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163754 sha256=6835eebca9008e14028092855c1ec58a6174b993175bf5203f44639f2eb8f6c5
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [4]:
# import useful libraries
import pickle
import numpy as np
import pandas as pd
from IPython.display import display_html
import warnings
from sklearn.model_selection import train_test_split
from surprise import SVD
import numpy as np
import surprise
from surprise import Reader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder

warnings.filterwarnings('ignore')

base = "/content/ml-100k/"

%matplotlib inline

def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw = True)

In [70]:
# load rating and user data
ratings_data = pd.read_csv(base + 'u.data', sep = '\t', header = None)
user_data = pd.read_csv(base + 'u.user', sep = '|', header = None)
ratings_data.columns = ['user_id', 'item_id', 'rating', 'timestamp']
user_data.columns = ['user_id', 'age', 'gender', 'occupation', 'zip code']
display_side_by_side(ratings_data.head(10), user_data.head(10))

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013

Unnamed: 0,user_id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,5201
8,9,29,M,student,1002
9,10,53,M,lawyer,90703


In [71]:
# load movie data
item = pd.read_csv(base + 'u.item', sep = '|', header = None, encoding='latin-1')
item.columns = ['item_id', 'item_title', 'release_date', 'video_release_date',
              'IMDb_URL', 'Unknown', 'Action', 'Adventure', 'Animation',
              "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
              'Thriller', 'War', 'Western']
item.head(10)

Unnamed: 0,item_id,item_title,release_date,video_release_date,IMDb_URL,Unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [72]:
from sklearn.model_selection import train_test_split

# Perform a 80/20 train-test split on the interactions in the dataset
train, test = train_test_split(ratings_data, test_size=0.2, random_state=16)
train_df = pd.DataFrame(train, columns=ratings_data.columns)
test_df = pd.DataFrame(test, columns=ratings_data.columns)

In [77]:
class Preprocessor:
    def __init__(self, items, users):
        """
        Initialize nessesary models and objects

        :param items: dataframe of movies
        :param users: dataframe of users
        """
        self.reader = Reader(rating_scale=(1,5))

        # SVD model for additional features
        self.svd = SVD(n_factors=100, biased=True, random_state=15, verbose=True)
        self.users = users
        self.items = items
        self.train_sparse_matrix = None
        self.train_averages = dict()
        self.col_names = ['M', 'F'] + ["occupation_" + str(i + 1) for i in range(len(users['occupation'].unique()))]
        self.encoder = None

    def fit_svd(self, df):
        """
        Fitinf SVD model on train data

        :param df: train dataset
        """
        # Building special form dataset and fit SVD
        train_data_mf = Dataset.load_from_df(df[['user_id',
                                                           'item_id',
                                                           'rating']],
                                                 self.reader)
        trainset = train_data_mf.build_full_trainset()
        self.svd.fit(trainset)
        return trainset

    def get_average_ratings(self, of_users = True):
        """
        Calculating average rating for movies and users using User-Movie matrix

        :param of_user: boolean parameter controls switching between users and items
        :return: average rating
        """
        # Choose axes for users or movies
        ax = 1 if of_users else 0 # 1 - User axes, 0 - Movie axes
        sum_of_ratings = self.train_sparse_matrix.sum(axis = ax).A1
        # Whether a user rated that movie or not
        is_rated = self.train_sparse_matrix != 0
        no_of_ratings = is_rated.sum(axis=ax).A1

        # Maximum number of users and movies
        u, m = self.train_sparse_matrix.shape

        # Create a dictionary of users and their average ratings
        # Zero is in case of movie not presenting in ratings
        average_ratings = {i: sum_of_ratings[i]/no_of_ratings[i] if no_of_ratings[i] !=0 else 0
                        for i in range(u if of_users else m)}
        return average_ratings

    def top_users_rates(self, user, movie, k = 5):
        """
        Get rating for movie from most similar users

        :param user: user id
        :param movie: movie id
        :param k: number of similar users
        :return: rating of the most similar users
        """
        # Find nearest users for our user
        user_sim = cosine_similarity(self.train_sparse_matrix[user],
                                     self.train_sparse_matrix).ravel()
        # Sort by similarity and remove user himself
        # And take rating for this movie by the most similar users
        top_sim_users = user_sim.argsort()[::-1][1:]
        top_ratings = self.train_sparse_matrix[top_sim_users, movie].toarray().ravel()

        # If number of similar users less than k, fill by average for this movie
        top_sim_users_ratings = list(top_ratings[top_ratings != 0][:k])
        top_sim_users_ratings.extend([self.train_averages['movie'][movie]]*(k - len(top_sim_users_ratings)))
        return top_sim_users_ratings

    def top_movie_rates(self, user, movie, k = 5):
        """
        Get rating from user for most similar movies

        :param user: user id
        :param movie: movie id
        :param k: number of similar movies
        :return: rating of the most similar movies
        """
        # Find nearest movies for our movie
        movie_sim = cosine_similarity(self.train_sparse_matrix[:,movie].T,
                                      self.train_sparse_matrix.T).ravel()
        top_sim_movies = movie_sim.argsort()[::-1][1:]
        # Sort by similarity and remove movie himself
        # And take rating for movies most similar to current
        top_ratings = self.train_sparse_matrix[user, top_sim_movies].toarray().ravel()

        # If number of similar movies less than k, fill by average for this user
        top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:k])
        top_sim_movies_ratings.extend([self.train_averages['user'][user]]*(k - len(top_sim_movies_ratings)))
        return top_sim_movies_ratings

    def one_hot_encoding(self, df, encoder = None):
        """
        OneHot encoding

        :param df: dataframe to be encoded
        :param encoder: onehot encoder itself
        :return: encoder and encoded dataframe
        """
        # Converting type of columns to category
        df['gender'] = df['gender'].astype('category')
        df['occupation'] = df['occupation'].astype('category')

        if encoder:
            # For existing encoder
            enc_data = pd.DataFrame(encoder.transform(
            df[['gender', 'occupation']]).toarray(), columns=self.col_names)
        else:
            # Create new encoder and fit it
            encoder = OneHotEncoder()
            enc_data = pd.DataFrame(encoder.fit_transform(
            df[['gender', 'occupation']]).toarray(), columns=self.col_names)

        final_df = df.join(enc_data).drop(['gender', 'occupation'], axis = 1)

        return encoder, final_df

    def best_values(self, df, rated = False, k = 5):
        """
        Additional features based on similar user, similar movie and their average value

        :param df: dataframe to be encoded
        :param rated: was the data rated
        :param k: numer of similar movies/users
        :return: encoded data
        """
        data = df
        for i in tqdm(data.index):
            # Extract user and movie
            user = data.loc[i]['user_id']
            movie = data.loc[i]['item_id']

            # Get rating of most similar users and movies
            top_users_list = self.top_users_rates(user, movie, k = k)
            top_movie_list = self.top_movie_rates(user, movie, k = k)
            movies_columns = ["M" + str(i+1) for i in range(k)]
            users_columns = ["U" + str(i+1) for i in range(k)]

            # Average this values
            if rated:
                UAvg = data.loc[data['user_id'] == user, 'rating'].mean()
                MAvg = data.loc[data['item_id'] == movie, 'rating'].mean()
            else:
                UAvg = np.mean(top_users_list)
                MAvg = np.mean(top_movie_list)

            # Extend initial data
            columns = tuple(users_columns + movies_columns + ['UAvg', 'MAvg'])
            values = top_users_list + top_movie_list + [UAvg, MAvg]
            data.at[i, columns] = values
        return data

    def preprocess(self, df, set_type = 'Train'):
        """
        Preprocessor

        :param df: initial data
        :param set_type: Train/Test/Predict
        :return: preprocessed data
        """
        if set_type == 'Train':
            # Fit SVD, User-Item matrix and OneHot encoder on test data
            trainset = self.fit_svd(df)
            max_users = self.users['user_id'].max()
            max_items = self.items['item_id'].max()
            self.train_sparse_matrix = csr_matrix((df.rating.values,
             (df.user_id.values, df.item_id.values)),
                                 shape = (max_users + 1, max_items + 1))
            self.train_averages['global'] = self.train_sparse_matrix.sum()/self.train_sparse_matrix.count_nonzero()
            self.train_averages['user'] = self.get_average_ratings(of_users = True)
            self.train_averages['movie'] = self.get_average_ratings(of_users = False)

            final_train_data = self.best_values(df, rated = True)
            self.encoder, one_hot_train = self.one_hot_encoding(final_train_data)
            one_hot_train = one_hot_train.drop(['timestamp', 'zip code', 'item_title', 'release_date',
                                       'video_release_date', 'IMDb_URL'],
                                      axis=1)
            # Normalize age and add additional features from SVD predictions
            one_hot_train['age'] = one_hot_train['age']/100
            train_preds = self.svd.test(trainset.build_testset())
            train_pred_mf = np.array([pred.est for pred in train_preds])
            one_hot_train = one_hot_train.join(pd.DataFrame(train_pred_mf, columns=['pred']))
            return one_hot_train

        elif set_type == 'Test':
            # Use OneHot encoder to encode gender and occupation
            final_test = self.best_values(df, rated = True)
            _, one_hot_test = self.one_hot_encoding(final_test, encoder = self.encoder)
            one_hot_test = one_hot_test.drop(['timestamp', 'zip code', 'item_title', 'release_date',
                                       'video_release_date', 'IMDb_URL'],
                                      axis=1)
            # Normalize age and add additional features from SVD predictions
            one_hot_test['age'] = one_hot_test['age']/100

            test_data_mf = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], self.reader)
            testset = test_data_mf.build_full_trainset()
            test_preds = self.svd.test(testset.build_testset())
            test_pred_mf = np.array([pred.est for pred in test_preds])
            one_hot_test = one_hot_test.join(pd.DataFrame(test_pred_mf, columns=['pred']))
            return one_hot_test

        else:
            # Use OneHot encoder to encode gender and occupation
            final_test = self.best_values(df, rated = False)
            _, one_hot_pred = self.one_hot_encoding(final_test, encoder = self.encoder)
            rank = pd.DataFrame([0 for i in range(len(one_hot_pred))], columns=['ranking'])
            one_hot_pred = one_hot_pred.join(pd.DataFrame(rank, columns=['rating']))

            # Normalize age and add additional features from SVD predictions
            test_data_mf = Dataset.load_from_df(one_hot_pred[['user_id', 'item_id', 'rating']], self.reader)
            testset = test_data_mf.build_full_trainset()
            test_preds = self.svd.test(testset.build_testset())
            test_pred_mf = np.array([pred.est for pred in test_preds])
            one_hot_pred = one_hot_pred.join(pd.DataFrame(test_pred_mf, columns=['pred']))
            one_hot_pred['age'] = (one_hot_pred['age']/100).astype('float')

            one_hot_pred = one_hot_pred.drop(['user_id', 'zip code',
                                                        'item_title', 'release_date',
                                                        'video_release_date', 'IMDb_URL',
                                                        'rating'], axis=1)
            return one_hot_pred

In [78]:
preprocessor = Preprocessor(item, user_data)

In [79]:
train_preprocessed = preprocessor.preprocess((pd.merge(train_df, user_data)).merge(item), set_type = "Train")
train_preprocessed.head()

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


100%|██████████| 80000/80000 [18:29<00:00, 72.13it/s]


Unnamed: 0,user_id,item_id,rating,age,Unknown,Action,Adventure,Animation,Children's,Comedy,...,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,occupation_21,pred
0,524,414,4,0.56,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.520836
1,13,414,5,0.47,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.814272
2,456,414,3,0.24,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.083998
3,354,414,4,0.29,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.505869
4,766,414,4,0.42,0,0,0,0,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.612748


In [80]:
test_preprocessed = preprocessor.preprocess((pd.merge(test_df, user_data)).merge(item), set_type = "Test")
test_preprocessed.head()

100%|██████████| 20000/20000 [04:38<00:00, 71.84it/s]


Unnamed: 0,user_id,item_id,rating,age,Unknown,Action,Adventure,Animation,Children's,Comedy,...,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,occupation_21,pred
0,608,275,5,0.22,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.170197
1,672,275,5,0.54,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.487392
2,474,275,3,0.51,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.727305
3,145,275,2,0.31,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.934111
4,864,275,4,0.27,0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.857969


In [81]:
train_preprocessed.to_csv('train_data_preprocessed.csv', index=False)
test_preprocessed.to_csv('test_data_preprocessed.csv', index=False)

In [82]:
with open('preprocessor.pkl', 'wb') as outp:
    pickle.dump(preprocessor, outp, pickle.HIGHEST_PROTOCOL)

# Reference:
[1] - F. M. Harper and J. A. Konstan, “The MovieLens datasets: History and context,” ACM Trans. Interact. Intell. Syst., vol. 5, no. 4, pp. 1–19, 2016. http://dx.doi.org/10.1145/2827872

[2] - F. M. Harper and J. A. Konstan, “The MovieLens datasets: History and context,” ACM Trans. Interact. Intell. Syst., vol. 5, no. 4, pp. 1–19, 2016.

[3] - R. Vidiyala, “How to build a movie recommendation system,” Towards Data Science, 02-Oct-2020. [Online]. Available: https://towardsdatascience.com/how-to-build-a-movie-recommendation-system-67e321339109. [Accessed: 03-Dec-2023].

[4] - P. Aher, “Evaluation metrics for recommendation systems — an overview,” Towards Data Science, 09-Aug-2023. [Online]. Available: https://towardsdatascience.com/evaluation-metrics-for-recommendation-systems-an-overview-71290690ecba. [Accessed: 03-Dec-2023].

[5] - “Recommendation system in python,” GeeksforGeeks, 18-Jul-2021. [Online]. Available: https://www.geeksforgeeks.org/recommendation-system-in-python/. [Accessed: 03-Dec-2023].