In [1]:
import pandas as pd

In [7]:
import pandas as pd
import numpy as np
import math
import re
import os
import time
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset
from surprise.prediction_algorithms.matrix_factorization import SVD, NMF
from surprise.model_selection import cross_validate


def main1(data_dir: str = "/Users/viraj/Documents/GitHub/MIPS/data") -> None:
    """Clean part 1 of the Netflix Prize ratings and fit a Surprise SVD model.

    Steps performed:
      1. Read ``netflix_price_dataset/combined_data_1.txt`` under ``data_dir``.
         Movie header lines (``"<id>:"``) have no rating and are used only to
         label the rating rows that follow them.
      2. Drop movies and customers whose review count is below the 70th
         percentile of their respective count distributions.
      3. Write the trimmed frame to ``./netflix_cleaned1.csv``.
      4. Fit ``SVD`` on the full trainset and save factors, biases and the
         global mean as ``.npy`` files in the current working directory.

    Args:
        data_dir: Directory that contains the ``netflix_price_dataset`` folder.
            Defaults to the path the notebook was originally run with.

    Raises:
        ValueError: If the file does not start with a movie header row, so the
            rating rows cannot be attributed to a movie.
    """
    # Follow the codes from Kaggle, see https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data
    filename = os.path.join(data_dir, "netflix_price_dataset/combined_data_1.txt")
    df = pd.read_csv(filename, header=None, names=['Cust_Id', 'Rating'], usecols=[0, 1])

    df['Rating'] = df['Rating'].astype(float)

    print('Dataset 1 shape: {}'.format(df.shape))
    print('-Dataset examples-')
    print(df.iloc[:100, :])

    # Rows without a rating are the movie headers; each one opens a new movie block.
    is_header = df['Rating'].isnull()
    if df.empty or not is_header.iloc[0]:
        raise ValueError("Expected the ratings file to start with a movie header row")

    # The running count of headers seen so far is the 1-based movie number of
    # every row. This replaces the per-movie np.append loop, which re-copied
    # the whole growing array on each iteration.
    movie_ids = is_header.cumsum()
    is_rating = ~is_header
    movie_np = movie_ids[is_rating].to_numpy()

    print('Movie numpy: {}'.format(movie_np))
    print('Length: {}'.format(len(movie_np)))

    # Build the cleaned frame in one step (header rows removed, ids as ints)
    # rather than assigning columns onto a filtered slice of the raw frame.
    df = pd.DataFrame({
        'Cust_Id': df.loc[is_rating, 'Cust_Id'].astype(int),
        'Rating': df.loc[is_rating, 'Rating'],
        'Movie_Id': movie_ids[is_rating].astype(int),
    })
    print('-Dataset examples-')
    print(df.iloc[::5000000, :])

    f = ['count', 'mean']

    # Movies reviewed fewer times than the 70th-percentile count are dropped.
    df_movie_summary = df.groupby('Movie_Id')['Rating'].agg(f)
    df_movie_summary.index = df_movie_summary.index.map(int)
    movie_benchmark = round(df_movie_summary['count'].quantile(0.7), 0)
    drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

    print('Movie minimum times of review: {}'.format(movie_benchmark))

    # Same rule for customers.
    df_cust_summary = df.groupby('Cust_Id')['Rating'].agg(f)
    df_cust_summary.index = df_cust_summary.index.map(int)
    cust_benchmark = round(df_cust_summary['count'].quantile(0.7), 0)
    drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

    print('Customer minimum times of review: {}'.format(cust_benchmark))

    print('Original Shape: {}'.format(df.shape))
    df = df[~df['Movie_Id'].isin(drop_movie_list)]
    df = df[~df['Cust_Id'].isin(drop_cust_list)]
    print('After Trim Shape: {}'.format(df.shape))
    print('-Data Examples-')
    print(df.iloc[::5000000, :])
    df.to_csv("./netflix_cleaned1.csv")

    # Fit SVD model and store arrays.
    # The in-memory frame is used directly: re-reading the CSV that was just
    # written would yield the same three columns at the cost of a second parse.
    # Movie_Id is passed in the first column, so the model's pu/bu arrays are
    # the ones saved under the "Movie_*" names and qi/bi under "Customer_*".
    reader = Reader()
    data = Dataset.load_from_df(df[['Movie_Id', 'Cust_Id', 'Rating']][:], reader)
    trainset = data.build_full_trainset()
    model = SVD(verbose=True)

    print("start fitting")
    model.fit(trainset=trainset)

    cur_dir = os.getcwd()
    np.save(os.path.join(cur_dir, "Movie_factors_15_new.npy"), model.pu)
    np.save(os.path.join(cur_dir, "Movie_biases_15_new.npy"), model.bu)
    np.save(os.path.join(cur_dir, "Customer_factors_15_new.npy"), model.qi)
    np.save(os.path.join(cur_dir, "Customer_biases_15_new.npy"), model.bi)
    np.save(os.path.join(cur_dir, "netflix_global_mean.npy"), trainset.global_mean)
    print("end fitting")


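# NOTE: commented-out draft of main2/preprocess_netflix, kept for reference only; the live main2 is defined in a later cell.
# def main2(filename: str = "Movie_ratings"):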
#     cur_dir = os.path.dirname(__file__)
#     movie_factors = np.load(os.path.join(cur_dir, "Movie_factors_15_new.npy"))
#     customer_factors = np.load(os.path.join(cur_dir, "Customer_factors_15_new.npy"))
#     movie_biases = np.load(os.path.join(cur_dir, "Movie_biases_15_new.npy"))
#     customer_biases = np.load(os.path.join(cur_dir, "Customer_biases_15_new.npy"))
#     global_mean = np.load(os.path.join(cur_dir, "netflix_global_mean.npy"))

#     data = movie_factors @ (customer_factors.transpose())
#     data += np.expand_dims(movie_biases, axis=1)
#     data += + np.expand_dims(customer_biases, axis=0)
#     data += global_mean

#     filename = os.path.join(cur_dir, f"{filename}.npy")
#     np.save(filename, data)
#     print("Store Movie ratings matrix.")

# def preprocess_netflix():
#     main1()
#     main2()


# Run the full preprocessing + SVD fit: reads the raw ratings file, writes
# netflix_cleaned1.csv and the factor/bias .npy files.
main1()
# main2() is invoked from its own cell further down.

Dataset 1 shape: (24058263, 2)
-Dataset examples-
    Cust_Id  Rating
0        1:     NaN
1   1488844     3.0
2    822109     5.0
3    885013     4.0
4     30878     4.0
..      ...     ...
95  1245406     4.0
96  1834590     3.0
97   593225     3.0
98  1011918     4.0
99  1665054     4.0

[100 rows x 2 columns]
Movie numpy: [1.000e+00 1.000e+00 1.000e+00 ... 4.499e+03 4.499e+03 4.499e+03]
Length: 24053764
-Dataset examples-
          Cust_Id  Rating  Movie_Id
1         1488844     3.0         1
5000996    501954     2.0       996
10001962   404654     5.0      1962
15002876   886608     2.0      2876
20003825  1193835     2.0      3825
Movie minimum times of review: 1799.0
Customer minimum times of review: 52.0
Original Shape: (24053764, 3)
After Trim Shape: (17337458, 3)
-Data Examples-
          Cust_Id  Rating  Movie_Id
696        712664     5.0         3
6932490   1299309     5.0      1384
13860273   400155     3.0      2660
20766530   466962     4.0      3923
start fitting
Proces

In [10]:
def main2(filename: str = "movie_ratings"):
    """Assemble the dense predicted-rating matrix from the saved SVD arrays.

    Loads the movie/customer factor and bias arrays and the global mean from
    ``.npy`` files in the current working directory, computes
    ``movie_factors @ customer_factors.T`` and adds, in place, the movie
    biases along axis 1, the customer biases along axis 0 and the global mean.
    The result is saved as ``<filename>.npy`` in the same directory.

    Args:
        filename: Base name (without extension) of the output ``.npy`` file.
    """
    work_dir = os.getcwd()

    def _load(name):
        # Every input lives next to the output, in the working directory.
        return np.load(os.path.join(work_dir, name))

    movie_vecs = _load("Movie_factors_15_new.npy")
    customer_vecs = _load("Customer_factors_15_new.npy")
    movie_offsets = _load("Movie_biases_15_new.npy")
    customer_offsets = _load("Customer_biases_15_new.npy")
    mu = _load("netflix_global_mean.npy")

    # Interaction term first, then the three additive terms accumulated in
    # place so no second full-size matrix is allocated.
    ratings = np.matmul(movie_vecs, customer_vecs.T)
    np.add(ratings, movie_offsets[:, np.newaxis], out=ratings)
    np.add(ratings, customer_offsets[np.newaxis, ...], out=ratings)
    np.add(ratings, mu, out=ratings)

    np.save(os.path.join(work_dir, f"{filename}.npy"), ratings)
    print("Store Movie ratings matrix.")

In [11]:
# Rebuild the dense predicted-rating matrix from the saved SVD arrays and
# write it to movie_ratings.npy in the current working directory.
main2()

Store Movie ratings matrix.


In [13]:
# Load the matrix that main2() wrote to the working directory. Using the same
# location main2() saves to (instead of a machine-specific absolute path)
# keeps the notebook runnable from any checkout.
mrs = np.load(os.path.join(os.getcwd(), "movie_ratings.npy"))
mrs.shape

(1350, 143458)

In [14]:
# Quick sanity check of the predicted ratings; NumPy abbreviates large arrays
# when printing, so only the corner values are shown.
print(mrs)

[[4.18893675 3.93804694 2.97514296 ... 3.18208193 4.36759985 3.24433438]
 [3.17614068 3.37901069 2.59002104 ... 3.75228953 4.03185533 3.03243201]
 [3.2751007  2.71377304 2.4602786  ... 3.41542816 4.03894965 2.72822557]
 ...
 [2.36836285 2.23062555 2.04701642 ... 2.30244241 3.20041651 2.17102392]
 [3.34384158 3.10011515 2.33264784 ... 3.27898624 3.83039137 2.69163333]
 [4.21737977 3.81649531 3.04032904 ... 3.68071161 4.49925659 3.3437908 ]]
