In [None]:
!gdown 1X53R4nxZeW6dulZLxu7lCefpAX6V-jFG

Downloading...
From: https://drive.google.com/uc?id=1X53R4nxZeW6dulZLxu7lCefpAX6V-jFG
To: /content/netflix.zip
  0% 0.00/15.9M [00:00<?, ?B/s]100% 15.9M/15.9M [00:00<00:00, 190MB/s]


In [1]:
"""
Implements Collaborative Filtering Algoirithm
"""
from collections import defaultdict
import json


class CollabFilter:
    """
    Implements Collaborative Filtering Algoirithm
    """

    def __init__(self, dataset):
        self.mean_user_ratings = defaultdict(float)
        self.movie_rated_users = defaultdict()

        self.train(dataset)
        self.test(dataset)

    def train(self, dataset):
        """
        Calculates mean votes per user and creates a map of movie to users who have rated that movie
        """
        # Init empty list of users for each movie
        for movie in dataset["movies"].keys():
            self.movie_rated_users[movie] = set()

        for user_id, ratings in dataset["ratings"]["train"].items():
            ratings_count = len(ratings)
            self.mean_user_ratings[user_id] = float(0.0)
            for mv_id, mv_rating in ratings.items():
                # Add user to list of users for this movie mv_id
                self.movie_rated_users[mv_id].add(user_id)
                # Add this movie's rating for mean rating
                self.mean_user_ratings[user_id] += mv_rating
            # Divide by count of ratings to calculate mean rating
            self.mean_user_ratings[user_id] /= ratings_count

    def test(self, dataset):
        """
        Calculates error between predicted ratings and actual ratings
        """
        mean_absolute_error = float(0.0)
        root_mean_squared_error = float(0.0)
        count = 0

        # Calculate weights to access them later, instead of
        # calculating it at run time for every test instance
        test_users = defaultdict()
        for user_id, user_ratings in dataset["ratings"]["test"].items():
            test_users[user_id] = user_ratings.keys()

        self.calculate_weights_for_users(dataset, test_users)

        print("actual_rating\tpredicted_rating")
        print("------------------------------------------------------------------")
        for user_id, user_ratings in dataset["ratings"]["test"].items():
            for mv_id, actual_rating in user_ratings.items():
                predicted_rating = self.predict_user_mv_rating(dataset, user_id, mv_id)
                print(str(actual_rating) + "\t\t\t\t" + str(predicted_rating))

                mean_absolute_error += abs(predicted_rating - actual_rating)
                root_mean_squared_error += pow(abs(predicted_rating - actual_rating), 2)
                count += 1
        print("-------------------------------------------------\n")

        mean_absolute_error /= count
        root_mean_squared_error /= count
        root_mean_squared_error = pow(root_mean_squared_error, 0.5)
        print("Mean absolute error: " + str(mean_absolute_error))
        print("Root mean squared error: " + str(root_mean_squared_error))

    def calculate_weight_for_two_users(self, dataset, user_i, user_j) -> float:
        """
        Finds common movies rated by user_i and user_j,
        Uses it to calculate weight for user_i and user_j
        """
        common_rated_movies = set(dataset["ratings"]["train"][user_i].keys()) & set(
            dataset["ratings"]["train"][user_j].keys()
        )
        if len(common_rated_movies) == 0:
            return float(0.0)
        else:
            numerator = float(0.0)
            denominator1 = float(0.0)
            denominator2 = float(0.0)
            for mv_id in common_rated_movies:
                numerator += (
                    dataset["ratings"]["train"][user_i][mv_id]
                    - self.mean_user_ratings[user_i]
                ) * (
                    dataset["ratings"]["train"][user_j][mv_id]
                    - self.mean_user_ratings[user_j]
                )

                denominator1 += pow(
                    (
                        dataset["ratings"]["train"][user_i][mv_id]
                        - self.mean_user_ratings[user_i]
                    ),
                    2,
                )
                denominator2 += pow(
                    (
                        dataset["ratings"]["train"][user_j][mv_id]
                        - self.mean_user_ratings[user_j]
                    ),
                    2,
                )

            if denominator1 == 0 or denominator2 == 0:
                return float(0.0)
            else:
                return float(numerator / pow(denominator1 * denominator2, 0.5))

    def calculate_weights_for_users(self, dataset, test_users):
        """
        calculates rating weights for all test users using equation 2
        """
        for user_id, rated_movies in test_users.items():
            wt_users = set()
            user_wt = defaultdict()
            for mv_id in rated_movies:
                for mv_user in self.movie_rated_users[mv_id]:
                    if mv_user != user_id:
                        wt_users.add(mv_user)

            for mv_user in wt_users:
                wt = self.calculate_weight_for_two_users(dataset, user_id, mv_user)
                user_wt[mv_user] = wt

            # Normalize the weights
            wt_sum = float(0.0)
            for user_j_wt in user_wt.values():
                wt_sum += user_j_wt

            if wt_sum != 0:
                for user_j in user_wt:
                    user_wt[user_j] /= wt_sum

            # Writing weights to disk since I encountered a Memory error
            with open("./weights/" + user_id + ".json", "w", encoding="utf-8") as f:
                json.dump(user_wt, f)

    def predict_user_mv_rating(self, dataset, active_user_id, mv_id):
        """
        predicts user vote for a movie using equation 1
        """
        # Start with mean rating for active user
        predicted_rating = self.mean_user_ratings[active_user_id]

        # Load the weights from disk
        with open(
            "./weights/" + active_user_id + ".json", "r", encoding="utf-8"
        ) as user_wt_json:
            weights = json.load(user_wt_json)
            # Calculate weighted sum of other users' ratings
            for curr_user_i, curr_user_wt in weights.items():
                # Only count rating if this curr_user has rated movie mv_id
                if (
                    mv_id in dataset["ratings"]["train"][curr_user_i]
                    and curr_user_wt != 0
                ):
                    predicted_rating += curr_user_wt * (
                        dataset["ratings"]["train"][curr_user_i][mv_id]
                        - self.mean_user_ratings[curr_user_i]
                    )

                return predicted_rating


In [2]:
"""
Uses scikit-learn K nearest neighbors Classifier
"""
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


class KNNClassifier:
    """
    Uses scikit-learn K nearest neighbors Classifier to run experiments
    """

    def __init__(self, dataset):
        # Ran experiments and found that Accuracy does NOT change
        # if algorithm or leaf_size is altered, thus choosing algorithm = auto (default)
        self.classifiers = [
            # Changing n_neighbors
            KNeighborsClassifier(n_neighbors=3, weights="uniform", p=2, n_jobs=-1),
            KNeighborsClassifier(n_neighbors=5, weights="uniform", p=2, n_jobs=-1),
            KNeighborsClassifier(n_neighbors=7, weights="uniform", p=2, n_jobs=-1),
            KNeighborsClassifier(n_neighbors=9, weights="uniform", p=2, n_jobs=-1),
            # Changing weights, now closer neighbors have larger weights
            KNeighborsClassifier(n_neighbors=3, weights="distance", p=2, n_jobs=-1),
            KNeighborsClassifier(n_neighbors=5, weights="distance", p=2, n_jobs=-1),
            KNeighborsClassifier(n_neighbors=7, weights="distance", p=2, n_jobs=-1),
            KNeighborsClassifier(n_neighbors=9, weights="distance", p=2, n_jobs=-1),
            # Changing l_p for minkowski distance
            KNeighborsClassifier(n_neighbors=3, weights="distance", p=3, n_jobs=-1),
            KNeighborsClassifier(n_neighbors=5, weights="distance", p=3, n_jobs=-1),
            KNeighborsClassifier(n_neighbors=5, weights="distance", p=4, n_jobs=-1),
            KNeighborsClassifier(n_neighbors=7, weights="distance", p=4, n_jobs=-1),
            KNeighborsClassifier(n_neighbors=9, weights="distance", p=4, n_jobs=-1),
        ]

        self.classifier = ""
        for classifier in self.classifiers:
            self.classifier = classifier
            self.train(dataset)
            self.test(dataset)

    def train(self, dataset):
        """
        Trains the classifier
        """
        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]

        self.classifier.fit(x_train, y_train)

    def test(self, dataset):
        """
        Report Error Metrics on Test Data
        """
        x_test = dataset["test"][:, :-1]
        y_test = dataset["test"][:, -1]

        y_pred = self.classifier.predict(x_test)

        params = self.classifier.get_params()
        print("K-Nearest Neighbors Parameters:")
        for key, value in params.items():
            print(f"{key}: {value}")

        accuracy = accuracy_score(y_test, y_pred)
        print("\nAccuracy: " + str(accuracy))
        error_rate = round(1.0 - accuracy, 4)
        print("Error rate: " + str(error_rate))
        print("-------------------------------------------------\n")


In [3]:
"""
Uses scikit-learn SVM Classifier
"""
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


class SVMClassifier:
    """
    Uses scikit-learnSVM Classifier to run experiments
    """

    def __init__(self, dataset):
        self.classifiers = [
            # Changing Penalty Regularization
            SVC(C=1.0, kernel="rbf", gamma="scale", random_state=1),
            SVC(C=2.0, kernel="rbf", gamma="scale", random_state=1),
            SVC(C=0.5, kernel="rbf", gamma="scale", random_state=1),
            # Changing Kernel
            SVC(C=1.0, kernel="sigmoid", gamma="scale", random_state=1),
            SVC(C=2.0, kernel="sigmoid", gamma="scale", random_state=1),
            SVC(C=1.0, kernel="linear", gamma="scale", random_state=1),
            SVC(C=2.0, kernel="linear", gamma="scale", random_state=1),
            # Using Poly Kernel with different degree values
            SVC(C=1.0, kernel="poly", degree=3, gamma="scale", random_state=1),
            SVC(C=1.0, kernel="poly", degree=4, gamma="scale", random_state=1),
            SVC(C=1.0, kernel="poly", degree=5, gamma="scale", random_state=1),
            # Changing Gamma
            SVC(C=1.0, kernel="rbf", gamma="auto", random_state=1),
            SVC(C=1.0, kernel="linear", gamma="auto", random_state=1),
            SVC(C=1.0, kernel="poly", gamma="auto", random_state=1),
            SVC(C=1.0, kernel="sigmoid", gamma="auto", random_state=1),
        ]

        self.classifier = ""
        for classifier in self.classifiers:
            self.classifier = classifier
            self.train(dataset)
            self.test(dataset)

    def train(self, dataset):
        """
        Trains the classifier
        """
        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]

        self.classifier.fit(x_train, y_train)

    def test(self, dataset):
        """
        Report Error Metrics on Test Data
        """
        x_test = dataset["test"][:, :-1]
        y_test = dataset["test"][:, -1]

        y_pred = self.classifier.predict(x_test)

        params = self.classifier.get_params()
        print("SVC Parameters:")
        for key, value in params.items():
            print(f"{key}: {value}")

        accuracy = accuracy_score(y_test, y_pred)
        print("\nAccuracy: " + str(accuracy))
        error_rate = round(1.0 - accuracy, 4)
        print("Error rate: " + str(error_rate))
        print("-------------------------------------------------\n")


In [4]:
"""
Instantiates and Runs all models
"""

import zipfile
import sys
from os.path import exists
from collections import defaultdict
import numpy as np
from sklearn.datasets import fetch_openml

class AllModels:
    """
    Instantiates and Runs all models
    """

    def __init__(self, models="all"):
        self.netflix_dataset = {
            "movies": {},
            "ratings": {
                "train": defaultdict(dict),
                "test": defaultdict(dict),
            },
        }

        if models == "all" or "collab" in models:
            dataset_path = "./netflix/"
            self.read_netflix_dataset(dataset_path)
            print("Running Collaborative Filtering on Netflix Dataset")
            print("*************************************************\n")
            CollabFilter(self.netflix_dataset)
            print("*************************************************\n")

        mnist_dataset = defaultdict()
        if models == "all" or "svm" in models or "knn" in models:
            # Fetch MNIST dataset for SVM and KNN
            x, y = fetch_openml("mnist_784", version=1, return_X_y=True)
            x = x / 255

            x_train, x_test = x[:60000], x[60000:]
            y_train, y_test = y[:60000], y[60000:]

            mnist_dataset["train"] = np.column_stack((x_train, y_train))
            mnist_dataset["test"] = np.column_stack((x_test, y_test))

        if models == "all" or "svm" in models:
            print("Running SVM Experiments on MNIST Dataset")
            print("*************************************************\n")
            SVMClassifier(mnist_dataset)
            print("*************************************************\n")

        if models == "all" or "knn" in models:
            print("Running K Nearest Neighbors Experiments on MNIST Dataset")
            print("*************************************************\n")
            KNNClassifier(mnist_dataset)
            print("*************************************************\n")

    def read_netflix_dataset(self, dataset_path):
        """
        Extracts and Reads txt files from the Netflix dataset
        """
        if not exists(dataset_path):
            with zipfile.ZipFile("./netflix.zip", "r") as zip_ref:
                zip_ref.extractall(dataset_path)

        movies_dataset = dataset_path + "movie_titles.txt"
        train_ratings = dataset_path + "TrainingRatings.txt"
        test_ratings = dataset_path + "TestingRatings.txt"

        with open(movies_dataset, "r", encoding="latin-1") as mv:
            lines = mv.readlines()
            for line in lines:
                mv_data = line.split(",")
                mv_id = mv_data[0]
                mv_year = mv_data[1]
                mv_name = mv_data[2].strip()
                self.netflix_dataset["movies"][mv_id] = {
                    "year": mv_year,
                    "name": mv_name,
                }

        with open(train_ratings, "r", encoding="utf-8") as trn:
            lines = trn.readlines()
            for line in lines:
                trn_data = line.split(",")
                mv_id = trn_data[0]
                user_id = trn_data[1]
                user_mv_rating = float(trn_data[2].strip())
                self.netflix_dataset["ratings"]["train"][user_id][
                    mv_id
                ] = user_mv_rating

        with open(test_ratings, "r", encoding="utf-8") as tst:
            lines = tst.readlines()
            for line in lines:
                tst_data = line.split(",")
                mv_id = tst_data[0]
                user_id = tst_data[1]
                user_mv_rating = float(tst_data[2].strip())
                self.netflix_dataset["ratings"]["test"][user_id][mv_id] = user_mv_rating


if __name__:
    # Run all models passed as arguments
    if len(sys.argv) >= 2:
        MODELS = " ".join(sys.argv[1:])
        all_models = AllModels(MODELS)
    else:
        print(
            'Argument ("all" or "collab" or "knn" or "svm") needed.',
            file=sys.stderr,
        )


Driver Code to run the above python scripts

In [None]:
AllModels(models="svm")

  warn(


Running SVM Experiments on MNIST Dataset
*************************************************

SVC Parameters:
C: 1.0
break_ties: False
cache_size: 200
class_weight: None
coef0: 0.0
decision_function_shape: ovr
degree: 3
gamma: scale
kernel: rbf
max_iter: -1
probability: False
random_state: 1
shrinking: True
tol: 0.001
verbose: False

Accuracy: 0.9792
Error rate: 0.0208
-------------------------------------------------

SVC Parameters:
C: 2.0
break_ties: False
cache_size: 200
class_weight: None
coef0: 0.0
decision_function_shape: ovr
degree: 3
gamma: scale
kernel: rbf
max_iter: -1
probability: False
random_state: 1
shrinking: True
tol: 0.001
verbose: False

Accuracy: 0.9831
Error rate: 0.0169
-------------------------------------------------

SVC Parameters:
C: 0.5
break_ties: False
cache_size: 200
class_weight: None
coef0: 0.0
decision_function_shape: ovr
degree: 3
gamma: scale
kernel: rbf
max_iter: -1
probability: False
random_state: 1
shrinking: True
tol: 0.001
verbose: False

Accuracy

<__main__.AllModels at 0x7a6f4d6b70d0>

In [5]:
AllModels(models="knn")

  warn(


Running K Nearest Neighbors Experiments on MNIST Dataset
*************************************************

K-Nearest Neighbors Parameters:
algorithm: auto
leaf_size: 30
metric: minkowski
metric_params: None
n_jobs: -1
n_neighbors: 3
p: 2
weights: uniform

Accuracy: 0.9705
Error rate: 0.0295
-------------------------------------------------

K-Nearest Neighbors Parameters:
algorithm: auto
leaf_size: 30
metric: minkowski
metric_params: None
n_jobs: -1
n_neighbors: 5
p: 2
weights: uniform

Accuracy: 0.9688
Error rate: 0.0312
-------------------------------------------------

K-Nearest Neighbors Parameters:
algorithm: auto
leaf_size: 30
metric: minkowski
metric_params: None
n_jobs: -1
n_neighbors: 7
p: 2
weights: uniform

Accuracy: 0.9694
Error rate: 0.0306
-------------------------------------------------

K-Nearest Neighbors Parameters:
algorithm: auto
leaf_size: 30
metric: minkowski
metric_params: None
n_jobs: -1
n_neighbors: 9
p: 2
weights: uniform

Accuracy: 0.9659
Error rate: 0.0341

<__main__.AllModels at 0x7ded25a03f70>

Not Running Collaborative Filtering on Colab because it takes a lot of time (9-10 hours) to complete. I've had problems in previous project with Colab when the jobs take more than 3-4 hours to run.

I ran Collaborative Filtering on my laptop, and I've attached the results below.




Running Collaborative Filtering on Netflix Dataset
*************************************************

Mean absolute error: 0.7902364002686847

Root mean squared error: 0.9887258652823789