In [208]:
import pandas as pd
import numpy as np
from tqdm import tqdm

class CustomKMeans:
    def __init__(self, n_clusters, max_iter=2, tol=1e-4, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.centroids = None
        self.labels = None

    def fit(self, X):
        np.random.seed(self.random_state)
        # Initialize centroids using kmeans++
        self.centroids = self._initialize_centroids(X)

        for _ in range(self.max_iter):
            # Assign each data point to the nearest centroid
            labels = self._assign_labels(X)
            # Update centroids based on the mean of data points in each cluster
            new_centroids = self._update_centroids(X, labels)
            # Check if centroids have converged
            if np.linalg.norm(new_centroids - self.centroids) < self.tol:
                break
            self.centroids = new_centroids
            self.labels = labels

    def predict(self, X):
        # Assign each data point to the nearest centroid
        return self._assign_labels(X)

    def _initialize_centroids(self, X):
        centroids = [X[np.random.choice(X.shape[0])]]
        while len(centroids) < self.n_clusters:
            dist_sq = np.array([min([np.linalg.norm(x - c)**2 for c in centroids]) for x in X])
            probs = dist_sq / dist_sq.sum()
            cumulative_probs = probs.cumsum()
            r = np.random.rand()
            i = np.searchsorted(cumulative_probs, r)
            centroids.append(X[i])
        return np.array(centroids)

    def _assign_labels(self, X):
        # Assign each data point to the nearest centroid
        return np.argmin(np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2), axis=1)

    def _update_centroids(self, X, labels):
        # Update centroids based on the mean of data points in each cluster
        new_centroids = np.array([X[labels == i].mean(axis=0) if np.sum(labels == i) > 0 else self.centroids[i] for i in range(self.n_clusters)])
        return new_centroids

class CustomSVD:
    def __init__(self, A, num_components=None):
        print("Computing SVD...")
        self.U, self.S, self.Vt = self.svd(A, num_components)

    def svd(self, A, num_components=None):
        m, n = A.shape

        # Compute A^T * A
        print("Compute A^T * A")
        ATA = A.T.dot(A)

        # Compute eigenvalues and eigenvectors of A^T * A
        print("Compute eigenvalues and eigenvectors of A^T * A")
        eigenvalues, Vt = self.power_iteration(ATA)

        # Sort eigenvalues in descending order
        print("Sort eigenvalues in descending order")
        idx = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idx]
        Vt = Vt[:, idx]
        # Determine the number of components
        if num_components is None:
            num_components = min(m, n)

        # Truncate singular values and vectors
        singular_values = np.sqrt(eigenvalues)[:num_components]
        U = np.zeros((m, num_components))
        for i in tqdm(range(num_components), desc="Computing U", position=0):
            u = np.dot(A, Vt[:, i]) / singular_values[i]
            for j in range(i):
                u -= np.dot(U[:, j], u) * U[:, j]
            U[:, i] = u / np.linalg.norm(u)

        return U, np.diag(singular_values), Vt.T[:, :num_components]

    def power_iteration(self, A, max_iter=2, tol=1e-6):
        n = A.shape[0]
        eigenvalues = np.zeros(n)
        eigenvectors = np.zeros((n, n))

        for i in tqdm(range(n), desc="power itr", position=0):
            # Set an initial guess for the eigenvector
            x = np.random.rand(n)
            x /= np.linalg.norm(x)

            # Iterative method to find eigenvalues and eigenvectors using Power Iteration
            for _ in range(max_iter):
                x_next = A.dot(x)
                eigenvalue = np.linalg.norm(x_next)
                x_next /= eigenvalue
                # Check for convergence
                if np.linalg.norm(x_next - x) < tol:
                    break

                x = x_next

            # Set the computed eigenvalue and eigenvector
            eigenvalues[i] = eigenvalue
            eigenvectors[:, i] = x

            # Deflate the matrix
            A -= eigenvalue * np.outer(x, x)

        return eigenvalues, eigenvectors


# Load data
print("Loading data...")
movie_data = pd.read_csv('/content/drive/MyDrive/rsdats/movies.csv')
user_data = pd.read_csv('/content/drive/MyDrive/rsdats/users.csv')
rating_data = pd.read_csv('/content/drive/MyDrive/rsdats/ratings.csv')

# Merge rating data with movie data
print("Merging rating data with movie data...")
merged_data = pd.merge(rating_data, movie_data, left_on='MovieID', right_on='ID')

# Merge with user data using UserID
print("Merging with user data using UserID...")
merged_data = pd.merge(merged_data, user_data, on='UserID')

# Pivot table of user ratings with movie titles as columns
print("Creating user-item matrix...")
user_ratings = merged_data.pivot_table(index='UserID', columns='Title', values='Rating').fillna(0)

# Apply KMeans clustering to group similar users
print("Performing KMeans clustering on user ratings...")
kmeans = CustomKMeans(n_clusters=4, random_state=42)
kmeans.fit(user_ratings.values)
cluster_labels = kmeans.predict(user_ratings.values)

# Apply SVD to each cluster separately
print("Applying SVD to each cluster separately...")
clusters = {}
for cluster_id in tqdm(range(4), desc="Cluster", position=0):
    cluster_indices = np.where(cluster_labels == cluster_id)[0]
    cluster_data = user_ratings.values[cluster_indices]
    svd = CustomSVD(cluster_data, num_components=50)
    clusters[cluster_id] = svd

Loading data...
Merging rating data with movie data...
Merging with user data using UserID...
Creating user-item matrix...
Performing KMeans clustering on user ratings...
Applying SVD to each cluster separately...


Cluster:   0%|          | 0/4 [00:00<?, ?it/s]

Computing SVD...
Compute A^T * A
Compute eigenvalues and eigenvectors of A^T * A


power itr:   2%|▏         | 75/3706 [00:12<10:08,  5.97it/s]
Cluster:   0%|          | 0/4 [00:12<?, ?it/s]


KeyboardInterrupt: 

In [37]:
import pickle

# Save clusters dictionary
with open('clusters.pkl', 'wb') as f:
    pickle.dump(clusters, f)

# Save kmeans object
with open('kmeans.pkl', 'wb') as f:
    pickle.dump(kmeans, f)


In [113]:
# Save kmeans object
with open('labels.pkl', 'wb') as f:
    pickle.dump(cluster_labels, f)


In [114]:
# Load clusters dictionary
with open('clusters.pkl', 'rb') as f:
    clusters = pickle.load(f)

# Load kmeans object
with open('kmeans.pkl', 'rb') as f:
    kmeans = pickle.load(f)

# Load kmeans object
with open('labels.pkl', 'rb') as f:
    cluster_labels = pickle.load(f)


In [115]:
# New user ratings
new_user_ratings = {'Toy Story (1995)': 5, 'Jurassic Park (1993)': 4, 'Forrest Gump (1994)': 3}

# Create a DataFrame for the new user
new_user_data = pd.DataFrame(new_user_ratings, index=[0])

# Convert movie titles to columns
new_user_movies = pd.DataFrame(columns=user_ratings.columns)

# Merge new user data with the movie DataFrame
new_user_data_merged = pd.merge(new_user_movies, new_user_data, how='outer').fillna(0)

# Reorder the columns of new_user_data_merged to match the order of user_ratings
new_user_data_reordered = new_user_data_merged[user_ratings.columns]

In [116]:
new_user_df=new_user_data_reordered
new_user_df

Unnamed: 0,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [117]:
# Predict the cluster for the new user
print("Predicting the cluster for the new user...")
new_user_cluster = kmeans.predict(new_user_df.values)

Predicting the cluster for the new user...


In [118]:
new_user_cluster

array([2])

In [191]:
# New user ratings
new_user_ratings = {'Toy Story (1995)': 5, 'Jurassic Park (1993)': 4, 'Forrest Gump (1994)': 3}

# Create DataFrame for the new user
new_user_data = pd.DataFrame([new_user_ratings])

# Convert movie titles to columns
new_user_movies = pd.DataFrame(columns=user_ratings.columns)

# Merge new user data with the movie DataFrame
new_user_data_merged = pd.merge(new_user_movies, new_user_data, how='outer').fillna(0)

# Reorder the columns of new_user_data_merged to match the order of user_ratings
new_user_data_reordered = new_user_data_merged[user_ratings.columns]

# Predict the cluster for the new user
print("Predicting the cluster for the new user...")
new_user_cluster = kmeans.predict(new_user_data_reordered.values)

# Transform the new user data using SVD of the predicted cluster
print("Transforming new user data using SVD...")
new_user_data_reduced = np.dot(new_user_data_reordered.values, clusters[new_user_cluster[0]].Vt)

# Compute the predicted ratings matrix for the new user using the SVD of the predicted cluster
print("Computing predicted ratings for the new user...")
predicted_cluster_svd = clusters[new_user_cluster[0]]
predicted_ratings_intermediate = predicted_cluster_svd.S.dot(predicted_cluster_svd.Vt.T)
# Then multiply U
predicted_ratings = new_user_data_reduced.dot(predicted_ratings_intermediate)
print(predicted_ratings)

# Print the shape of the predicted ratings matrix
print("Shape of predicted ratings matrix:", predicted_ratings.shape)

# Get movie recommendations based on mean ratings of movies in the predicted cluster
print("Generating movie recommendations for the new user...")
recommended_movies = pd.DataFrame(columns=user_ratings.columns)

# Flatten the predicted ratings array
predicted_ratings_flat = predicted_ratings[0]
predicted_ratings_flat

# Get the indices of the top rated movies
top_indices = np.argsort(predicted_ratings_flat)[::-1][:5]
print(top_indices)

# Get the corresponding movies
top_movies = user_ratings.columns[top_indices]

# Get the corresponding ratings
top_ratings = predicted_ratings_flat[top_indices]

# Print the top recommended movies and their ratings
print("Top 5 recommended movies and their ratings:")
for movie, rating in zip(top_movies, top_ratings):
    print(f"{movie}: {rating}")

Predicting the cluster for the new user...
Transforming new user data using SVD...
Computing predicted ratings for the new user...
[[ 1.33869739  0.028569    0.1114276  ... -2.06312809  2.36067247
   3.31718124]]
Shape of predicted ratings matrix: (1, 3706)
Generating movie recommendations for the new user...
[1789  182 3412 1246 2456]
Top 5 recommended movies and their ratings:
Jurassic Park (1993): 9.833108071151507
Anne Frank Remembered (1995): 8.332457472011063
Toy Story 2 (1999): 8.101349821908544
French Connection, The (1971): 8.027699182226197
One Night Stand (1997): 8.025655536980695


In [207]:
import numpy as np

# Scale the predicted ratings to ensure they are between 1 and 5
min_rating = predicted_ratings.min()
max_rating = predicted_ratings.max()
predicted_ratings_scaled = 1 + ((predicted_ratings - min_rating) * 4 / (max_rating - min_rating))

# Round the scaled ratings to the nearest integer value
predicted_ratings_scaled_rounded = np.where(predicted_ratings_scaled - np.floor(predicted_ratings_scaled) >= 0.5, np.ceil(predicted_ratings_scaled), np.floor(predicted_ratings_scaled))

# Print the shape of the predicted ratings matrix
print("Shape of predicted ratings matrix:", predicted_ratings_scaled_rounded.shape)

# Get movie recommendations based on mean ratings of movies in the predicted cluster
print("Generating movie recommendations for the new user...")
recommended_movies = pd.DataFrame(columns=user_ratings.columns)

# Flatten the scaled and rounded predicted ratings array
predicted_ratings_flat_scaled_rounded = predicted_ratings_scaled_rounded[0]

# Get the indices of the top rated movies
top_indices_scaled_rounded = np.argsort(predicted_ratings_flat_scaled_rounded)[::-1][:5]

# Get the corresponding movies and ratings
top_movies_scaled_rounded = user_ratings.columns[top_indices_scaled_rounded]
top_ratings_scaled_rounded = predicted_ratings_flat_scaled_rounded[top_indices_scaled_rounded]

# Print the top recommended movies and their ratings
print("Top 5 recommended movies and their ratings:")
for movie, rating in zip(top_movies_scaled_rounded, top_ratings_scaled_rounded):
    print(f"{movie}: {rating}")


Shape of predicted ratings matrix: (1, 3706)
Generating movie recommendations for the new user...
Top 5 recommended movies and their ratings:
FairyTale: A True Story (1997): 5.0
Small Time Crooks (2000): 5.0
One Night Stand (1997): 5.0
French Connection, The (1971): 5.0
Anne Frank Remembered (1995): 5.0
