## Task 1

In [21]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import lsqr

# Load the data
user_clip_train = pd.read_csv('user_clip.csv').dropna()

# Calculate the average rating
r_avg = user_clip_train['weight'].mean()

# Create mappings for user_id and clip_id to indices
user_ids = pd.Categorical(user_clip_train['user_id'])
user_map = {user: i for i, user in enumerate(user_ids.categories)}
clip_ids = pd.Categorical(user_clip_train['clip_id'])
clip_map = {clip: i for i, clip in enumerate(clip_ids.categories)}

# Extract user and clip indices
user_indices = user_clip_train['user_id'].map(user_map).values
clip_indices = user_clip_train['clip_id'].map(clip_map).values

# Create the sparse design matrix
num_ratings = len(user_clip_train)
num_users = len(user_ids.categories)
num_clips = len(clip_ids.categories)

# Build the sparse matrix using user and clip indices
row_indices = np.arange(num_ratings)
col_indices_users = user_indices
col_indices_clips = clip_indices + num_users

data_users = np.ones(num_ratings)
data_clips = np.ones(num_ratings)

row_indices_combined = np.concatenate([row_indices, row_indices])
col_indices_combined = np.concatenate([col_indices_users, col_indices_clips])
data_combined = np.concatenate([data_users, data_clips])

# Create the sparse matrix
A = csr_matrix((data_combined, (row_indices_combined, col_indices_combined)), shape=(num_ratings, num_users + num_clips))

# Create the target vector by subtracting the average weight from the actual weights
y = np.array(user_clip_train['weight'] - r_avg)

# Solve the linear system to find the biases for users and clips using the least-squares method
b = lsqr(A, y)[0]

# Store the biases in a dictionary
bias_dict = pd.Series(b, index=[f"user_{user_id}" for user_id in user_ids.categories] + [f"clip_{clip_id}" for clip_id in clip_ids.categories])

# Function to predict rank for a single row
def predict_viewtime(row):
    user_bias = bias_dict.get(f"user_{row['user_id']}", 0)
    clip_bias = bias_dict.get(f"clip_{row['clip_id']}", 0)
    return r_avg + user_bias + clip_bias

# Load the test data
test_df = pd.read_csv('test.csv').dropna()

# Predict weights for the test dataset
test_df['weight_pred'] = test_df.apply(predict_viewtime, axis=1)

# Ensure no negative predictions
test_df['weight_pred'] = test_df['weight_pred'].clip(lower=0)

# Save the predictions to a CSV file
test_df[['user_id', 'clip_id', 'weight_pred']].to_csv('319044434_314779166_task1.csv', index=False)

# Function to calculate the objective function (error + regularization)
def f1(df, user_bias, clip_bias):
    error = ((df['prediction'] - df['weight']) ** 2).sum()
    regularization = 0.1 * ((user_bias ** 2).sum() + (clip_bias ** 2).sum())
    return error + regularization

# Calculate predictions for training data
user_clip_train['prediction'] = user_clip_train.apply(predict_viewtime, axis=1)

# Extract user and clip biases from the bias series
user_bias_values = bias_dict.iloc[:num_users].values
clip_bias_values = bias_dict.iloc[num_users:].values

# Calculate the F1 score for training data
f1_score = f1(user_clip_train, user_bias_values, clip_bias_values)
print(f'Loss function for training data: {f1_score}')


Loss function for training data: 935471510070.5972


## Task 2

In [22]:
import numpy as np
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

user_clip = pd.read_csv('user_clip.csv').dropna()
user_clip_matrix = user_clip.pivot(index='user_id', columns='clip_id', values='weight').fillna(0)
user_ids = user_clip_matrix.index
clip_ids = user_clip_matrix.columns

user_clip_matrix = csr_matrix(user_clip_matrix.values)
U, Σ, V_T = svds(user_clip_matrix, k=20)
predicted_user_clip_matrix = U @ np.diag(Σ) @ V_T
predicted_user_clip_matrix = pd.DataFrame(predicted_user_clip_matrix, columns=clip_ids, index=user_ids)

In [23]:
def predict(user_id_test, clip_id_test):
    if user_id_test in predicted_user_clip_matrix.index and clip_id_test in predicted_user_clip_matrix.columns:
        return predicted_user_clip_matrix.loc[user_id_test, clip_id_test]
    else:
        return 0

test_df = pd.read_csv('test.csv').filter(['user_id', 'clip_id']).dropna()
test_df['weight'] = test_df.apply(lambda row: predict(row['user_id'], row['clip_id']) ,axis=1)
test_df.to_csv('319044434_314779166_task2.csv', index=False)

In [24]:
predicted_user_clip = predicted_user_clip_matrix.reset_index().melt(id_vars='user_id', var_name='clip_id', value_name='weight')
predicted_user_clip = predicted_user_clip.rename(columns={'weight': 'prediction'})

In [25]:
def f2(weights, predictions):
    merged_df = weights.merge(predictions, on=['user_id', 'clip_id'])
    sse = ((merged_df['weight'] - merged_df['prediction']) ** 2).sum()
    return sse

print(f2(user_clip, predicted_user_clip))

229268390659.79825
