## Task 1

In [54]:
import pandas as pd

# Load the (user_id, clip_id, weight) interactions, discarding rows with missing values.
user_clip = pd.read_csv('user_clip.csv').dropna()

# Global mean weight; every bias below is expressed relative to it.
r_avg = user_clip['weight'].mean()

# Mean weight per user / per clip, expressed as a deviation from the global mean.
# (Nothing in the cells visible here reads these two series.)
initial_users_bias = user_clip.groupby('user_id')['weight'].mean().sub(r_avg)
initial_clips_bias = user_clip.groupby('clip_id')['weight'].mean().sub(r_avg)

In [55]:
# Initialize parameters
# Distinct ids; an id's position in these arrays is its slot in the bias vectors.
users = user_clip['user_id'].unique()
clips = user_clip['clip_id'].unique()
num_users, num_clips = len(users), len(clips)

# id -> position lookups
user_map = dict(zip(users, range(num_users)))
clip_map = dict(zip(clips, range(num_clips)))

# Parallel arrays with one entry per interaction row of user_clip.
user_indices = user_clip['user_id'].map(user_map).to_numpy()
clip_indices = user_clip['clip_id'].map(clip_map).to_numpy()
train_weights = user_clip['weight'].to_numpy()

In [57]:
import numpy as np
import pandas as pd
import tqdm
from time import time

user_bias = np.zeros(num_users)
clip_bias = np.zeros(num_clips)

learning_rate = 0.01
num_iterations = 500  # not read by the loop below; the stop rule is tolerance + time budget
time_budget_seconds = 1800  # upper bound on training wall-clock time
convergence_tol = 1e-6  # stop once a full pass moves no bias by more than this

# Per-sample gradient descent on the bias-only model r ~ r_avg + b_user + b_clip.
# Each step follows the gradient of error**2 + 0.1 * b**2 with respect to the bias b,
# i.e. -2 * error + 0.2 * b.
# The loop ends as soon as a whole pass over the data leaves every bias (almost)
# unchanged, so the result no longer depends on how many passes happen to fit into
# the time budget on a particular machine. The budget remains as a hard cap.
start = time()
while time() - start < time_budget_seconds:
    # Snapshot so the total movement over this pass can be measured afterwards.
    user_bias_before = user_bias.copy()
    clip_bias_before = clip_bias.copy()

    for u_idx, c_idx, r in zip(user_indices, clip_indices, train_weights):
        # Both updates use the error computed before either bias is changed.
        error = r - r_avg - user_bias[u_idx] - clip_bias[c_idx]
        user_bias[u_idx] -= learning_rate * (-2 * error + 0.2 * user_bias[u_idx])
        clip_bias[c_idx] -= learning_rate * (-2 * error + 0.2 * clip_bias[c_idx])

    # np.max propagates NaN (so a diverged run never looks converged) and
    # `initial=0.0` keeps it defined when there are no users or clips at all.
    bias_changes = np.concatenate((user_bias - user_bias_before, clip_bias - clip_bias_before))
    largest_change = np.max(np.abs(bias_changes), initial=0.0)
    if largest_change < convergence_tol:
        break
print(user_bias)

[-489.57542438 -350.16378199  793.9636895  ... -347.29151232 -344.43436947
  867.03255635]


In [58]:
# Create DataFrames for user and clip biases
users_bias = pd.DataFrame({'user_id': users, 'user_bias': user_bias})
clips_bias = pd.DataFrame({'clip_id': clips, 'clip_bias': clip_bias})

# Remove the columns a previous run of this cell may have added. Without this, a second
# run merges onto a frame that already has `user_bias` / `clip_bias`, pandas suffixes the
# duplicates (`_x` / `_y`) and the column lookups below fail.
user_clip = user_clip.drop(columns=['user_bias', 'clip_bias', 'prediction'], errors='ignore')

# Attach each interaction's user bias and clip bias.
user_clip = user_clip.merge(users_bias, on=['user_id'], how='left')
user_clip = user_clip.merge(clips_bias, on=['clip_id'], how='left')
print(user_clip)

# Bias-only prediction, floored at 0.
user_clip['prediction'] = r_avg + user_clip['user_bias'] + user_clip['clip_bias']
user_clip['prediction'] = user_clip['prediction'].clip(lower=0)
print(user_clip)

def f1(df):
    """Return the regularised squared-error objective for a frame of predictions.

    `df` must provide the columns 'prediction', 'weight', 'user_bias' and 'clip_bias'.
    The result is the sum of squared (prediction - weight) differences plus 0.1 times
    the sum of the squared bias columns.

    Both bias sums run over the rows of `df`, so a bias that appears on several rows
    contributes once per row.
    """
    residual = df['prediction'] - df['weight']
    squared_error = (residual ** 2).sum()
    bias_penalty = (df['user_bias'] ** 2).sum() + (df['clip_bias'] ** 2).sum()
    return squared_error + 0.1 * bias_penalty

# Evaluate f1 on the merged interaction frame and show the value.
print(f1(df=user_clip))

       user_id  clip_id  weight   user_bias   clip_bias
0          145    64135     131 -489.575424  627.017770
1          145    71619     445 -489.575424 -188.407417
2          145    76710      74 -489.575424 -111.415854
3          145    77532     157 -489.575424  317.029923
4          145    98678      67 -489.575424 -255.014235
...        ...      ...     ...         ...         ...
80597   999659   844418      36 -867.202176  -88.060018
80598   999975    43850      74 -742.448001   72.338029
80599   999975   249959      82 -742.448001  243.376635
80600   999975   500176      98 -742.448001  102.903631
80601   999975   627211      73 -742.448001  -53.618399

[80602 rows x 5 columns]
       user_id  clip_id  weight   user_bias   clip_bias  prediction
0          145    64135     131 -489.575424  627.017770  870.754522
1          145    71619     445 -489.575424 -188.407417   55.329335
2          145    76710      74 -489.575424 -111.415854  132.320898
3          145    77532     15

In [59]:
# Prepare the test data for predictions
test_df = pd.read_csv('test.csv').filter(['user_id', 'clip_id']).dropna()
test_df = test_df.merge(users_bias, on='user_id', how='left')
test_df = test_df.merge(clips_bias, on='clip_id', how='left')

# Calculate the predictions in one vectorised pass.
# A bias that is NaN after the left merges means the pair cannot be scored; such pairs
# are predicted as 0. Every other pair gets r_avg + user_bias + clip_bias, floored at 0.
has_both_biases = test_df['user_bias'].notna() & test_df['clip_bias'].notna()
baseline = r_avg + test_df['user_bias'] + test_df['clip_bias']
test_df['prediction'] = baseline.where(has_both_biases, 0).clip(lower=0)

# Save the predictions to CSV
test_df.filter(['user_id', 'clip_id', 'prediction']).rename(columns={'prediction': 'weight'}).to_csv('319044434_314779166_task1.csv', index=False)

# Display the DataFrame
print(test_df)

      user_id  clip_id   user_bias    clip_bias   prediction
0      703680   155818  -90.595789   -76.367212   566.349174
1      801034   674816 -336.434020   264.506940   661.385096
2      975852   181501   -1.052007  -213.965562   518.294607
3      563786   589532 -212.498880  -154.035842   366.777454
4      680731   173540 -194.354470    -6.078988   532.878718
...       ...      ...         ...          ...          ...
2973   888088   154970 -609.967284  -192.378217     0.000000
2974   791263    90870 -562.687802  -241.720756     0.000000
2975   527585   918042 -225.956967  3476.738529  3984.093738
2976   197117   765004 -180.900104  -313.323427   239.088645
2977    38878   225992  125.988739  -140.148731   719.152184

[2978 rows x 5 columns]


## Task 2

In [13]:
import numpy as np
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

# Reload the interactions; rows with missing values are discarded.
user_clip = pd.read_csv('user_clip.csv').dropna()

# Dense user x clip table of weights; pairs without an interaction become 0.
user_clip_matrix = (
    user_clip
    .pivot(index='user_id', columns='clip_id', values='weight')
    .fillna(0)
)
user_ids, clip_ids = user_clip_matrix.index, user_clip_matrix.columns

# From here on `user_clip_matrix` is the sparse (CSR) form of that table.
user_clip_matrix = csr_matrix(user_clip_matrix.values)

# Truncated SVD with 20 singular triplets, multiplied back into a full-size table
# that carries the original user / clip labels.
U, Σ, V_T = svds(user_clip_matrix, k=20)
predicted_user_clip_matrix = pd.DataFrame(
    U @ np.diag(Σ) @ V_T,
    index=user_ids,
    columns=clip_ids,
)

In [14]:
def predict(user_id_test, clip_id_test, predictions=None):
    """Look up the predicted weight for one (user, clip) pair.

    Parameters
    ----------
    user_id_test : row label to look up.
    clip_id_test : column label to look up.
    predictions : optional DataFrame whose index holds user ids and whose columns hold
        clip ids. When omitted, the notebook-level `predicted_user_clip_matrix` is used,
        so existing two-argument calls behave as before.

    Returns
    -------
    The table entry at (user_id_test, clip_id_test), or 0 when either label is absent
    from the table.
    """
    if predictions is None:
        # Fall back to the table built by the SVD cell.
        predictions = predicted_user_clip_matrix
    if user_id_test not in predictions.index or clip_id_test not in predictions.columns:
        return 0
    return predictions.loc[user_id_test, clip_id_test]

# Score every (user_id, clip_id) pair of the test file and write the result.
test_df = pd.read_csv('test.csv').filter(['user_id', 'clip_id']).dropna()

# Walk the two id columns directly instead of DataFrame.apply(axis=1): no per-row
# Series is built, and the result is always a plain list with one value per row.
test_df['weight'] = [
    predict(user_id, clip_id)
    for user_id, clip_id in zip(test_df['user_id'], test_df['clip_id'])
]
test_df.to_csv('319044434_314779166_task2.csv', index=False)

In [15]:
# Long form of the prediction table: one row per (user_id, clip_id) cell,
# with the cell value in the 'prediction' column.
predicted_user_clip = (
    predicted_user_clip_matrix
    .reset_index()
    .melt(id_vars='user_id', var_name='clip_id', value_name='prediction')
)

In [16]:
def f2(weights, predictions):
    """Return the sum of squared errors between observed weights and predictions.

    `weights` needs the columns 'user_id', 'clip_id' and 'weight'; `predictions` needs
    'user_id', 'clip_id' and 'prediction'. The two frames are inner-joined on
    (user_id, clip_id), so only pairs present in both contribute to the sum.
    """
    paired = weights.merge(predictions, how='inner', on=['user_id', 'clip_id'])
    residual = paired['weight'] - paired['prediction']
    return (residual ** 2).sum()

# Evaluate f2 for the observed interactions against the long-form predictions.
print(f2(weights=user_clip, predictions=predicted_user_clip))

229268390659.79822
