## Assignment1: Recommendation Systems

#### Question 1 :

In [106]:
import pandas as pd
import numpy as np

posts_df = pd.read_csv('./csv/Posts.csv')
Tags = pd.read_csv('./csv/Tags.csv')

answers_df = posts_df[posts_df['PostTypeId'] == 2][['Id', 'OwnerUserId', 'ParentId']]
answers_df = answers_df.drop_duplicates(subset=['OwnerUserId','ParentId'])
answers_df['ParentId'] = answers_df['ParentId'].astype(int)

questions_df = posts_df[posts_df['PostTypeId'] == 1][['Id', 'Tags']]

questions_df['Id'] = questions_df['Id'].astype(int)

In [107]:
answerers_table = answers_df.groupby('OwnerUserId').size().reset_index(name='AnswerCount')
top_answerers = answerers_table.sort_values(by='AnswerCount', ascending=False).head(3)

top_tags = Tags[['TagName', 'Count']].sort_values(by='Count', ascending=False).head(3)

print("Top 3 users with the most answers:")
print(top_answerers)

print("\nTop 3 most used tags:")
print(top_tags)

Top 3 users with the most answers:
       OwnerUserId  AnswerCount
3189        9113.0         2838
19912     177980.0         2318
557         1204.0         2042

Top 3 most used tags:
    TagName  Count
259  design   5162
114      c#   4931
37     java   4929


### Question 2:

#### Step1 : First attach corresponding tags for answers by table join with questions table using Parent Id

In [108]:
merged_df = pd.merge(answers_df, questions_df, left_on='ParentId', right_on='Id', suffixes=('_answer', '_question'))

filtered_answers_df = merged_df[['Id_answer', 'OwnerUserId', 'Tags','ParentId']] 
filtered_answers_df = filtered_answers_df.drop_duplicates(['OwnerUserId','ParentId'])
answerer_counts = filtered_answers_df.groupby('OwnerUserId').size()
qualified_answerers = answerer_counts[answerer_counts >= 20].index
print("No. of Qualified Answerers:", len(qualified_answerers))


No. of Qualified Answerers: 1160


&nbsp;
#### Step2 : Filter the answers using the qualified answers ids

In [109]:

filtered_answers = filtered_answers_df[filtered_answers_df['OwnerUserId'].isin(qualified_answerers)]
print("filtered answerers:" , filtered_answers.head() )

filtered answerers:    Id_answer  OwnerUserId                                               Tags  \
0          3         11.0                           |comments|anti-patterns|   
3         13          4.0                           |comments|anti-patterns|   
4         20          6.0                     |productivity|time-management|   
6         23         11.0                     |productivity|time-management|   
8         26         17.0  |business|project-management|development-process|   

   ParentId  
0         1  
3         1  
4         9  
6         9  
8         4  


&nbsp;
#### Step3 :  Filter tags and expand the answers tables by expanding rows for each tag

In [110]:
qualified_tags = Tags[Tags['Count'] >= 20]['Id']
print("Qualified Tags:", len(qualified_tags))

tag_dict = Tags.set_index('TagName')['Id'].to_dict()

tags_expanded = filtered_answers.copy()

tags_expanded['Tags'] = tags_expanded['Tags'].str.split('|').apply(lambda x: x[1:-1])
tags_expanded = tags_expanded.explode('Tags')
tags_expanded['Tags'] = tags_expanded['Tags'].map(tag_dict)
tags_expanded = tags_expanded[tags_expanded['Tags'].isin(qualified_tags)]

Qualified Tags: 974


&nbsp;
#### Step4 : Create Utility matrix from the filtered answers table 

In [111]:
expert_matrix_df = pd.pivot_table(
    tags_expanded, 
    index='OwnerUserId', 
    columns='Tags', 
    aggfunc='size', 
    fill_value=np.nan
)

expert_matrix_df.index = expert_matrix_df.index.astype(int)
expert_matrix_df.columns = expert_matrix_df.columns.astype(int)

all_qualified_tags = pd.Series(qualified_tags, name='Tags')
print("Expert Matrix:", expert_matrix_df)
print("Dimensions of the Expert matrix:", expert_matrix_df.shape)

Expert Matrix: Tags         1     3     4     7     8     9     11    12    13    14    ...  \
OwnerUserId                                                              ...   
4            13.0   NaN   6.0   6.0  61.0  55.0   8.0   3.0   NaN   NaN  ...   
6             NaN   NaN   8.0   NaN   6.0   4.0   1.0   2.0   NaN   NaN  ...   
11            1.0   NaN   1.0   NaN   NaN   1.0   NaN   1.0   NaN   NaN  ...   
14            NaN   NaN   1.0   NaN   1.0   1.0   NaN   1.0   NaN   NaN  ...   
15            1.0   NaN   2.0   1.0   4.0   4.0   1.0   1.0   NaN   NaN  ...   
...           ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
356695        NaN   NaN   NaN   NaN   NaN   1.0   NaN   NaN   NaN   NaN  ...   
366014        NaN   NaN   NaN   NaN   NaN   NaN   1.0   NaN   NaN   NaN  ...   
373864        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
378329        1.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
379622        NaN   NaN  

&nbsp;
#### Step5: convert to numpy matrix

In [112]:
expert_matrix = expert_matrix_df.to_numpy()
print(expert_matrix)

[[13. nan  6. ... nan  1. nan]
 [nan nan  8. ... nan nan nan]
 [ 1. nan  1. ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [ 1. nan nan ... nan  5.  1.]
 [nan nan nan ... nan nan nan]]


### Question 3:

#### Step1: Normalize the utility matrix 

In [113]:
import numpy as np

normalize = lambda x: np.nan if np.isnan(x) else (np.floor(x / 3) if x < 15 else 5)
vectorized_modify_entries = np.vectorize(normalize)
utility_matrix = vectorized_modify_entries(expert_matrix)
utility_matrix_df = expert_matrix_df.map(normalize)

In [114]:
print(utility_matrix)

[[ 4. nan  2. ... nan  0. nan]
 [nan nan  2. ... nan nan nan]
 [ 0. nan  0. ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [ 0. nan nan ... nan  1.  0.]
 [nan nan nan ... nan nan nan]]


In [115]:
sum_utility_matrix = np.nansum(utility_matrix)
highest_row_sum = np.max(np.nansum(utility_matrix, axis=1))
highest_column_sum = np.max(np.nansum(utility_matrix, axis=0))

print("Utility Matrix Metrics:")
print("Summation value of the utility matrix:", sum_utility_matrix)
print("Highest row sum of the utility matrix:", highest_row_sum)
print("Highest column sum of the utility matrix:", highest_column_sum)

Utility Matrix Metrics:
Summation value of the utility matrix: 41180.0
Highest row sum of the utility matrix: 1162.0
Highest column sum of the utility matrix: 1403.0


#### Step2: Create Train and Test matrix

In [116]:
num_users, num_tags = utility_matrix.shape
user_cutoff = int(num_users * 0.85)
tag_cutoff = int(num_tags * 0.85)

train_matrix = utility_matrix.copy()
train_df = utility_matrix_df.copy()


train_matrix[user_cutoff:, tag_cutoff:] = np.nan
train_df.iloc[user_cutoff: ,tag_cutoff:] = np.nan
train_matrix_sum = np.nansum(train_matrix)

test_matrix = utility_matrix[user_cutoff:, tag_cutoff:]
test_df = utility_matrix_df.iloc[user_cutoff: , tag_cutoff:]
sum_test_matrix = np.nansum(test_matrix)

print("Train and Test Matrix Metrics:")
print("Summation value of the train matrix:", train_matrix_sum)
print("Dimension of Test Matrix:",test_matrix.shape)
print("Summation value of the test matrix:", sum_test_matrix)

Train and Test Matrix Metrics:
Summation value of the train matrix: 40538.0
Dimension of Test Matrix: (174, 146)
Summation value of the test matrix: 642.0


&nbsp;
### Question 4:

#### Tag-Tag Recommendation System

#### Step-1: Calculate Similarity matrix for Tag-Tag

In [117]:
import numpy as np
import pandas as pd

data = utility_matrix[:user_cutoff]

df = pd.DataFrame(data)

df_centered = df.sub(df.mean(axis=1), axis=0)

similarity_matrix = df_centered.corr(method='pearson')


Step-2: Prediction function

In [118]:
def predict_rating(user_id, item_id, ratings, similarity_matrix, N , type):
    
    item_similarities = np.array(similarity_matrix[item_id][:tag_cutoff])
    user_ratings = np.array(ratings[user_id])
    similar_items = np.argsort(item_similarities)[::-1]

    start = 0
    while start < len(similar_items):
        if np.isnan(item_similarities[similar_items[start]]): start += 1
        else : break
    
    non_nan_similar_items = similar_items[start:]

    rated_items = [item for item in non_nan_similar_items 
                         if not np.isnan(ratings[user_id][item])]
    top_n_rated_items = rated_items[:N]  

    if len(top_n_rated_items) < N:
        return np.nanmean(user_ratings)
    
    top_n_ratings = user_ratings[top_n_rated_items]
    top_n_similarities = item_similarities[top_n_rated_items]
    
    if type == "A":
     return np.sum(top_n_ratings) / N

    weighted_ratings_sum = np.dot(top_n_ratings, top_n_similarities)

    similarity_sum = np.sum(top_n_similarities)
    if similarity_sum != 0 : 
        predicted_rating = weighted_ratings_sum / similarity_sum
    else : 
        predicted_rating = 0
    
    return predicted_rating

#### Step-3: Predict and Calculate loss

##### Simple Average Prediction

In [119]:
print("Simple Average")
for k in {2,3,5}:
    loss = 0
    cnt = 0
    for x in range(user_cutoff,num_users):
        for i in range(tag_cutoff,num_tags):
            if np.isnan(utility_matrix[x][i]): continue
            else : 
                true_value = utility_matrix[x][i]
                ans = predict_rating(x,i,utility_matrix,similarity_matrix,k,"A")
                if(np.isnan(ans)) : 
                    print(x,i,'Hello')
                loss = loss + (true_value - ans)*(true_value - ans)
                cnt += 1
    print("Loss with k =",k,":",np.sqrt(loss / cnt))

Simple Average
Loss with k = 2 : 0.8368331915377201
Loss with k = 3 : 0.8068537836476033
Loss with k = 5 : 0.7667057566908195


##### Weighted Average Prediction

In [120]:
print("Weighted Average")
for k in {2,3,5}:
    loss = 0
    cnt = 0
    for x in range(user_cutoff,num_users):
        for i in range(tag_cutoff,num_tags):
            if np.isnan(utility_matrix[x][i]): continue
            else : 
                true_value = utility_matrix[x][i]
                ans = predict_rating(x,i,utility_matrix,similarity_matrix,k,"B")
                if(np.isnan(ans)) : 
                    print(x,i,'Hello')
                loss = loss + (true_value - ans)*(true_value - ans)
                cnt += 1
    print("Loss with k =",k,":",np.sqrt(loss / cnt))

Weighted Average
Loss with k = 2 : 0.8368997316823331
Loss with k = 3 : 0.8066557194070604
Loss with k = 5 : 0.768163144485662


#### User-User Recommendation System

#### Calculate Similarity Matrix

In [121]:
import numpy as np
import pandas as pd

data = utility_matrix[:, :tag_cutoff]

df = pd.DataFrame(data)
df_centered = df.sub(df.mean(axis=0), axis=1)

similarity_matrix = df_centered.T.corr(method='pearson')

#### Predict Function with Simple Average

In [122]:
def predict_rating(user_id, item_id, ratings, similarity_matrix, N, type, user_cutoff):
    user_similarities = np.array(similarity_matrix[user_id][:user_cutoff])
    item_ratings = np.array(ratings[:,item_id])
    similar_users = np.argsort(user_similarities)[::-1]
    similar_users = similar_users[similar_users != user_id]

    start = 0
    while start < len(similar_users):
        if np.isnan(user_similarities[similar_users[start]]):
            start += 1
        else:
            break

    non_nan_similar_users = similar_users[start:]

    rated_users = [user for user in non_nan_similar_users if not np.isnan(ratings[user, item_id])]

    top_n_rated_users = rated_users[:N]

    if len(top_n_rated_users) < N:
        return np.nanmean(item_ratings)

    top_n_ratings = item_ratings[top_n_rated_users]
    top_n_similarities = user_similarities[top_n_rated_users]

    if type == "A":
        return np.sum(top_n_ratings)/N

    weighted_ratings_sum = np.dot(top_n_ratings, top_n_similarities)

    similarity_sum = np.sum(top_n_similarities)

    if similarity_sum != 0:
        predicted_rating = weighted_ratings_sum / similarity_sum
    else:
        predicted_rating = 0

    return predicted_rating

#### Simple Average

In [123]:
print("Simple Average")
for k in {2, 3, 5}:
    loss = 0
    cnt = 0
    for i in range(tag_cutoff, num_tags):
     for x in range(user_cutoff, num_users):
            if np.isnan(utility_matrix[x][i]):
                continue
            else:
                true_value = utility_matrix[x][i]
                ans = predict_rating(x, i, utility_matrix, similarity_matrix, k, "A", user_cutoff)
                
                if np.isnan(ans): 
                    print(x, i, 'Hello')
                    continue
                
                loss += (true_value - ans) ** 2
                cnt += 1
    
    print(f"Loss with k = {k}: {np.sqrt(loss / cnt)},{cnt}")


Simple Average
Loss with k = 2: 0.7006938793917289,2243
Loss with k = 3: 0.690393644492957,2243
Loss with k = 5: 0.6769631382163931,2243


#### Weighted Average

In [124]:
print("Weighted Average")
for k in {2, 3, 5}: 
    loss = 0
    cnt = 0
    for i in range(tag_cutoff, num_tags):
     for x in range(user_cutoff, num_users):  
            if np.isnan(utility_matrix[x][i]):
                continue
            else:
                true_value = utility_matrix[x][i]
                ans = predict_rating(x, i, utility_matrix, similarity_matrix, k, "B", user_cutoff)
                
                if np.isnan(ans): 
                    continue
                
                loss += (true_value - ans) ** 2
                cnt += 1

    print(f"Loss with k = {k}: {np.sqrt(loss / cnt)}")


Weighted Average
Loss with k = 2: 1.0257074115306568
Loss with k = 3: 0.7457370435666525
Loss with k = 5: 0.6830546662035217


&nbsp;
### Question 5:

In [90]:
import numpy as np
from sklearn.metrics import mean_squared_error

class MatrixFactorizationSGD:
    def __init__(self, R, K, alpha=0.0005, epochs=10, lambda1=0, lambda2=0):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K 
        self.alpha = alpha
        self.epochs = epochs
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.P = np.random.normal(scale=1./K, size=(self.num_users, K))
        self.Q = np.random.normal(scale=1./K, size=(self.num_items, K))
        # self.P = np.random.rand(self.num_users, K)  # User-feature matrix
        # self.Q = np.random.rand(self.num_items, K)  # Item-feature matrix

    def train(self):
        for epoch in range(self.epochs):
            for i in range(self.num_users):
                for j in range(self.num_items):
                    if not np.isnan(self.R[i][j]):
                        eij = self.R[i][j] - np.dot(self.P[i, :], self.Q[j, :])
                        self.P[i][:] += self.alpha * (2 * eij * self.Q[j][:] - 2 * self.lambda1 * self.P[i][:])
                        self.Q[j][:] += self.alpha * (2 * eij * self.P[i][:] - 2 * self.lambda2 * self.Q[j][:])
            
            loss = self.compute_loss()
            print(f"Epoch {epoch + 1}/{self.epochs}: Loss = {loss}")
        
        return self.P, self.Q

    def compute_loss(self):
        predicted_R = np.dot(self.P, self.Q.T)
        loss = 0
        num_non_nan_entries = 0

        for i in range(self.num_users):
            for j in range(self.num_items):
                if not np.isnan(self.R[i][j]):
                    error = self.R[i][j] - predicted_R[i][j]
                    loss += error ** 2
                    num_non_nan_entries += 1
        
        loss += self.lambda1 * np.sum(self.P ** 2)
        loss += self.lambda2 * np.sum(self.Q ** 2)
        loss /= num_non_nan_entries

        return loss

    def predict(self, test_matrix):
        predicted_R = np.dot(self.P, self.Q.T)
        predicted_R_test = np.copy(test_matrix)
        
        for i in range(test_matrix.shape[0]):
            for j in range(test_matrix.shape[1]):
                if test_matrix[i][j] is not None:
                    predicted_R_test[i][j] = predicted_R[i+user_cutoff][j+tag_cutoff]

        return predicted_R_test
    
    def rmse(self, predicted_R, matrix):
        mask = ~np.isnan(matrix)
        actual_ratings = matrix[mask]
        predicted_ratings = predicted_R[mask]
        return np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

In [92]:
R_train = train_matrix
R_test = test_matrix

latent_factors = [2, 5, 10]
alpha = 0.0005
epochs = 50
lambda1List = [0, 0.001, 0.05, 0.5]
lambda2List = [0, 0.003, 0.05, 0.75]

for K in latent_factors:
    for lambda1, lambda2 in zip(lambda1List, lambda2List):
        mf_reg = MatrixFactorizationSGD(R_train, K, alpha, epochs, lambda1=lambda1, lambda2=lambda2)
        P_reg, Q_reg = mf_reg.train()

        predicted_R_train = np.dot(P_reg, Q_reg.T)
        train_rmse = mf_reg.rmse(predicted_R_train, R_train)

        predicted_R_reg = mf_reg.predict(R_test)
        rmse_reg = mf_reg.rmse(predicted_R_reg, R_test)

        print(f"Latent Factor: {K}")
        print(f"RMSE on train data with regularization (lambda1={lambda1}, lambda2={lambda2}): {train_rmse}")
        print(f"RMSE on test data with regularization (lambda1={lambda1}, lambda2={lambda2}): {rmse_reg}")

Epoch 1/50: Loss = 1.016373251602643
Epoch 2/50: Loss = 0.9959827107035324
Epoch 3/50: Loss = 0.9812743307650574
Epoch 4/50: Loss = 0.9699504081844296
Epoch 5/50: Loss = 0.9606303630029334
Epoch 6/50: Loss = 0.9523923222336005
Epoch 7/50: Loss = 0.9445408949322069
Epoch 8/50: Loss = 0.9364774751183278
Epoch 9/50: Loss = 0.9276209662909884
Epoch 10/50: Loss = 0.9173573772682662
Epoch 11/50: Loss = 0.9050124956682538
Epoch 12/50: Loss = 0.8898523059464707
Epoch 13/50: Loss = 0.8711237273971936
Epoch 14/50: Loss = 0.8481517474830981
Epoch 15/50: Loss = 0.8205017214663773
Epoch 16/50: Loss = 0.7881888500942859
Epoch 17/50: Loss = 0.7518694301422807
Epoch 18/50: Loss = 0.7129035251721156
Epoch 19/50: Loss = 0.67318818153183
Epoch 20/50: Loss = 0.6347635706001945
Epoch 21/50: Loss = 0.5993447499986889
Epoch 22/50: Loss = 0.5679979684180884
Epoch 23/50: Loss = 0.5410850464560236
Epoch 24/50: Loss = 0.5184283452623974
Epoch 25/50: Loss = 0.4995531277747763
Epoch 26/50: Loss = 0.483889037434800

KeyboardInterrupt: 

In [84]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, KNNBaseline
from surprise.model_selection import train_test_split
from surprise import accuracy

In [85]:
def matrix_to_surprise_df(matrix):
    df = matrix.stack().reset_index()
    df.columns = ['user', 'item', 'rating']
    return df[pd.notna(df['rating'])]

In [86]:
def surprise_knn(train_matrix, test_matrix, sim_options, N_values):
    train_df = matrix_to_surprise_df(train_matrix)
    print(train_df)
    test_df = matrix_to_surprise_df(test_matrix)
    print(test_df)
    
    reader = Reader(rating_scale=(0, max(train_matrix.max().max(), test_matrix.max().max())))
    train_data = Dataset.load_from_df(train_df[['user', 'item', 'rating']], reader)
    trainset = train_data.build_full_trainset()
    testset = list(test_df.itertuples(index=False, name=None))

    rmse_results = {}
    
    for N in N_values:
        sim_options['k'] = N
        algo = KNNBaseline(k=N, sim_options=sim_options)
        algo.fit(trainset)
        predictions = algo.test(testset)
        rmse = accuracy.rmse(predictions, verbose=False)
        rmse_results[N] = rmse

    return rmse_results

In [87]:
sim_options_item_item = {'name': 'pearson_baseline', 'user_based': False}  # Item-Item
sim_options_user_user = {'name': 'pearson_baseline', 'user_based': True}   # User-User
N_values = [2, 3, 5]

In [88]:
rmse_item_item_surprise = surprise_knn(train_df, test_df, sim_options_item_item, N_values)

          user  item  rating
0            4     1     4.0
1            4     4     2.0
2            4     7     2.0
3            4     8     5.0
4            4     9     5.0
...        ...   ...     ...
118222  379622  2845     0.0
118223  379622  2876     0.0
118224  379622  3018     0.0
118225  379622  3199     0.0
118226  379622  3242     0.0

[118227 rows x 3 columns]
        user  item  rating
0     121035  3393     0.0
1     121035  3586     0.0
2     121035  3631     0.0
3     121035  3757     0.0
4     121035  3761     0.0
...      ...   ...     ...
2238  379622  4094     0.0
2239  379622  4095     0.0
2240  379622  4164     0.0
2241  379622  4206     0.0
2242  379622  4533     0.0

[2243 rows x 3 columns]
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing t

In [89]:
print("Surprise Library RMSE Results (Tag-Tag):")
for N in rmse_item_item_surprise:
    print(f"N={N}: RMSE={rmse_item_item_surprise[N]}")

Surprise Library RMSE Results (Tag-Tag):
N=2: RMSE=0.7669281403985956
N=3: RMSE=0.7275695777976227
N=5: RMSE=0.694168988415908
