## Assignment1: Recommendation Systems

#### Question 1 :

In [152]:
import pandas as pd
import numpy as np

# Replace 'file_path.csv' with the actual path to your CSV file
posts_df = pd.read_csv('./csv/Posts.csv')
Tags = pd.read_csv('./csv/Tags.csv')

answers_df = posts_df[posts_df['PostTypeId'] == 2][['Id', 'OwnerUserId', 'ParentId']]
answers_df['ParentId'] = answers_df['ParentId'].astype(int)

# Step 4: Extract the questions with their tags (where 'PostTypeId == 1')
questions_df = posts_df[posts_df['PostTypeId'] == 1][['Id', 'Tags']]

# Step 5: Ensure the question Ids are also of type int64
questions_df['Id'] = questions_df['Id'].astype(int)

In [153]:
answerers_table = answers_df.groupby('OwnerUserId').size().reset_index(name='AnswerCount')
top_answerers = answerers_table.sort_values(by='AnswerCount', ascending=False).head(3)

# Group tags to find the tags with the highest count
top_tags = Tags[['TagName', 'Count']].sort_values(by='Count', ascending=False).head(3)


# Print the results
print("Top 3 users with the most answers:")
print(top_answerers)

print("\nTop 3 most used tags:")
print(top_tags)

Top 3 users with the most answers:
       OwnerUserId  AnswerCount
3189        9113.0         2839
19912     177980.0         2326
557         1204.0         2043

Top 3 most used tags:
    TagName  Count
259  design   5162
114      c#   4931
37     java   4929


### Question 2:

#### Step1 : First attach corresponding tags for answers by table join with questions table using Parent Id

In [154]:
# Step 3: Merge answers with the corresponding tags from the question (use ParentId to match question Id)
merged_df = pd.merge(answers_df, questions_df, left_on='ParentId', right_on='Id', suffixes=('_answer', '_question'))

# Step 4: Select relevant columns
filtered_answers_df = merged_df[['Id_answer', 'OwnerUserId', 'Tags']]  # 'Tags' here are from the question
answerer_counts = filtered_answers_df.groupby('OwnerUserId').size()
qualified_answerers = answerer_counts[answerer_counts >= 20].index
print("Qualified Answerers:", qualified_answerers[1:5])


Qualified Answerers: Index([6.0, 11.0, 14.0, 15.0], dtype='float64', name='OwnerUserId')


&nbsp;
#### Step2 : Filter the answers using the qualified answers ids

In [155]:

filtered_answers = filtered_answers_df[filtered_answers_df['OwnerUserId'].isin(qualified_answerers)]
print("filtered answrers:" , filtered_answers.head() )

filtered answrers:    Id_answer  OwnerUserId                                               Tags
0          3         11.0                           |comments|anti-patterns|
3         13          4.0                           |comments|anti-patterns|
4         20          6.0                     |productivity|time-management|
6         23         11.0                     |productivity|time-management|
8         26         17.0  |business|project-management|development-process|


&nbsp;
#### Step3 :  Filter tags and expand the answers tables by expanding rows for each tag

In [156]:
qualified_tags = Tags[Tags['Count'] >= 20]['Id']
print("Qualified Tags:", len(qualified_tags))

tag_dict = Tags.set_index('TagName')['Id'].to_dict()

tags_expanded = filtered_answers.copy()
tags_expanded['Tags'] = tags_expanded['Tags'].str.split('|').apply(lambda x: x[1:-1])
tags_expanded = tags_expanded.explode('Tags')
tags_expanded['Tags'] = tags_expanded['Tags'].map(tag_dict)
tags_expanded = tags_expanded[tags_expanded['Tags'].isin(qualified_tags)]

Qualified Tags: 974


&nbsp;
#### Step4 : Create Utility matrix from the filtered answers table 

In [157]:
expert_matrix = pd.pivot_table(
    tags_expanded, 
    index='OwnerUserId', 
    columns='Tags', 
    aggfunc='size', 
    fill_value=np.nan
)

all_qualified_tags = pd.Series(qualified_tags, name='Tags')
expert_matrix = expert_matrix.reindex(columns=all_qualified_tags, fill_value=np.nan)
print("Expert Matrix:", expert_matrix)
# dimensions = utility_matrix_sorted.shape
print("Dimensions of the Expert matrix:", expert_matrix.shape)

Expert Matrix: Tags         1     3     4     7     8     9     11    12    13    14    ...  \
OwnerUserId                                                              ...   
4.0          13.0   NaN   6.0   6.0  61.0  55.0   8.0   3.0   NaN   NaN  ...   
6.0           NaN   NaN   8.0   NaN   6.0   4.0   1.0   2.0   NaN   NaN  ...   
11.0          1.0   NaN   1.0   NaN   NaN   1.0   NaN   1.0   NaN   NaN  ...   
14.0          NaN   NaN   1.0   NaN   1.0   1.0   NaN   1.0   NaN   NaN  ...   
15.0          1.0   NaN   2.0   1.0   4.0   4.0   1.0   1.0   NaN   NaN  ...   
...           ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
356695.0      NaN   NaN   NaN   NaN   NaN   1.0   NaN   NaN   NaN   NaN  ...   
366014.0      NaN   NaN   NaN   NaN   NaN   NaN   1.0   NaN   NaN   NaN  ...   
373864.0      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
378329.0      1.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
379622.0      NaN   NaN  

&nbsp;
#### Step5: convert to numpy matrix

In [158]:
expert_matrix = expert_matrix.to_numpy()
print(expert_matrix)

[[13. nan  6. ... nan  1. nan]
 [nan nan  8. ... nan nan nan]
 [ 1. nan  1. ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [ 1. nan nan ... nan  5.  1.]
 [nan nan nan ... nan nan nan]]


### Question 3:

#### Step1: Normalize the utility matrix 

In [159]:
import numpy as np

# Create the utility_matrix with the same shape as expert_matrix
utility_matrix = expert_matrix

# Vectorized condition
utility_matrix[expert_matrix > 15] = 5
mask = ~np.isnan(expert_matrix) & (expert_matrix <= 15)
utility_matrix[mask] = expert_matrix[expert_matrix <= 15] // 3

In [160]:
print(utility_matrix)

[[ 4. nan  2. ... nan  0. nan]
 [nan nan  2. ... nan nan nan]
 [ 0. nan  0. ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [ 0. nan nan ... nan  1.  0.]
 [nan nan nan ... nan nan nan]]


In [161]:
sum_utility_matrix = np.sum(utility_matrix)
highest_row_sum = np.max(np.sum(utility_matrix, axis=1))
highest_column_sum = np.max(np.sum(utility_matrix, axis=0))

print("Utility Matrix Metrics:")
print("Summation value of the utility matrix:", sum_utility_matrix)
print("Highest row sum of the utility matrix:", highest_row_sum)
print("Highest column sum of the utility matrix:", highest_column_sum)

Utility Matrix Metrics:
Summation value of the utility matrix: nan
Highest row sum of the utility matrix: nan
Highest column sum of the utility matrix: nan


#### Step2: Create Test matrix

In [162]:
num_users, num_tags = utility_matrix.shape
user_cutoff = int(num_users * 0.85)
tag_cutoff = int(num_tags * 0.85)

print(user_cutoff,tag_cutoff)

test_matrix = utility_matrix[user_cutoff:, tag_cutoff:]
sum_test_matrix = np.sum(test_matrix)
highest_row_sum = np.max(np.sum(test_matrix, axis=1))
highest_column_sum = np.max(np.sum(test_matrix, axis=0))

print("Test Matrix Metrics:")
print("dimensions: ",test_matrix.shape)
print("Summation value of the utility matrix:", sum_test_matrix)
print("Highest row sum of the utility matrix:", highest_row_sum)
print("Highest column sum of the utility matrix:", highest_column_sum)

988 827
Test Matrix Metrics:
dimensions:  (175, 147)
Summation value of the utility matrix: nan
Highest row sum of the utility matrix: nan
Highest column sum of the utility matrix: nan


&nbsp;
### Question 4:

#### Tag-Tag Recommandation System

In [163]:
import numpy as np
import pandas as pd

data = utility_matrix[:user_cutoff]

# Convert to DataFrame
df = pd.DataFrame(data)

# Center the data by subtracting the mean of each item (column)
# Subtract column means from each value
df_centered = df.sub(df.mean(axis=1), axis=0)

# Pearson correlation matrix: compute pairwise correlation between items
similarity_matrix = df_centered.corr(method='pearson')


In [164]:
def predict_rating(user_id, item_id, ratings, similarity_matrix, N , type):
    
    # Get the similarity scores for the target item with all other items
    item_similarities = np.array(similarity_matrix[item_id][:tag_cutoff])
    
    # Get the ratings of user_id for all items
    user_ratings = np.array(ratings[user_id])
    
    # Sort the similarities and take the top N most similar items (excluding the target item itself)
    similar_items = np.argsort(item_similarities)[::-1]  # Sort indices by similarity in descending order

    start = 0
    while start < len(similar_items):
        if np.isnan(item_similarities[similar_items[start]]): start += 1
        else : break
    
    non_nan_similar_items = similar_items[start:]

    # Filter out items the user hasn't rated
    rated_items = [item for item in non_nan_similar_items 
                         if not np.isnan(ratings[user_id][item])]
    
    top_n_rated_items = rated_items[:N]  # Select top N similar items


    if len(top_n_rated_items) < N:
        return np.nanmean(user_ratings)
    
    # Get the user's ratings for the top N similar items and the corresponding similarities
    top_n_ratings = user_ratings[top_n_rated_items]
    top_n_similarities = item_similarities[top_n_rated_items]
    
    if type == "A":
     return np.sum(top_n_ratings) / N

    # Calculate the weighted sum of the ratings
    weighted_ratings_sum = np.dot(top_n_ratings, top_n_similarities)
    
    # Calculate the sum of the absolute values of the similarities

    similarity_sum = np.sum(top_n_similarities)
    
    # Return the weighted average as the predicted rating
    if similarity_sum != 0 : 
        predicted_rating = weighted_ratings_sum / similarity_sum
    else : 
        print("1")
        predicted_rating = 0
    
    return predicted_rating


In [165]:
for k in {2,3,5}:
    loss = 0
    cnt = 0
    for x in range(user_cutoff,num_users):
        for i in range(tag_cutoff,num_tags):
            if np.isnan(utility_matrix[x][i]): continue
            else : 
                true_value = utility_matrix[x][i]
                ans = predict_rating(x,i,utility_matrix,similarity_matrix,k,"A")
                if(np.isnan(ans)) : 
                    print(x,i,'Hello')
                loss = loss + (true_value - ans)*(true_value - ans)
                cnt += 1
    print("Loss with k= ",k,":",np.sqrt(loss / cnt))

Loss with k=  2 : 0.6714782216926365
Loss with k=  3 : 0.6519719714508565
Loss with k=  5 : 0.6257886856029714


In [166]:
for k in {2,3,5}:
    loss = 0
    cnt = 0
    for x in range(user_cutoff,num_users):
        for i in range(tag_cutoff,num_tags):
            if np.isnan(utility_matrix[x][i]): continue
            else : 
                true_value = utility_matrix[x][i]
                ans = predict_rating(x,i,utility_matrix,similarity_matrix,k,"B")
                if(np.isnan(ans)) : 
                    print(x,i,'Hello')
                loss = loss + (true_value - ans)*(true_value - ans)
                cnt += 1
    print("Loss with k= ",k,":",np.sqrt(loss / cnt))

Loss with k=  2 : 0.6720502670062237
Loss with k=  3 : 0.6519297599842564
Loss with k=  5 : 0.6257287296044581


#### User-User Recommandation System

In [167]:
import numpy as np
import pandas as pd

data = utility_matrix[:, :tag_cutoff]
print(data.shape)

# Convert to DataFrame
df = pd.DataFrame(data)

# Center the data by subtracting the mean of each item (column)
# Subtract column means from each value
df_centered = df.sub(df.mean(axis=0), axis=1)

# Pearson correlation matrix: compute pairwise correlation between items
similarity_matrix = df_centered.T.corr(method='pearson')
print(similarity_matrix.shape)

(1163, 827)
(1163, 1163)


In [168]:
def predict_rating(user_id, item_id, ratings, similarity_matrix, N, type, user_cutoff):
    # Get the similarity scores for the target user with all other users within the user_cutoff
    user_similarities = np.array(similarity_matrix[user_id][:user_cutoff])

    # Get the ratings for the item by all users within the user_cutoff
    item_ratings = np.array(ratings[:,item_id])

    # Sort the similarities and take the top N most similar users (excluding the target user itself)
    similar_users = np.argsort(user_similarities)[::-1]  # Sort indices by similarity in descending order

    # Exclude the target user from the similar users list (if present)
    similar_users = similar_users[similar_users != user_id]

    start = 0
    while start < len(similar_users):
        if np.isnan(user_similarities[similar_users[start]]):
            start += 1
        else:
            break

    non_nan_similar_users = similar_users[start:]


    # Filter out users who haven't rated the item
    rated_users = [user for user in non_nan_similar_users if not np.isnan(ratings[user, item_id])]

    top_n_rated_users = rated_users[:N]  # Select top N similar users

    if len(top_n_rated_users) < N:
        print("(tag_id:",item_id,")","Less similar users for N: ",N," ")
        return np.nanmean(item_ratings)

    # Get the ratings from the top N similar users and the corresponding similarities
    top_n_ratings = item_ratings[top_n_rated_users]
    top_n_similarities = user_similarities[top_n_rated_users]

    if type == "A":
        return np.sum(top_n_ratings) / N

    # Calculate the weighted sum of the ratings
    weighted_ratings_sum = np.dot(top_n_ratings, top_n_similarities)

    # Calculate the sum of the absolute values of the similarities
    similarity_sum = np.sum(top_n_similarities)

    # Return the weighted average as the predicted rating
    if similarity_sum != 0:
        predicted_rating = weighted_ratings_sum / similarity_sum
    else:
        print("1")
        predicted_rating = 0

    return predicted_rating

In [169]:
for k in {2, 3, 5}:  # Different values of N (number of similar users to consider)
    loss = 0
    cnt = 0
    for i in range(tag_cutoff, num_tags):  # Iterate through items
     for x in range(user_cutoff, num_users):  # Iterate through users
            if np.isnan(utility_matrix[x][i]):  # Skip if the user hasn't rated the item
                continue
            else:
                true_value = utility_matrix[x][i]  # Actual rating
                ans = predict_rating(x, i, utility_matrix, similarity_matrix, k, "A", user_cutoff)  # Predict rating
                
                if np.isnan(ans): 
                    print(x, i, 'Hello')
                    continue  # Skip NaN predictions to avoid errors
                
                # Calculate squared error
                loss += (true_value - ans) ** 2
                cnt += 1
    
    # Calculate and print RMSE (Root Mean Squared Error) for the current value of k
    print(f"Loss with k = {k}: {np.sqrt(loss / cnt)}")


Loss with k = 2: 0.6078950763421757
Loss with k = 3: 0.5806122807470818
(tag_id: 862 ) Less similar users for N:  5  
(tag_id: 862 ) Less similar users for N:  5  
(tag_id: 862 ) Less similar users for N:  5  
(tag_id: 862 ) Less similar users for N:  5  
(tag_id: 862 ) Less similar users for N:  5  
(tag_id: 862 ) Less similar users for N:  5  
Loss with k = 5: 0.5671975178741177


In [170]:
for k in {2, 3, 5}:  # Different values of N (number of similar users to consider)
    loss = 0
    cnt = 0
    for i in range(tag_cutoff, num_tags):  # Iterate through items
     for x in range(user_cutoff, num_users):  # Iterate through users
            if np.isnan(utility_matrix[x][i]):  # Skip if the user hasn't rated the item
                continue
            else:
                true_value = utility_matrix[x][i]  # Actual rating
                ans = predict_rating(x, i, utility_matrix, similarity_matrix, k, "B", user_cutoff)  # Predict rating
                
                if np.isnan(ans): 
                    print(x, i, 'Hello')
                    continue  # Skip NaN predictions to avoid errors
                
                # Calculate squared error
                loss += (true_value - ans) ** 2
                cnt += 1
    
    # Calculate and print RMSE (Root Mean Squared Error) for the current value of k
    print(f"Loss with k = {k}: {np.sqrt(loss / cnt)}")


Loss with k = 2: 0.607478275445503
Loss with k = 3: 0.5801996120223132
(tag_id: 862 ) Less similar users for N:  5  
(tag_id: 862 ) Less similar users for N:  5  
(tag_id: 862 ) Less similar users for N:  5  
(tag_id: 862 ) Less similar users for N:  5  
(tag_id: 862 ) Less similar users for N:  5  
(tag_id: 862 ) Less similar users for N:  5  
Loss with k = 5: 0.7369209013096282


&nbsp;
### Question 5:

In [171]:
import numpy as np
from sklearn.metrics import mean_squared_error

class MatrixFactorizationSGD:
    def __init__(self, R, K, alpha=0.0005, epochs=10, lambda1=0, lambda2=0):
        """
        Initialize the matrix factorization model with two regularization parameters.
        
        Parameters:
        R : np.array (users x items) -> Utility matrix with values NaN, 0, 1, 2, 3, 4, 5
        K : int -> Number of latent factors
        alpha : float -> Learning rate
        epochs : int -> Number of full passes through the data
        lambda1 : float -> Regularization parameter for user-feature matrix P
        lambda2 : float -> Regularization parameter for item-feature matrix Q
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K  # Number of latent factors
        self.alpha = alpha  # Learning rate
        self.epochs = epochs  # Number of full passes (epochs)
        self.lambda1 = lambda1  # Regularization parameter for P
        self.lambda2 = lambda2  # Regularization parameter for Q
        self.P = np.random.rand(self.num_users, K)  # User-feature matrix
        self.Q = np.random.rand(self.num_items, K)  # Item-feature matrix

    def train(self):
        """
        Train the matrix factorization model using stochastic gradient descent (SGD) over multiple epochs.
        """
        for epoch in range(self.epochs):
            for i in range(self.num_users):
                for j in range(self.num_items):
                    if not np.isnan(self.R[i][j]):  # Skip NaN values
                        # Compute the prediction error
                        eij = self.R[i][j] - np.dot(self.P[i, :], self.Q[j, :])

                        # Update P and Q matrices with regularization terms lambda1 and lambda2
                        for k in range(self.K):
                            self.P[i][k] += self.alpha * (2 * eij * self.Q[j][k] - 2 * self.lambda1 * self.P[i][k])
                            self.Q[j][k] += self.alpha * (2 * eij * self.P[i][k] - 2 * self.lambda2 * self.Q[j][k])
            
            # Compute and print the loss at the end of each epoch
            loss = self.compute_loss()
            print(f"Epoch {epoch + 1}/{self.epochs}: Loss = {loss}")
        
        return self.P, self.Q

    def compute_loss(self):
        """
        Compute the cost function (loss) with two regularization parameters.
        The loss includes the squared error and regularization terms.
        """
        predicted_R = np.dot(self.P, self.Q.T)
        loss = 0
        num_non_nan_entries = 0

        for i in range(self.num_users):
            for j in range(self.num_items):
                if not np.isnan(self.R[i][j]):  # Only consider non-NaN values
                    error = self.R[i][j] - predicted_R[i][j]
                    loss += error ** 2
                    num_non_nan_entries += 1
        
        # Adding regularization terms for P and Q matrices
        loss += self.lambda1 * np.sum(self.P ** 2)  # Regularization for P
        loss += self.lambda2 * np.sum(self.Q ** 2)  # Regularization for Q

        # Normalize the loss by the number of non-NaN entries
        loss /= num_non_nan_entries

        return loss


    def predict(self):
        """
        Predict the rating matrix after training.
        """
        return np.dot(self.P, self.Q.T)

    def rmse(self, predicted_R):
        """
        Compute the Root Mean Square Error (RMSE) between the predicted and actual ratings.
        """
        actual_ratings = self.R[~np.isnan(self.R)]
        predicted_ratings = predicted_R[~np.isnan(self.R)]
        return np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

# Test the updated modular code with a sample utility matrix
def run_tests():
    # Sample utility matrix (NaN represents missing ratings)
    R = utility_matrix

    # Parameters
    latent_factors = [2, 5, 10]
    alpha = 0.0005
    epochs = 10
    lambda1List = [0, 0.001, 0.05, 0.5]  # Regularization parameter for P
    lambda2List = [0, 0.003, 0.05, 0.75]  # Regularization parameter for Q

    for K in latent_factors:
        for lambda1, lambda2 in zip(lambda1List, lambda2List):
            mf_reg = MatrixFactorizationSGD(R, K, alpha, epochs, lambda1=lambda1, lambda2=lambda2)
            P_reg, Q_reg = mf_reg.train()
            predicted_R_reg = mf_reg.predict()
            rmse_reg = mf_reg.rmse(predicted_R_reg)

            print(f"Latent Factors: {K}")
            print(f"RMSE with regularization (lambda1={lambda1}, lambda2={lambda2}): {rmse_reg}")

# Run the tests
run_tests()


ModuleNotFoundError: No module named 'sklearn'