## Assignment1: Recommendation Systems

#### Question 1 :

In [20]:
import pandas as pd
import numpy as np

# Replace 'file_path.csv' with the actual path to your CSV file
posts_df = pd.read_csv('./csv/Posts.csv')
Tags = pd.read_csv('./csv/Tags.csv')

answers_df = posts_df[posts_df['PostTypeId'] == 2][['Id', 'OwnerUserId', 'ParentId']]
answers_df['ParentId'] = answers_df['ParentId'].astype(int)

# Step 4: Extract the questions with their tags (where 'PostTypeId == 1')
questions_df = posts_df[posts_df['PostTypeId'] == 1][['Id', 'Tags']]

# Step 5: Ensure the question Ids are also of type int64
questions_df['Id'] = questions_df['Id'].astype(int)

In [21]:
answerers_table = answers_df.groupby('OwnerUserId').size().reset_index(name='AnswerCount')
top_answerers = answerers_table.sort_values(by='AnswerCount', ascending=False).head(3)

# Group tags to find the tags with the highest count
top_tags = Tags[['TagName', 'Count']].sort_values(by='Count', ascending=False).head(3)


# Print the results
print("Top 3 users with the most answers:")
print(top_answerers)

print("\nTop 3 most used tags:")
print(top_tags)

Top 3 users with the most answers:
       OwnerUserId  AnswerCount
3189        9113.0         2839
19912     177980.0         2326
557         1204.0         2043

Top 3 most used tags:
    TagName  Count
259  design   5162
114      c#   4931
37     java   4929


### Question 2:

#### Step1 : First attach corresponding tags for answers by table join with questions table using Parent Id

In [22]:
# Step 3: Merge answers with the corresponding tags from the question (use ParentId to match question Id)
merged_df = pd.merge(answers_df, questions_df, left_on='ParentId', right_on='Id', suffixes=('_answer', '_question'))

# Step 4: Select relevant columns
filtered_answers_df = merged_df[['Id_answer', 'OwnerUserId', 'Tags']]  # 'Tags' here are from the question
answerer_counts = filtered_answers_df.groupby('OwnerUserId').size()
qualified_answerers = answerer_counts[answerer_counts >= 20].index
print("Qualified Answerers:", qualified_answerers[1:5])


Qualified Answerers: Float64Index([6.0, 11.0, 14.0, 15.0], dtype='float64', name='OwnerUserId')


&nbsp;
#### Step2 : Filter the answers using the qualified answers ids

In [23]:

filtered_answers = filtered_answers_df[filtered_answers_df['OwnerUserId'].isin(qualified_answerers)]
print("filtered answrers:" , filtered_answers.head() )

filtered answrers:    Id_answer  OwnerUserId                      Tags
0          3         11.0  |comments|anti-patterns|
2         13          4.0  |comments|anti-patterns|
3         56         17.0  |comments|anti-patterns|
6        482        148.0  |comments|anti-patterns|
8       1680        552.0  |comments|anti-patterns|


&nbsp;
#### Step3 :  Filter tags and expand the answers tables by expanding rows for each tag

In [24]:
qualified_tags = Tags[Tags['Count'] >= 20]['Id']
print("Qualified Tags:", len(qualified_tags))

tag_dict = Tags.set_index('TagName')['Id'].to_dict()

tags_expanded = filtered_answers.copy()
tags_expanded['Tags'] = tags_expanded['Tags'].str.split('|').apply(lambda x: x[1:-1])
tags_expanded = tags_expanded.explode('Tags')
tags_expanded['Tags'] = tags_expanded['Tags'].map(tag_dict)
tags_expanded = tags_expanded[tags_expanded['Tags'].isin(qualified_tags)]

Qualified Tags: 974


&nbsp;
#### Step4 : Create Utility matrix from the filtered answers table 

In [25]:
expert_matrix = pd.pivot_table(
    tags_expanded, 
    index='OwnerUserId', 
    columns='Tags', 
    aggfunc='size', 
    fill_value=np.nan
)

all_qualified_tags = pd.Series(qualified_tags, name='Tags')
expert_matrix = expert_matrix.reindex(columns=all_qualified_tags, fill_value=0)
print("Expert Matrix:", expert_matrix)

# dimensions = utility_matrix_sorted.shape
print("Dimensions of the Expert matrix:", expert_matrix.shape)

Expert Matrix: Tags         1     3     4     7     8     9     11    12    13    14    ...  \
OwnerUserId                                                              ...   
4.0          13.0   NaN   6.0   6.0  61.0  55.0   8.0   3.0   NaN   NaN  ...   
6.0           NaN   NaN   8.0   NaN   6.0   4.0   1.0   2.0   NaN   NaN  ...   
11.0          1.0   NaN   1.0   NaN   NaN   1.0   NaN   1.0   NaN   NaN  ...   
14.0          NaN   NaN   1.0   NaN   1.0   1.0   NaN   1.0   NaN   NaN  ...   
15.0          1.0   NaN   2.0   1.0   4.0   4.0   1.0   1.0   NaN   NaN  ...   
...           ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
356695.0      NaN   NaN   NaN   NaN   NaN   1.0   NaN   NaN   NaN   NaN  ...   
366014.0      NaN   NaN   NaN   NaN   NaN   NaN   1.0   NaN   NaN   NaN  ...   
373864.0      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
378329.0      1.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
379622.0      NaN   NaN  

&nbsp;
#### Step5: convert to numpy matrix

In [26]:
expert_matrix = expert_matrix.to_numpy()
print(expert_matrix)

[[13. nan  6. ... nan  1. nan]
 [nan nan  8. ... nan nan nan]
 [ 1. nan  1. ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [ 1. nan nan ... nan  5.  1.]
 [nan nan nan ... nan nan nan]]


### Question 3:

#### Step1: Normalize the utility matrix 

In [27]:
import numpy as np

# Create the utility_matrix with the same shape as expert_matrix
utility_matrix = expert_matrix

# Vectorized condition
utility_matrix[expert_matrix > 15] = 5
mask = ~np.isnan(expert_matrix) & (expert_matrix <= 15)
utility_matrix[mask] = expert_matrix[expert_matrix <= 15] // 3

In [28]:
print(utility_matrix)

[[ 4. nan  2. ... nan  0. nan]
 [nan nan  2. ... nan nan nan]
 [ 0. nan  0. ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [ 0. nan nan ... nan  1.  0.]
 [nan nan nan ... nan nan nan]]


In [29]:
sum_utility_matrix = np.sum(utility_matrix)
highest_row_sum = np.max(np.sum(utility_matrix, axis=1))
highest_column_sum = np.max(np.sum(utility_matrix, axis=0))

print("Utility Matrix Metrics:")
print("Summation value of the utility matrix:", sum_utility_matrix)
print("Highest row sum of the utility matrix:", highest_row_sum)
print("Highest column sum of the utility matrix:", highest_column_sum)

Utility Matrix Metrics:
Summation value of the utility matrix: nan
Highest row sum of the utility matrix: nan
Highest column sum of the utility matrix: nan


#### Step2: Create Test matrix

In [32]:
num_users, num_tags = utility_matrix.shape
user_cutoff = int(num_users * 0.85)
tag_cutoff = int(num_tags * 0.85)

print(user_cutoff,tag_cutoff)

test_matrix = utility_matrix[user_cutoff:, tag_cutoff:]
sum_test_matrix = np.sum(test_matrix)
highest_row_sum = np.max(np.sum(test_matrix, axis=1))
highest_column_sum = np.max(np.sum(test_matrix, axis=0))

print("Test Matrix Metrics:")
print("dimensions: ",test_matrix.shape)
print("Summation value of the utility matrix:", sum_test_matrix)
print("Highest row sum of the utility matrix:", highest_row_sum)
print("Highest column sum of the utility matrix:", highest_column_sum)

988 827
Test Matrix Metrics:
dimensions:  (175, 147)
Summation value of the utility matrix: nan
Highest row sum of the utility matrix: nan
Highest column sum of the utility matrix: nan


&nbsp;
### Question 4:

In [48]:
import numpy as np
import pandas as pd

# Sample user-item matrix
# Rows are users, columns are items, and values are ratings
# data = {
#     'item1': [5, 4, 3, np.nan, 1],
#     'item2': [3, np.nan, 4, 2, 2],
#     'item3': [4, 5, 2, 1, np.nan],
#     'item4': [np.nan, 3, 5, 4, 4],
# }

data = utility_matrix[:user_cutoff]

# Convert to DataFrame
df = pd.DataFrame(data)

# Center the data by subtracting the mean of each item (column)
# Subtract column means from each value
df_centered = df.sub(df.mean(axis=0), axis=1)

# Pearson correlation matrix: compute pairwise correlation between items
similarity_matrix = df_centered.corr(method='pearson')

# Display the similarity matrix
item = tag_cutoff + 10
k = 2
arr = similarity_matrix[item]

# Step 2: Remove NaN values from the filtered array
valid_indices = np.where(~np.isnan(arr))[0]
valid_values = arr[valid_indices]
print(valid_indices)

# top_k_valid_indices = np.argpartition(valid_values, -k)[-k:]

# print(top_k_valid_indices)
# print(arr[top_k_valid_indices])
# print(arr)

[]
