In [110]:
import pandas as pd

# Load the raw tables. Paths are relative to the notebook's working directory.
posts_df = pd.read_csv('./csv/Posts.csv')
Tags = pd.read_csv('./csv/Tags.csv')

# Step 1: Extract the answers (where 'PostTypeId == 2').
# Selecting rows and columns in a single .loc call and casting with .astype
# builds a fresh frame, so there is no assignment into a slice of posts_df
# (the chained-indexing pattern behind SettingWithCopyWarning).
# ParentId is cast to int so it can be joined against the integer question Id;
# presumably it is parsed as float because non-answer rows have no parent.
answers_df = (
    posts_df
    .loc[posts_df['PostTypeId'] == 2, ['Id', 'OwnerUserId', 'ParentId']]
    .astype({'ParentId': int})
)
print(len(answers_df))

# Step 2: Extract the questions with their tags (where 'PostTypeId == 1'),
# making sure the question Id has the same integer dtype as answers' ParentId.
questions_df = (
    posts_df
    .loc[posts_df['PostTypeId'] == 1, ['Id', 'Tags']]
    .astype({'Id': int})
)
print(len(questions_df))


# Attach the question's tags to every answer: an answer's ParentId is matched
# against the question's Id. Both frames have an 'Id' column, so the suffixes
# turn them into 'Id_answer' and 'Id_question'.
merged_df = answers_df.merge(
    questions_df,
    left_on='ParentId',
    right_on='Id',
    suffixes=('_answer', '_question'),
)

# Keep only the answer id, its author and the tags (which come from the question).
filtered_answers_df = merged_df[['Id_answer', 'OwnerUserId', 'Tags']]

# Count answers per user and keep the users with at least 20 of them.
answerer_counts = filtered_answers_df.groupby('OwnerUserId').size()
has_enough_answers = answerer_counts >= 20
qualified_answerers = answerer_counts[has_enough_answers].index
print("Qualified Answerers:", len(qualified_answerers))

# Restrict the answers to those written by a qualified answerer.
written_by_qualified = filtered_answers_df['OwnerUserId'].isin(qualified_answerers)
filtered_answers = filtered_answers_df[written_by_qualified]

# Ids of the tags whose 'Count' in Tags.csv is at least 20.
qualified_tags = Tags.loc[Tags['Count'] >= 20, 'Id']
print("Qualified Tags:", len(qualified_tags))

# Lookup from tag name to tag Id.
tag_dict = Tags.set_index('TagName')['Id'].to_dict()

# One row per (answer, tag) pair.
tags_expanded = filtered_answers.copy()
# The tag string is presumably pipe-delimited with a leading and trailing pipe
# (e.g. '|a|b|') -- TODO confirm against the dump. Stripping the outer pipes
# before splitting stays inside the vectorised .str API, so a missing (NaN)
# tag string propagates as NaN instead of raising inside a Python lambda, and
# a string without outer pipes no longer loses its first and last tag.
tags_expanded['Tags'] = tags_expanded['Tags'].str.strip('|').str.split('|')
tags_expanded = tags_expanded.explode('Tags')
# Names absent from tag_dict map to NaN and are removed by the isin filter below.
tags_expanded['Tags'] = tags_expanded['Tags'].map(tag_dict)
tags_expanded = tags_expanded[tags_expanded['Tags'].isin(qualified_tags)]

# Column layout of the final matrix: one column per qualified tag Id, including
# tags that never occur in tags_expanded (those columns are filled with 0).
all_qualified_tags = pd.Series(qualified_tags, name='Tags')

# Rows are answerers, columns are tag Ids, and each cell is the number of
# (answer, tag) rows for that pair.
expert_matrix = tags_expanded.pivot_table(
    index='OwnerUserId',
    columns='Tags',
    aggfunc='size',
    fill_value=0,
).reindex(columns=all_qualified_tags, fill_value=0)

print("Expert Matrix:", expert_matrix)
print("Dimensions of the Expert matrix:", expert_matrix.shape)


178628
63423
Qualified Answerers: 1163
Qualified Tags: 974
Expert Matrix: Tags         1     3     4     7     8     9     11    12    13    14    ...  \
OwnerUserId                                                              ...   
4.0            13     0     6     6    61    55     8     3     0     0  ...   
6.0             0     0     8     0     6     4     1     2     0     0  ...   
11.0            1     0     1     0     0     1     0     1     0     0  ...   
14.0            0     0     1     0     1     1     0     1     0     0  ...   
15.0            1     0     2     1     4     4     1     1     0     0  ...   
...           ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
356695.0        0     0     0     0     0     1     0     0     0     0  ...   
366014.0        0     0     0     0     0     0     1     0     0     0  ...   
373864.0        0     0     0     0     0     0     0     0     0     0  ...   
378329.0        1     0     0     0     0     

In [111]:
import numpy as np
# Drop the row/column labels and keep only the counts as a NumPy array.
# np.asarray accepts both the DataFrame and an ndarray, so re-running this cell
# after expert_matrix has already been converted no longer raises
# AttributeError (an ndarray has no .to_numpy()).
expert_matrix = np.asarray(expert_matrix)
print(expert_matrix)

[[13  0  6 ...  0  1  0]
 [ 0  0  8 ...  0  0  0]
 [ 1  0  1 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  5  1]
 [ 0  0  0 ...  0  0  0]]


In [121]:
# Turn raw counts into a score: counts above 15 get the top score 5, every
# other count is floor-divided by 3. A single vectorised expression replaces
# the element-by-element Python loop and also works for a matrix with zero
# rows, where indexing expert_matrix[0] would raise IndexError.
# The result is cast to float to keep the dtype the loop version produced
# (it filled an np.zeros array).
_counts = np.asarray(expert_matrix)
utility_matrix = np.where(_counts > 15, 5, _counts // 3).astype(float)


In [122]:
# Inspect the utility matrix; for a large array print() shows only the corner
# values and elides the rest with '...'.
print(utility_matrix)

[[4. 0. 2. ... 0. 0. 0.]
 [0. 0. 2. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [126]:
# Summary statistics of the utility matrix: the grand total, plus the largest
# total over any single row (axis=1) and any single column (axis=0).
row_totals = utility_matrix.sum(axis=1)
column_totals = utility_matrix.sum(axis=0)

sum_utility_matrix = utility_matrix.sum()
highest_row_sum = row_totals.max()
highest_column_sum = column_totals.max()

print("Utility Matrix Metrics:")
for label, value in (
    ("Summation value of the utility matrix:", sum_utility_matrix),
    ("Highest row sum of the utility matrix:", highest_row_sum),
    ("Highest column sum of the utility matrix:", highest_column_sum),
):
    print(label, value)

Utility Matrix Metrics:
Summation value of the utility matrix: 41354.0
Highest row sum of the utility matrix: 1162.0
Highest column sum of the utility matrix: 1403.0
