In [1]:
import os

import pandas as pd

os.chdir("/Users/aadityabhatia/Documents/GitHub/SE_Bugs_vs_Features")

# getting posts
issues = pd.read_csv('data/issues.csv')
len(issues) == 11074  # validation

True

In [3]:
# gettign answers

posts = pd.read_csv('data/data_main/posts.csv')
answers = posts[posts.PostTypeId == 2]
del posts


In [4]:
# getting votes

votes = pd.read_csv('data/data_main/votes.csv')
votes = votes.rename(columns={'Id': 'VoteId',
                              "CreationDate": "VoteCreationDate"})

# Condense the votes into upvotes and downvotes for issues and answers
votes = votes[(votes.VoteTypeId==2) | votes.VoteTypeId==3]
votes['VoteType'] = votes['VoteTypeId'].apply(lambda x: 'upvote' if x == 2 else ('downvote' if x == 3 else 'other'))
grouped_votes = votes.groupby(['PostId', 'VoteType']).size().unstack(fill_value=0).reset_index()

del votes

In [6]:
# getting comments

comments = pd.read_csv('data/data_main/comments.csv')
comments = comments.rename(columns={"Text": "Text_Comment",
                                    "UserId": "UserId_Comment"})

# grouping
grouped_comments = comments.groupby('PostId')['Text_Comment'].apply(list).reset_index()
grouped_commenters = comments.groupby('PostId')['UserId_Comment'].apply(list).reset_index()

del comments


In [7]:
# Merge comments and commenters with issues
issues_with_comments = pd.merge(issues, grouped_comments, left_on='Id', right_on='PostId', how='left')
issues_with_comments = pd.merge(issues_with_comments, grouped_commenters, on='PostId', how='left', suffixes=['_Comments', '_Commenters'])

# Merge votes with issues
issues_with_votes_and_comments = pd.merge(issues_with_comments, grouped_votes, on='PostId', how='left')

# Merge comments and commenters with answers
answers_with_comments = pd.merge(answers, grouped_comments, left_on='Id', right_on='PostId', how='left')
answers_with_comments = pd.merge(answers_with_comments, grouped_commenters, on='PostId', how='left', suffixes=['_Comments', '_Commenters'])

# Merge votes with answers
answers_with_votes_and_comments = pd.merge(answers_with_comments, grouped_votes, on='PostId', how='left')

# Ensure there's no duplicate PostId in the resulting dataframes
issues_with_votes_and_comments = issues_with_votes_and_comments.drop_duplicates(subset='PostId')
answers_with_votes_and_comments = answers_with_votes_and_comments.drop_duplicates(subset='PostId')

# Display the structure of the dataframes
print(f"issues_with_votes_and_comments:\n{issues_with_votes_and_comments.columns}")
print(f"answers_with_votes_and_comments:\n{answers_with_votes_and_comments.columns}")


issues_with_votes_and_comments:
Index(['Id', 'PostTypeId', 'CreationDate', 'Score', 'ViewCount', 'Body',
       'OwnerUserId', 'LastEditorUserId', 'LastEditDate', 'LastActivityDate',
       'Title', 'Tags', 'AnswerCount', 'CommentCount', 'ClosedDate',
       'ContentLicense', 'PostId', 'Text_Comment', 'UserId_Comment'],
      dtype='object')
answers_with_votes_and_comments:
Index(['Id', 'PostTypeId', 'AcceptedAnswerId', 'ParentId', 'CreationDate',
       'DeletionDate', 'Score', 'ViewCount', 'Body', 'OwnerUserId',
       'LastEditorUserId', 'LastEditDate', 'LastActivityDate', 'Title', 'Tags',
       'AnswerCount', 'CommentCount', 'FavouriteCount', 'ClosedDate',
       'ContentLicense', 'PostId', 'Text_Comment', 'UserId_Comment'],
      dtype='object')


In [8]:
issue_cols = ['Id', 'CreationDate', 'Score', 'ViewCount', 'Body',
       'OwnerUserId', 'Title', 'Tags', 'AnswerCount', 'CommentCount', 
       'Text_Comment', 'UserId_Comment']

ans_cols = ['Id', 'ParentId', 'CreationDate',
       'Score', 'ViewCount', 'Body', 'OwnerUserId',
       'CommentCount', 'Text_Comment', 'UserId_Comment']


issues_with_votes_and_comments = issues_with_votes_and_comments[issue_cols]
answers_with_votes_and_comments = answers_with_votes_and_comments[ans_cols]

In [9]:
# getting edits:

edits = pd.read_csv('data/data_main/PostHistory.csv')

# Assuming that the 'PostHistoryTypeId' indicates an edit and 'PostId' relates to the question's ID,
# and we're interested in edits of the body (which might be represented by a specific 'PostHistoryTypeId'),
# you might need to adjust the filtering condition based on your schema for 'PostHistoryTypeId' for edits.

# Filter for edit entries (assuming PostHistoryTypeId for edits is either 4 for body edits or 5 for title edits)
edits = edits[edits['PostHistoryTypeId'].isin([4, 5])]

# Count the number of edits for each question
question_edit_counts = edits.groupby('PostId')['Id'].count().reset_index(name='EditCount')

# Get the unique list of editor IDs for each question
question_editor_ids = edits.groupby('PostId')['UserId'].apply(lambda x: x.unique().tolist()).reset_index(
    name='EditorIds')

del edits


In [10]:
# Combine the counts and the editor IDs into a single dataframe
question_edits_info = pd.merge(question_edit_counts, question_editor_ids, on='PostId', how='left')

# Merge this information with the issues dataframe
issues_with_votes_comments_edits = pd.merge(issues_with_votes_and_comments, question_edits_info, left_on='Id', right_on='PostId', how='left')

# Add a column for the number of unique editors
issues_with_votes_comments_edits['num_ques_editors'] = issues_with_votes_comments_edits['EditorIds'].apply(
    lambda x: len(x) if isinstance(x, list) else 0)

In [11]:
# Now, we merge the issues with answers. This will give us a table with issues and the corresponding answers
issue_answers = pd.merge(issues_with_votes_comments_edits, answers_with_votes_and_comments, how='left', left_on="Id",
                         right_on="ParentId", suffixes=["_Quens", "_Answer"])

# Then we can perform the next steps of feature generation on this merged dataframe
print(f"count: {len(issue_answers)}, cols: {issue_answers.columns}")

count: 11354, cols: Index(['Id_Quens', 'CreationDate_Quens', 'Score_Quens', 'ViewCount_Quens',
       'Body_Quens', 'OwnerUserId_Quens', 'Title', 'Tags', 'AnswerCount',
       'CommentCount_Quens', 'Text_Comment_Quens', 'UserId_Comment_Quens',
       'PostId', 'EditCount', 'EditorIds', 'num_ques_editors', 'Id_Answer',
       'ParentId', 'CreationDate_Answer', 'Score_Answer', 'ViewCount_Answer',
       'Body_Answer', 'OwnerUserId_Answer', 'CommentCount_Answer',
       'Text_Comment_Answer', 'UserId_Comment_Answer'],
      dtype='object')


In [12]:
# let go of the unwanted columns and save 

for col in ['PostId', "ParentId"]:
    if col in issue_answers.columns:
        del issue_answers[col]

issue_answers.to_pickle("data/all_issue_data.pkl")