In [None]:
from pathlib import Path

import pandas as pd

In [None]:
# Read the anonymized English cases export stored under data/processed.
cases_df = pd.read_csv(filepath_or_buffer='data/processed/cases.csv')

# Print the column index so the available fields are visible before trimming.
print(cases_df.columns)

In [None]:
# Keep only the case fields used when merging with the comments;
# every other column is discarded here.
cases_df = cases_df.loc[
    :,
    ['case_number', 'case_origin', 'case_subject', 'description'],
]


In [None]:
# Read the anonymized English comments export stored under data/processed.
comments_df = pd.read_csv(filepath_or_buffer='data/processed/comments.csv')

# Print the column index so the available fields are visible before trimming.
print(comments_df.columns)

In [None]:
# Keep only the comment fields used when merging with the cases:
# the join key, the author flag and the message text.
comments_df = comments_df.loc[
    :,
    ['case_number', 'message_is_customer', 'message_body'],
]

In [None]:
# Give the message_body column the shorter name "comment".
comments_df = comments_df.rename({'message_body': 'comment'}, axis='columns')

# Confirm the new column names.
print("Comments dataset columns after renaming:", comments_df.columns, sep="\n")

In [None]:
# Tag every comment with its author role so the speaker stays identifiable once
# the comments of a case are concatenated: "customer: ..." when
# message_is_customer is truthy, "support: ..." otherwise.
flag = comments_df['message_is_customer']

# Plain truthiness would treat the text "false" as True, so textual negatives
# are ruled out explicitly. Missing flags still count as truthy and are
# labelled "customer" -- NOTE(review): confirm that is the intended default.
textual_false = flag.astype(str).str.strip().str.lower().isin(['false', '0', 'no'])
is_customer = flag.astype(bool) & ~textual_false

# Missing message bodies become an empty text instead of the literal "nan".
body = comments_df['comment'].fillna('').astype(str)

# Column-wise string concatenation instead of a row-wise apply: no Python call
# per row, and an empty comments frame is handled as well.
comments_df['comment'] = ('customer: ' + body).where(is_customer, 'support: ' + body)

# Show the first 5 rows of the comments dataset after adding the role prefix
print("First 5 rows of comments dataset after adding customer or support to the comment:")
print(comments_df.head())

In [None]:
# The author flag is no longer read after this point, so the column is removed.
comments_df = comments_df.drop('message_is_customer', axis='columns')

In [None]:
# Row count before the per-case aggregation, for comparison with the merged result
print(f"Number of rows before grouping comments: {len(comments_df)}")

# 1. Collapse the comments of each case into one space-separated string.
#    Assumes every comment is already a string, since str.join rejects
#    anything else.
grouped_comments = (
    comments_df
    .groupby('case_number')['comment']
    .agg(' '.join)
    .reset_index()
)

# 2. Left join so that cases without any comment are kept. grouped_comments has
#    exactly one row per case_number; validate makes the merge fail loudly if
#    that ever stops being true instead of silently duplicating cases.
merged_df = cases_df.merge(
    grouped_comments,
    on='case_number',
    how='left',
    validate='many_to_one',
)

# 3. Fold the comment thread into the description:
#      description and comments -> "<description> <comments>"
#      comments only            -> "<comments>"  (no literal "nan" prefix)
#      no comments              -> description left untouched
description = merged_df['description'].astype(object)
thread = merged_df['comment']
has_description = description.notna()
has_thread = thread.notna()
combined = description.astype(str) + ' ' + thread.astype(str)
merged_df['description'] = (
    description
    .mask(has_description & has_thread, combined)
    .mask(~has_description & has_thread, thread)
)

# 4. Drop the temporary comment column now that it lives inside description
merged_df = merged_df.drop(columns=['comment'])

# Preview the final dataset
print(merged_df.head())

# Show the number of rows in the dataset after merging cases and comments
print(f"Number of rows after merging cases and comments: {len(merged_df)}")

In [None]:
# List the columns of the merged dataset under a short heading.
print("Merged dataset columns:", merged_df.columns, sep="\n")


In [None]:
# Save the merged dataset to a new CSV file and a new Excel file under data/final.
output_dir = Path('data/final')

# to_csv / to_excel do not create missing parent folders, so make sure the
# target directory exists before writing.
output_dir.mkdir(parents=True, exist_ok=True)

merged_df.to_csv(output_dir / 'merged_cases_comments.csv', index=False)
# NOTE(review): an Excel cell holds at most 32,767 characters; very long merged
# descriptions may not survive intact in the .xlsx copy -- confirm if that matters.
merged_df.to_excel(output_dir / 'merged_cases_comments.xlsx', index=False)