Import Required Modules

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

Load the CSV Files

In [None]:
# Read the three source tables, in this order, from CSV files located in
# the notebook's working directory.
content, reaction, reaction_type = (
    pd.read_csv(csv_name)
    for csv_name in ('Content.csv', 'Reactions.csv', 'ReactionTypes.csv')
)

Drop Unnecessary Columns

In [15]:
# Remove the 'Unnamed: 0' column from each table.
# NOTE(review): presumably this is a row index that was saved into the CSVs;
# confirm against the raw files.
#
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been removed.
# Rebinding the names (instead of inplace=True) leaves the previously
# loaded objects untouched.
content = content.drop(columns=['Unnamed: 0'], errors='ignore')
reaction = reaction.drop(columns=['Unnamed: 0'], errors='ignore')
reaction_type = reaction_type.drop(columns=['Unnamed: 0'], errors='ignore')

Drop Non-Relevant Columns (Focus on Reactions & Categories)

In [16]:
# Drop the columns that are not used by the reaction/category analysis:
# 'User ID' and 'URL' from the content table, 'User ID' from the reactions.
#
# errors='ignore' makes the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
content = content.drop(columns=['User ID', 'URL'], errors='ignore')
reaction = reaction.drop(columns=['User ID'], errors='ignore')

Rename Columns for Clean Merge

In [17]:
# Rename the 'Type' column of each table. The reactions table and the
# reaction-type lookup both get 'reaction_type' (the key the merge step
# joins on); the content table's column becomes 'content_type'.
reaction = reaction.rename(columns={'Type': 'reaction_type'})
reaction_type = reaction_type.rename(columns={'Type': 'reaction_type'})
content = content.rename(columns={'Type': 'content_type'})

Remove Missing/Null Values

In [18]:
# Keep only fully populated rows: dropna() with its defaults removes every
# row that contains at least one missing value.
content = content.dropna()
reaction = reaction.dropna()
reaction_type = reaction_type.dropna()

Merge Datasets

In [19]:
# Step 1: attach the reaction-type lookup columns to every reaction row.
# Step 2: attach those enriched reactions to their content rows.
# Both joins are inner joins, so rows without a match on the join key are
# dropped from the result.
reaction_merged = reaction.merge(reaction_type, how='inner', on='reaction_type')
final_df = content.merge(reaction_merged, how='inner', on='Content ID')

Clean Up the Category Field

In [21]:
# Normalise category labels in one pass:
#   1. replace any double-quoted run ("text") with its unquoted content;
#   2. lower-case the result so differently cased spellings group together.
final_df['Category'] = (
    final_df['Category']
    .str.replace(r'"([^"]+)"', r'\1', regex=True)
    .str.lower()
)

View & Export the Cleaned Dataset

In [23]:
# Write the cleaned, merged table to disk as both CSV and Excel.
# index=False leaves the DataFrame's row index out of the files.
export_options = {'index': False}
final_df.to_csv('Cleaned Dataset.csv', **export_options)
final_df.to_excel('Cleaned Dataset.xlsx', **export_options)

Get the Top 5 Categories by Score

In [24]:
# Total the Score column per category, then keep the five categories with
# the largest totals, ordered from highest to lowest.
category_score_totals = final_df.groupby("Category")["Score"].sum()
top5_categories = category_score_totals.sort_values(ascending=False).head(5)
print(top5_categories)

Category
animals           74965
science           71168
healthy eating    69339
technology        68738
food              66676
Name: Score, dtype: int64


Export the Top 5 Categories to an Excel File (Top_5_Categories.xlsx)

In [25]:
# Save the top-5 summary as an Excel workbook. The index is written too
# (to_excel's default), so the category labels are kept next to their scores.
top5_output_path = "Top_5_Categories.xlsx"
top5_categories.to_excel(top5_output_path)