In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the content table from disk and preview its first five rows.
content = pd.read_csv('Content.csv')
content.head()

In [None]:
# (rows, columns) of the content table.
content.shape

In [None]:
# Column names, dtypes and non-null counts of the content table.
content.info()

In [None]:
# Read the reactions table from disk and preview its first five rows.
reactions = pd.read_csv('Reactions.csv')
reactions.head(5)

In [None]:
# (rows, columns) of the reactions table.
reactions.shape

In [None]:
# Column names, dtypes and non-null counts of the reactions table.
reactions.info()

In [None]:
# Read the reaction types table from disk and preview its first five rows.
reactionTypes = pd.read_csv('reactionTypes.csv')
reactionTypes.head()

In [None]:
# (rows, columns) of the reaction types table.
reactionTypes.shape

# Merging Data sets

In [None]:
# reactions and reactionTypes are merged on a shared column name further down,
# so rename the 'Reaction Type' column of reactions to 'Type'.
reaction_column_renames = {'Reaction Type': 'Type'}
reactions = reactions.rename(columns=reaction_column_renames)
reactions.head()

In [None]:

# Step 1: join content with reactions on their shared 'Content ID' column.
merged_data = pd.merge(left=content, right=reactions, on='Content ID', how='inner')

# Step 2: join the result with reactionTypes on the 'Type' column.
final_data = pd.merge(left=merged_data, right=reactionTypes, on='Type', how='inner')

# Preview the fully merged data set.
final_data.head(5)

In [None]:
# Keep only the columns used by the analysis below.
relevant_columns = [
    'Content ID', 'Content Type', 'Category', 'Type',
    'Datetime', 'Sentiment', 'Score',
]
final_data = final_data.loc[:, relevant_columns]
final_data.head(5)

In [None]:
# (rows, columns) of the merged data set after column selection.
final_data.shape

In [None]:
# Column names, dtypes and non-null counts of the merged data set.
final_data.info()

In [None]:
# Summary statistics for the numeric columns of the merged data set.
final_data.describe()

In [None]:
# Number of missing values in each column.
final_data.isnull().sum()

In [None]:
# Boolean flag per row: True where the whole row repeats an earlier row.
final_data.duplicated()

In [None]:
# Total number of fully duplicated rows.
final_data.duplicated().sum()

# Exploratory Data Analysis

In [None]:
# Print the distinct values of each categorical column, one array per column.
for column in ('Content Type', 'Sentiment', 'Category', 'Type'):
    print(final_data[column].unique())

In [None]:
# Which content type received the most reactions?
# value_counts() lists each content type with its row count, most frequent first.
final_data['Content Type'].value_counts()

In [None]:
# Bar chart of the number of rows (treated as reactions in this notebook) per content type.
# Titled, labelled and shown explicitly, like the other figures in this notebook,
# so the chart stands alone and the Axes repr is not echoed.
final_data['Content Type'].value_counts().plot(kind = 'bar')
plt.title('Number of Reactions by Content Type')
plt.xlabel('Content Type')
plt.ylabel('Number of Reactions')
plt.show()

In [None]:
# Row count per content category, most frequent first.
final_data['Category'].value_counts()

In [None]:
# Bar chart of the number of rows (treated as reactions in this notebook) per category.
# Titled, labelled and shown explicitly, like the other figures in this notebook.
final_data['Category'].value_counts().plot(kind = 'bar')
plt.title('Number of Reactions by Category')
plt.xlabel('Category')
plt.ylabel('Number of Reactions')
plt.show()

In [None]:
# Top 5 categories by row count.
category_totals = final_data['Category'].value_counts()
category_count = category_totals.nlargest(5)
print(category_count)

In [None]:
# Bar chart of the five most frequent categories.
top_categories = final_data['Category'].value_counts().nlargest(5)
top_categories.plot.bar()
plt.title('Top 5 Categories')
plt.show()

In [None]:
# Row count per reaction type, most frequent first.
final_data['Type'].value_counts()

In [None]:
# Horizontal bar chart of the number of rows per reaction type.
# Titled, labelled and shown explicitly, like the other figures in this notebook.
final_data['Type'].value_counts().plot(kind = 'barh')
plt.title('Number of Reactions by Reaction Type')
plt.xlabel('Number of Reactions')
plt.ylabel('Reaction Type')
plt.show()

In [None]:
# Which reaction types occur most often among negative-sentiment rows?
is_negative = final_data['Sentiment'] == 'negative'
negative_sentiments = final_data.loc[is_negative]
type_count = negative_sentiments['Type'].value_counts()
print(type_count)

In [None]:
# Bar chart of reaction-type counts within the negative-sentiment rows.
negative_type_counts = negative_sentiments['Type'].value_counts()
negative_type_counts.plot.bar()
plt.title('Reaction Type with Highest Negative Sentiment Score')
plt.show()

In [None]:
# Which reaction types occur most often among positive-sentiment rows?
is_positive = final_data['Sentiment'] == 'positive'
positive_sentiments = final_data.loc[is_positive]
type_count = positive_sentiments['Type'].value_counts()
print(type_count)

In [None]:
# Bar chart of reaction-type counts within the positive-sentiment rows.
positive_type_counts = positive_sentiments['Type'].value_counts()
positive_type_counts.plot.bar()
plt.title('Reaction Type with Highest Positive Sentiment Score')
plt.show()

In [None]:
# Row count per sentiment label, most frequent first.
final_data['Sentiment'].value_counts()

In [None]:
# Pie chart of how the rows split across sentiment labels.
sentiment_counts = final_data['Sentiment'].value_counts()
sentiment_counts.plot.pie()
plt.title('Sentiment Distribution')
plt.show()

In [None]:
# Total of the Score column for each sentiment label.
final_data.groupby('Sentiment')['Score'].sum()

In [None]:
# Total score per sentiment label.
scores_by_sentiment = final_data.groupby('Sentiment')['Score']
sentiment_scores = scores_by_sentiment.sum()

# Pie chart of each label's share of the total score, annotated with percentages.
plt.pie(
    sentiment_scores,
    labels=sentiment_scores.index,
    autopct='%1.1f%%',
)
plt.title('Sentiment Distribution Based on Scores')
plt.show()

In [None]:
# Distribution of scores within each content category.
sns.boxplot(x="Category", y="Score", data=final_data)
plt.title('Score Distribution by Category')
# Category names sit on the x axis; rotate them so they do not run into each other.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Which content categories have the most positive-sentiment rows?
is_positive = final_data['Sentiment'] == 'positive'
positive_sentiments = final_data.loc[is_positive]
category_count = positive_sentiments['Category'].value_counts()
print(category_count)

In [None]:
# Horizontal bar chart of positive-sentiment row counts per category.
# The plot's return value (a matplotlib Axes) is deliberately not assigned to
# category_count, so that name keeps holding the counts Series.
positive_sentiments['Category'].value_counts().plot(kind = 'barh')
plt.title('Positive Sentiment Reactions by Category')
plt.xlabel('Number of Reactions')
plt.ylabel('Category')
plt.show()

# Trends and Distribution for Year and Month

In [None]:
# Parse the timestamps with the explicit day-first format used for this column
# elsewhere in the notebook. Parsing before grouping avoids ambiguous day/month
# inference and makes the groups sort chronologically rather than as strings.
# (If the column is already datetime-typed, to_datetime returns it unchanged.)
timestamps = pd.to_datetime(final_data['Datetime'], format='%d/%m/%Y %H:%M')

# Total score per timestamp; groupby sorts the keys, so the index is in time order
date_scores = final_data.groupby(timestamps)['Score'].sum()

# Create line chart
plt.plot(date_scores.index, date_scores.values)
plt.title('Trends in Scores over Time')
plt.xlabel('Date')
plt.ylabel('Score')
plt.show()

In [None]:
# Monthly performance based on scores
# Convert the Datetime column to pandas datetimes (day/month/year hour:minute)
final_data['Datetime'] = pd.to_datetime(final_data['Datetime'], format='%d/%m/%Y %H:%M')

# Create new columns for the calendar month and the year
final_data['Month'] = final_data['Datetime'].dt.month
final_data['Year'] = final_data['Datetime'].dt.year

# Total score per calendar month and per year.
# Grouping on 'Month' alone pools the same month from different years together.
monthly_scores = final_data.groupby('Month')['Score'].sum()
yearly_scores = final_data.groupby('Year')['Score'].sum()

# Plot the monthly totals
plt.figure(figsize=(10,6))
monthly_scores.plot(kind='bar')
plt.title('Best Performing Months')
plt.xlabel('Month')
# The plotted values are sums, so label the axis as totals rather than means
plt.ylabel('Total Score')
plt.show()

In [None]:
# Yearly performance based on scores
# Plot the per-year score totals held in yearly_scores
plt.figure(figsize=(10,6))
yearly_scores.plot(kind='bar')
plt.title('Distribution of Scores by Year')
plt.xlabel('Year')
# yearly_scores holds sums, so label the axis as totals rather than means
plt.ylabel('Total Score')
plt.show()