# 📊 Test Exploratory Data Analysis — Social Media Sentiment Insights

**Objective:**  
Understand the dataset structure, detect missing values, explore distributions and early patterns to guide future modeling or insights presentation.

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration.
# Allow up to 120 characters per cell before pandas truncates the text.
pd.options.display.max_colwidth = 120
# Render every figure with matplotlib's built-in "default" style sheet.
plt.style.use("default")

# Confirmation that the setup cell ran to completion.
print("✅ Libraries loaded correctly.")

## 📥 1. Load Dataset & Initial Overview

We'll load the dataset and quickly explore its structure to understand the number of rows, columns, and get a glimpse of the data.

In [None]:
# Load dataset
# NOTE: the relative path is resolved against the current working directory
# of the notebook kernel.
df = pd.read_csv("../data/sentimentdataset.csv")
print("✅ Dataset loaded. Shape:", df.shape)  # shape is (rows, columns)

# Quick peek
# Last expression of the cell, so the first 30 rows are rendered as output.
df.head(30)

In [None]:
# Print a structural summary: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of distinct values in each column (missing values are not counted by default).
df.nunique()

In [None]:
# Ten posts with the highest number of likes, highest first.
(
    df
    .sort_values(by="Likes", ascending=False)
    .head(10)
)

In [None]:
# Ten posts with the highest number of retweets, highest first.
(
    df
    .sort_values(by="Retweets", ascending=False)
    .head(10)
)

In [None]:
# Pairwise correlation between the numeric columns, rounded for display.
df_corr = df.corr(numeric_only=True).round(2)

# Correlations live on a fixed [-1, 1] scale, so pin the colour range to it
# and shade the table as a whole (axis=None). Without this the gradient is
# re-normalised column by column, and the neutral midpoint of the diverging
# "coolwarm" map does not line up with a correlation of 0.
df_corr.style.format("{:.2f}").background_gradient(
    cmap="coolwarm", axis=None, vmin=-1, vmax=1
)

**Observations:**
- The `Unnamed` columns should be removed before further analysis.
- Likes and Retweets show near-total correlation with each other.

In [None]:
# Same correlation table as before, this time shaded with the "magma" colormap.
df_corr = df.corr(numeric_only=True).round(2)
(
    df_corr.style
    .format("{:.2f}")
    .background_gradient(cmap="magma")
)

When applying a background gradient, especially in data visualization libraries like Pandas or Matplotlib, the cmap argument (colormap) specifies the color scheme to be used. While coolwarm provides a diverging colormap ranging from cool blues to warm reds, many other options exist, each with a distinct aesthetic and purpose.
Here are some categories of colormaps and examples of other colors you could use:
1. Sequential Colormaps: These colormaps are designed to show a progression of values, typically from low to high, using a single hue or a gradual change in lightness/saturation.
Examples: viridis, plasma, inferno, magma, gray, Blues, Greens, Reds, Purples, Oranges.
2. Diverging Colormaps: Similar to coolwarm, these colormaps emphasize a central neutral value and diverge to two distinct colors at the extremes, suitable for showing deviations from a mean or a zero point.
Examples: seismic, RdBu, PiYG, PRGn, BrBG, bwr.
3. Qualitative Colormaps: These colormaps are designed to distinguish between discrete categories or groups, using distinct and easily differentiable colors.
Examples: tab10, tab20, Paired, Set1, Set2, Dark2.
4. Cyclic Colormaps: These colormaps are useful for data that wraps around a central point, like angles or phases, where the start and end colors are the same or very similar.
Examples: twilight, hsv.

To choose the best colormap, consider:

- **Data type:** Is your data sequential, diverging, or categorical?
- **Clarity:** Does the colormap convey the information without causing misinterpretation?
- **Accessibility:** Is the colormap colorblind-friendly? (For example, viridis is often recommended for this.)
- **Aesthetics:** Does the colormap align with the overall design and purpose of your visualization?

In [None]:
# Derive simple per-post text features and correlate them with engagement.
post_text = df['Text']

df['text_len'] = post_text.str.len()                 # characters per post
df['word_count'] = post_text.str.split().str.len()   # whitespace-separated tokens
df['hashtag_count'] = df['Hashtags'].str.count('#')  # '#' characters in the Hashtags field
# Emojis/symbols: counts every character that is not a word character,
# whitespace or a comma, so ordinary punctuation is included in the total too.
df['emoji_count'] = post_text.str.count(r'[^\w\s,]')

feature_columns = ['text_len', 'word_count', 'hashtag_count', 'emoji_count', 'Likes', 'Retweets']
df[feature_columns].corr()

In [None]:
# Set the default figure size *before* drawing. rcParams only apply to figures
# created afterwards, so assigning it after sns.heatmap() left this plot at
# the previous default size.
plt.rcParams['figure.figsize'] = (20, 7)

# Annotated heatmap of the pairwise correlations between the numeric columns.
sns.heatmap(df.corr(numeric_only=True), annot=True)

plt.show()

In [None]:
# Average likes and retweets per platform, most-liked platform first.
# Group on a normalised copy of the labels (trimmed, title-cased) so that
# variants such as 'Twitter ' and 'twitter' are aggregated together instead
# of appearing as separate platforms. The DataFrame itself is not modified.
platform_key = df['Platform'].str.strip().str.title()
df_groupby = (
    df.groupby(platform_key)[['Likes', 'Retweets']]
    .mean()
    .sort_values(by='Likes', ascending=False)
)
df_groupby.style.format("{:.1f}")

In [None]:
# Number of posts for each distinct value of the raw 'Country' column, most frequent first.
df['Country'].value_counts()


The strings may contain empty or whitespace-padded values, which messes up the `Country` data (the same country can show up under several distinct labels).

In [None]:
# Share of posts per country, in percent.
# Surrounding whitespace is stripped first so that differently padded copies
# of the same country name are counted as a single value.
df['Country'].str.strip().value_counts(normalize=True) * 100

In [None]:
# Mean engagement and text features per country, most-liked country first.
# Group on a whitespace-stripped copy of the labels so that padded variants
# of the same country fall into one group. The DataFrame is not modified.
# Requires the 'emoji_count' and 'word_count' columns to exist already.
country_key = df['Country'].str.strip()
(
    df.groupby(country_key)[['Likes', 'Retweets', 'emoji_count', 'word_count']]
    .mean()
    .sort_values(by='Likes', ascending=False)
)

In [None]:
# Mean likes and retweets for each sentiment label, ordered by retweets (highest first).
df_groupby = (
    df
    .groupby('Sentiment')[['Likes', 'Retweets']]
    .mean()
    .sort_values(by='Retweets', ascending=False)
)
# Render the table with one decimal place.
df_groupby.style.format("{:.1f}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# --- CLEANUP ---
# Normalise the platform labels (cast to str, trim whitespace, title-case)
# to avoid duplicates like 'Twitter ' vs 'twitter'.
df["Platform"] = (
    df["Platform"]
    .astype(str)
    .str.strip()
    .str.title()
)


In [None]:

# Reload the raw CSV. This rebinds `df`, so any columns added to it by
# earlier cells are no longer present afterwards.
# NOTE(review): this path differs from the "../data/sentimentdataset.csv" used
# by the first load cell — confirm which location is the right one.
df = pd.read_csv("sentimentdataset.csv")
print(df.head())
print(df.shape)

# Drop the 'Unnamed: 0.1' column and expose 'Unnamed: 0' as 'Id'
# (presumably leftover index columns from an earlier export — verify).
df = df.drop(columns='Unnamed: 0.1').rename(columns={'Unnamed: 0': 'Id'})

# Print the sanity checks explicitly: a bare expression in the middle of a
# cell is evaluated and then discarded, so these results were never shown.
print(df.isnull().sum())
print(df.dtypes)

# Normalise platform labels so padded / differently cased variants collapse.
df["Platform"] = df["Platform"].astype(str).str.strip().str.title()

# Keep only the posts whose sentiment is one of the 10 most frequent labels.
top10_sentiments = df["Sentiment"].value_counts().head(10).index
df_platform = df[df["Sentiment"].isin(top10_sentiments)].copy()

print("Platforms in dataset:", df_platform["Platform"].unique())


In [None]:
# --- 1) Count of sentiments per platform ---
# NOTE: the plotting code below is wrapped in a triple-quoted string, so it is
# disabled — Python evaluates it as a plain string expression and runs none of it.
# If re-enabled it would count posts per (Platform, Sentiment) in df_platform,
# convert each platform's counts to row percentages, and draw them as stacked
# bars (one bar per platform, one segment per sentiment).
'''platform_counts = (
    df_platform.groupby(["Platform", "Sentiment"])
    .size()
    .unstack(fill_value=0)
)
platform_percent = platform_counts.div(platform_counts.sum(axis=1), axis=0) * 100
plt.figure(figsize=(10,6))
bottom_val = np.zeros(len(platform_percent))

for sentiment in platform_percent.columns:
    plt.bar(platform_percent.index, platform_percent[sentiment],
            bottom=bottom_val, label=sentiment)
    bottom_val += platform_percent[sentiment]

plt.title("Sentiment Distribution Across Platforms (Top 10 Sentiments)")
plt.xlabel("Platform")
plt.ylabel("Percentage of Posts (%)")
plt.legend(title="Sentiment", fontsize=7, ncol=2)
plt.tight_layout()
plt.show()'''

# Analyst note on the chart above, also kept as a bare string rather than a comment.
'''as expected, positive, joy, excitment are leading sentiments across platforms.
nothing really stands out here'''


In [None]:

# --- 2) Temporal trend per platform ---
# Extract month for grouping
# NOTE: the code below is wrapped in a triple-quoted string, so it is disabled
# and none of it runs. If re-enabled it would parse "Timestamp" into a month
# number (unparseable values become NaT via errors="coerce"), then draw one
# stacked area chart of monthly post counts per sentiment for each platform.
'''df_platform["Month"] = pd.to_datetime(df_platform["Timestamp"], errors="coerce").dt.month

# Loop through each platform and plot separate area charts
for platform in df_platform["Platform"].unique():
    sub = df_platform[df_platform["Platform"] == platform]
    monthly = sub.groupby(["Month","Sentiment"]).size().reset_index(name="Count")
    pivot_month = monthly.pivot_table(index="Month", columns="Sentiment", values="Count", fill_value=0)

    plt.figure(figsize=(9,5))
    pivot_month.plot(kind="area", alpha=0.8, figsize=(9,5))
    plt.title(f"Monthly Sentiment Trends on {platform} (Top 10 Sentiments)")
    plt.xlabel("Month")
    plt.ylabel("Post Count")
    plt.legend(title="Sentiment", fontsize=7, ncol=2)
    plt.tight_layout()
    plt.show()'''

# Analyst notes for each platform's chart, kept as bare strings rather than comments.
''' monthly sentiments (twitter)
summer months is when theres a lot of excitment, joy type of hashtags being used. beginning of the year is more
positive sentiment and then near september to october, we see a rise in sad sentiments.'''

'''monthly sentiments (instagram)
similar trends as twitter with overwhelming joy and excitement during summers
and beginning of the year but instagram has a lot more hopeful sentiments in the summer + neutral in the fall'''

'''monthly sentiments (facebook)
joy sentiment spans across the year with the peak in summer months. 
'''

# NOTE(review): the line below reads like a follow-up item (use one colour per
# sentiment in every platform chart) — confirm whether it is still to be done.
###using consistent color scheme for sentiments across platforms

In [None]:
### --- 3) sentiment distribution by country TOP 10 ---
# NOTE: the code below is wrapped in a triple-quoted string, so it is disabled
# and none of it runs. If re-enabled it would count posts per
# (Country, Sentiment) and draw them as a stacked bar chart.
# NOTE(review): `df_geo` is not defined in any visible cell — it must be
# created before this code can be turned back on.
'''country_sentiment = (
    df_geo.groupby(["Country", "Sentiment"])
    .size()
    .unstack(fill_value=0)
)

country_sentiment.plot(kind="bar", stacked=True, figsize=(10,6))
plt.title("Sentiment Distribution by Country (Top 10)")
plt.xlabel("Country")
plt.ylabel("Number of Posts")
plt.legend(title="Sentiment", fontsize=7, ncol=2)
plt.tight_layout()
plt.show()'''

# Analyst note on the chart above, kept as a bare string rather than a comment.
'''
Top three countries, positive sentiment distribution is the highest across all three 
countries, with words such as positive, joy, and excitement taking the lead
'''

In [None]:

# --- 4) platform usage by country TOP 10 ---
# NOTE: the code below is wrapped in a triple-quoted string, so it is disabled
# and none of it runs. If re-enabled it would normalise the Country and
# Platform labels in `df` (trim + title-case), keep the 10 countries with the
# most posts, and draw a heatmap of post counts per (Country, Platform).
'''import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Clean up text columns
df["Country"] = df["Country"].astype(str).str.strip().str.title()
df["Platform"] = df["Platform"].astype(str).str.strip().str.title()

# Focus on top 10 countries (most posts)
top_countries = df["Country"].value_counts().head(10).index
df_country_platform = df[df["Country"].isin(top_countries)].copy()

heat_data = (
    df_country_platform.groupby(["Country", "Platform"])
    .size()
    .unstack(fill_value=0)
)

plt.figure(figsize=(10,6))
sns.heatmap(heat_data, cmap="YlGnBu", annot=True, fmt="d")
plt.title("Heatmap of Platform Usage by Country (Top 10)")
plt.xlabel("Platform")
plt.ylabel("Country")
plt.tight_layout()
plt.show()'''

# Analyst note on the heatmap above, kept as a bare string rather than a comment.
''' Instagram is the number one platform used across the top three countries and majority of the other Top 10s
However, India has Twitter as the most used platform'''




In [None]:

# --- 5) ENGAGEMENT VS SENTIMENT ---
# NOTE: the code below is wrapped in a triple-quoted string, so it is disabled
# and none of it runs. If re-enabled it would draw one box plot of
# "Engagement" per sentiment label (outliers hidden via showfliers=False),
# ordered by how often each sentiment occurs.
# NOTE(review): `df_top` and its "Engagement" column are not created in any
# visible cell; the plot title suggests Engagement = Likes + Retweets over the
# top 50 sentiments — confirm before turning this back on.
'''sentiments = df_top["Sentiment"].value_counts().index.tolist()
data_for_box = [df_top[df_top["Sentiment"]==s]["Engagement"].values for s in sentiments]

plt.figure(figsize=(12,6))
plt.boxplot(data_for_box, labels=sentiments, showfliers=False)
plt.title("Engagement (Likes + Retweets) by Sentiment (Top 50)")
plt.xlabel("Sentiment")
plt.ylabel("Engagement")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()'''