In [None]:
from IPython.display import Markdown, display
import numpy as np
import pandas as pd
import plotly.express as px

In [None]:
df = pd.read_csv('data/social_media_posts.csv')
df.head()


Unnamed: 0,post_id,user_id,post_time,post_hour,post_type,paid_boost,hashtags_count,followers_at_post,views,likes,clicks,comments,shares,watch_time_total_sec,like_rate,click_through_rate,comment_rate,session_time_normal,clicks_binomial
0,1,186,2025-06-12 03:23:46.951695,3,text,0,0,22828,23112,198,96,35,15,10889,0.008567,0.004154,0.001514,32.483571,4
1,2,273,2025-07-27 10:40:21.951695,10,image,0,5,4459,2893,148,29,5,6,1835,0.051158,0.010024,0.001728,29.308678,6
2,3,80,2025-08-25 13:23:31.951695,13,video,0,2,3607,5644,285,56,0,17,149281,0.050496,0.009922,0.0,33.238443,2
3,4,310,2025-08-16 12:44:40.951695,12,link,0,3,27786,10885,164,135,17,3,1236,0.015067,0.012402,0.001562,37.615149,6
4,5,131,2025-06-27 09:38:41.951695,9,video,0,4,3328,2974,163,58,0,8,193917,0.054808,0.019502,0.0,28.829233,3


In [10]:
def plot_histos(df, column='likes', title='title'):
    df = df.dropna(subset=column)
    mean_val = df[column].mean()
    mode_val = df[column].mode()[0]
    median_val = df[column].median()

    fig = px.histogram(df, x=column, template='plotly_dark', color_discrete_sequence=['#00CC96'], 
                       barmode='overlay',
                       title=title)


    fig.add_vline(x=mean_val, line_width=2, line_dash="dash", line_color="cyan",
                    annotation_text=f"Mean: {mean_val:.2f}", annotation_position="top right")
        

    fig.add_vline(x=mode_val, line_dash="dash", line_color="pink",
                    annotation_text=f"Mode: {mode_val:.2f}", annotation_position="top left")
        

    fig.add_vline(x=median_val, line_dash="dash", line_color="yellow", 
                    annotation_text=f"Median: {median_val:.2f}", annotation_position="bottom left")
    fig.show()
    display(Markdown(f"**{column}:** mean={mean_val:.2f}, median={median_val:.2f}, mode={mode_val}. "))

plot_histos(df, 'comments', title='distribution of comments')
plot_histos(df, 'session_time_normal', title='Distribution of session time (time spent in one session in secods)')
plot_histos(df, 'clicks_binomial', title='Probabily of N Clicks (out of 10 Views)')

**comments:** mean=13.13, median=0.00, mode=0. 

**session_time_normal:** mean=30.16, median=30.12, mode=13.793663299654638. 

**clicks_binomial:** mean=2.98, median=3.00, mode=3. 

## Teaching Notes — Comparing Shapes & Talking to PMs

| Distribution | Typical Shape | Story You’re Telling | PM-friendly Description | Reporting Tip |
|--------------|----------------|----------------------|-------------------------|---------------|
| **Poisson-like (Comments)** | Right-skewed, many lows, few highs | Most posts get few comments; rare spikes | “Most posts are quiet; a few go semi-viral.” | Use **median/IQR**, percentiles; avoid just the mean |
| **Normal (Session Time)** | Bell-shaped, symmetric | Users cluster around a typical usage | “Like commute times — most near the average.” | Mean & SD are fine; median ~ mean |
| **Binomial (Clicks out of 10, p=0.3)** | Discrete, skewed right when p<0.5 | Successes out of fixed tries | “10 chances; about 3 successes on average.” | Show **proportions with CIs**; aggregates over many trials |

**Skew rule of thumb:**  
Right-skewed data tends to have **mean > median > mode**; left-skewed tends to have **mean < median < mode**.  
When skewed, prefer **medians/percentiles** for summaries.

**Mode caution:** For continuous data, the exact “mode” often isn’t meaningful.
We estimate it by the **densest histogram bin** and label it as an approximation.
If you’d prefer **KDE-based** mode, say the word and I’ll switch the estimator.