[Reference](https://medium.com/@Rohan_Dutt/10-data-analysis-approaches-used-in-real-world-reporting-not-textbook-dashboards-c3dba4bbfb09)

# 1. Data Storytelling to Drive Action
Insights don't create impact; decisions do. Data storytelling turns analysis into a clear narrative that motivates action.

# 2. Machine Learning for Pattern Detection

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Read the per-user behavior table and select the columns used for clustering.
df = pd.read_csv("user_behavior.csv")
features = [
    "total_sessions",
    "avg_session_duration",
    "pages_per_session",
    "purchases_last_30d",
]
X = df[features]

# Standardize each feature before clustering; k-means is distance-based, so
# unscaled columns with large numeric ranges would dominate the result.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Fit k-means with a fixed seed (reproducible) and 10 restarts.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(X_scaled)
df["cluster_id"] = kmeans.labels_

# Per-cluster feature means on the ORIGINAL (unscaled) columns, for readability.
cluster_profiles = df.groupby("cluster_id")[features].mean().round(2)
cluster_profiles

In [2]:
# Choosing k (Quick Reality Check)
from sklearn.metrics import silhouette_score

# Mean silhouette coefficient of the current cluster assignment, computed on
# the same scaled matrix the model was fitted on.
cluster_labels = df["cluster_id"]
silhouette = silhouette_score(X_scaled, labels=cluster_labels)
silhouette

```
Rule of thumb: Silhouette > 0.4 = meaningful separation
```

# 3. Geospatial Analysis for Location-Based Insights

### SQL based Geospatial (Customer Clusters & Coverage)
```sql
-- Find high-spend customer grid cells that have no store nearby.
-- Points are stored as lon/lat (SRID 4326). Distances are computed on the
-- geography type so that ST_Distance / ST_DWithin work in metres; on plain
-- 4326 geometry they would work in degrees.
WITH customers AS (
  SELECT
    customer_id,
    spend,
    ST_SetSRID(ST_Point(lon, lat), 4326) AS geom
  FROM customer_locations
),
stores AS (
  SELECT
    store_id,
    ST_SetSRID(ST_Point(lon, lat), 4326) AS geom
  FROM store_locations
),
customer_density AS (
  SELECT
    ST_SnapToGrid(geom, 0.05) AS grid_cell,   -- 0.05 degrees, ~5km north-south
    COUNT(*) AS customers,
    SUM(spend) AS total_spend
  FROM customers
  GROUP BY 1
),
underserved AS (
  SELECT
    d.grid_cell,
    d.customers,
    d.total_spend,
    -- metres -> km; NULL when no store lies inside the search radius
    MIN(ST_Distance(d.grid_cell::geography, s.geom::geography)) / 1000
      AS km_to_nearest_store
  FROM customer_density d
  LEFT JOIN stores s
    ON ST_DWithin(d.grid_cell::geography, s.geom::geography, 50000) -- 50km search radius
  GROUP BY d.grid_cell, d.customers, d.total_spend
)
SELECT *
FROM underserved
WHERE km_to_nearest_store > 20
   OR km_to_nearest_store IS NULL   -- no store within 50km: the most underserved cells
ORDER BY total_spend DESC;
```

### Python based Geospatial (Customer Clusters & Coverage)

In [3]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np

# Load data
customers_df = pd.read_csv("customer_locations.csv")  # customer_id, lat, lon, spend
stores_df = pd.read_csv("store_locations.csv")        # store_id, lat, lon

# Create GeoDataFrames (WGS84)
customers = gpd.GeoDataFrame(
    customers_df,
    geometry=gpd.points_from_xy(customers_df.lon, customers_df.lat),
    crs="EPSG:4326",
)

stores = gpd.GeoDataFrame(
    stores_df,
    geometry=gpd.points_from_xy(stores_df.lon, stores_df.lat),
    crs="EPSG:4326",
)

# Project to a metre-based CRS for gridding and distance calculations.
# NOTE(review): EPSG:3857 (Web Mercator) stretches distances away from the
# equator; for accurate kilometres use a local projected CRS instead
# (e.g. customers.estimate_utm_crs()) -- confirm against the data's extent.
customers = customers.to_crs(epsg=3857)
stores = stores.to_crs(epsg=3857)

# Grid-based clustering (~5km): snap every customer to the lower-left corner
# of the grid cell it falls in.
grid_size = 5000  # meters

customers["grid_x"] = (customers.geometry.x // grid_size) * grid_size
customers["grid_y"] = (customers.geometry.y // grid_size) * grid_size

# Vectorized construction of the cell anchor point (no row-wise apply)
customers["grid_cell"] = gpd.points_from_xy(
    customers["grid_x"], customers["grid_y"]
)

# Aggregate customer density & spend per cell. Grouping on the numeric grid
# coordinates avoids hashing geometry objects.
density = (
    customers.groupby(["grid_x", "grid_y"])
    .agg(
        customers=("customer_id", "count"),
        total_spend=("spend", "sum"),
    )
    .reset_index()
)

density["grid_cell"] = gpd.points_from_xy(density["grid_x"], density["grid_y"])
density = gpd.GeoDataFrame(
    density[["grid_cell", "customers", "total_spend"]],
    geometry="grid_cell",
    crs=customers.crs,
)

# Distance to nearest store, in km (NaN when there are no stores at all)
density["km_to_nearest_store"] = density.geometry.apply(
    lambda g: stores.geometry.distance(g).min() / 1000
)

# Underserved cells: nearest store more than 20km away (or no store at all),
# highest total spend first -- mirrors the final SELECT of the SQL version.
underserved = density[
    (density["km_to_nearest_store"] > 20)
    | density["km_to_nearest_store"].isna()
].sort_values("total_spend", ascending=False)

underserved

# 4. Network Analysis to Map Relationships

## SQL based Network Analysis (Influence & Connectors)
```sql
-- Per-user influence metrics from the follow graph:
--   in_degree / out_degree : followers / followees
--   bridge_score           : number of distinct (from, to) user pairs that are
--                            connected by a two-hop path THROUGH this user,
--                            a cheap stand-in for betweenness centrality.
WITH edges AS (
  SELECT
    follower_id AS src,
    followee_id AS dst
  FROM user_follows
),
degree AS (
  SELECT
    user_id,
    SUM(in_degree) AS in_degree,
    SUM(out_degree) AS out_degree
  FROM (
    SELECT dst AS user_id, COUNT(*) AS in_degree, 0 AS out_degree
    FROM edges
    GROUP BY dst
    UNION ALL
    SELECT src AS user_id, 0 AS in_degree, COUNT(*) AS out_degree
    FROM edges
    GROUP BY src
  ) d
  GROUP BY user_id
),
two_hop_paths AS (
  -- from_user -> via_user -> to_user; DISTINCT so duplicate edges count once
  SELECT DISTINCT
    e1.src AS from_user,
    e1.dst AS via_user,
    e2.dst AS to_user
  FROM edges e1
  JOIN edges e2
    ON e1.dst = e2.src
  WHERE e1.src <> e2.dst      -- ignore round trips back to the start
    AND e1.src <> e1.dst      -- ignore self-follows on either hop
    AND e2.src <> e2.dst
),
betweenness_proxy AS (
  -- Credit the INTERMEDIATE user: that is the node doing the bridging.
  SELECT
    via_user AS user_id,
    COUNT(*) AS bridge_count
  FROM two_hop_paths
  GROUP BY via_user
)
SELECT
  d.user_id,
  d.in_degree,
  d.out_degree,
  COALESCE(b.bridge_count, 0) AS bridge_score
FROM degree d
LEFT JOIN betweenness_proxy b
  ON d.user_id = b.user_id
ORDER BY bridge_score DESC, in_degree DESC;
```

## Python based Network Analysis (Influence & Connectors)


In [4]:
import pandas as pd
import networkx as nx

# Read the follow relationships: one row per (follower_id, followee_id) edge.
edges = pd.read_csv("user_follows.csv")

# Directed graph: an edge points from the follower to the account followed.
G = nx.from_pandas_edgelist(
    edges, source="follower_id", target="followee_id", create_using=nx.DiGraph
)

# Per-node degree lookups.
in_degree = dict(G.in_degree())
out_degree = dict(G.out_degree())

# Normalized betweenness centrality for every node.
betweenness = nx.betweenness_centrality(G, normalized=True)

# One row per node, ranked by betweenness and then by in-degree (both descending).
node_rows = [
    (
        node,
        in_degree.get(node, 0),
        out_degree.get(node, 0),
        betweenness.get(node, 0),
    )
    for node in G.nodes()
]
influence = pd.DataFrame(
    node_rows, columns=["user_id", "in_degree", "out_degree", "betweenness"]
).sort_values(by=["betweenness", "in_degree"], ascending=[False, False])

influence.head(10)

# 5. A/B Testing to Validate Assumptions

## SQL based A/B Testing (Conversion Rate + Significance)
```sql
-- Two-variant conversion test for one experiment: conversion rate per
-- variant, absolute lift (B - A) and an unpooled two-proportion z-score.
WITH experiment_data AS (
  SELECT
    variant,                           -- 'A' or 'B'
    COUNT(*) AS users,
    SUM(CASE WHEN converted = 1 THEN 1 ELSE 0 END) AS conversions
  FROM ab_events
  WHERE experiment_name = 'landing_page_v2'
  GROUP BY variant
),
rates AS (
  SELECT
    variant,
    users,
    conversions,
    conversions * 1.0 / users AS conversion_rate
  FROM experiment_data
),
stats AS (
  -- Pivot the two variant rows into a single row
  SELECT
    MAX(CASE WHEN variant = 'A' THEN conversion_rate END) AS cr_a,
    MAX(CASE WHEN variant = 'B' THEN conversion_rate END) AS cr_b,
    MAX(CASE WHEN variant = 'A' THEN users END) AS n_a,
    MAX(CASE WHEN variant = 'B' THEN users END) AS n_b
  FROM rates
)
SELECT
  cr_a,
  cr_b,
  ROUND(cr_b - cr_a, 4) AS lift,
  ROUND(
    (cr_b - cr_a) /
    -- The standard error is 0 when both rates are exactly 0 or 1; NULLIF
    -- yields a NULL z-score instead of a division-by-zero error.
    NULLIF(
      SQRT(
        (cr_a * (1 - cr_a) / n_a) +
        (cr_b * (1 - cr_b) / n_b)
      ),
      0
    ),
    3
  ) AS z_score
FROM stats;
```

## Python based A/B Testing (Conversion Rate + Significance)


In [5]:
import pandas as pd
import numpy as np
from scipy.stats import norm

# Load data
df = pd.read_csv("ab_events.csv")

# Filter experiment
df = df[df["experiment_name"] == "landing_page_v2"]

# Aggregate metrics. "size" counts every row (like COUNT(*) in the SQL
# version), so rows with a missing user_id still count in the denominator
# that their `converted` value contributes to.
summary = (
    df.groupby("variant")
      .agg(
          users=("user_id", "size"),
          conversions=("converted", "sum"),
      )
      .reset_index()
)

# Extract A/B values; a missing variant fails with a KeyError naming it.
by_variant = summary.set_index("variant")
a = by_variant.loc["A"]
b = by_variant.loc["B"]

cr_a = a["conversions"] / a["users"]
cr_b = b["conversions"] / b["users"]

lift = cr_b - cr_a

# Z-test for proportions (unpooled standard error)
se = np.sqrt(
    (cr_a * (1 - cr_a) / a["users"]) +
    (cr_b * (1 - cr_b) / b["users"])
)

# se is 0 when both rates are exactly 0 or 1: the statistic is undefined then.
z_score = lift / se if se > 0 else float("nan")
# Two-sided p-value. norm.sf keeps precision in the far tail, where
# 1 - norm.cdf(...) rounds to exactly 0.
p_value = 2 * norm.sf(abs(z_score))

# Results
results = {
    "cr_a": round(cr_a, 4),
    "cr_b": round(cr_b, 4),
    "lift": round(lift, 4),
    "z_score": round(z_score, 3),
    "p_value": round(p_value, 4),
}

results

# 6. Time Series Forecasting for Strategic Decisions
```sql
-- Seasonality-aware naive forecast of monthly visits:
--   forecast = 0.6 * (mean of the previous 3 months) + 0.4 * (same month last year)
-- The month calendar guarantees one row per calendar month, so "3 rows back"
-- and "12 rows back" really mean 3 and 12 months back even if a month had no visits.
WITH monthly_counts AS (
  SELECT
    DATE_TRUNC('month', visit_date) AS month,
    COUNT(*) AS visits
  FROM web_visits
  GROUP BY 1
),
calendar AS (
  SELECT
    generate_series(MIN(month), MAX(month), INTERVAL '1 month') AS month
  FROM monthly_counts
),
monthly_traffic AS (
  SELECT
    c.month,
    COALESCE(m.visits, 0) AS visits   -- months without any visit count as 0
  FROM calendar c
  LEFT JOIN monthly_counts m
    ON m.month = c.month
),
features AS (
  SELECT
    month,
    visits,
    ROW_NUMBER() OVER (ORDER BY month) AS t,
    AVG(visits) OVER (
      ORDER BY month
      ROWS BETWEEN 3 PRECEDING AND 1 PRECEDING
    ) AS rolling_avg_3m,
    LAG(visits, 12) OVER (ORDER BY month) AS visits_last_year
  FROM monthly_traffic
),
forecast AS (
  SELECT
    month,
    visits,
    -- NULL for the first 12 months, where there is no prior-year value
    ROUND(
      0.6 * rolling_avg_3m +
      0.4 * visits_last_year
    ) AS forecast_visits
  FROM features
)
SELECT *
FROM forecast
ORDER BY month;
```


## Python based Time Series Forecasting (Seasonality-Aware)

In [6]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("web_visits.csv", parse_dates=["visit_date"])

# Monthly aggregation (one count per month that has at least one visit)
visits_by_month = (
    df.assign(month=df["visit_date"].dt.to_period("M").dt.to_timestamp())
      .groupby("month")
      .size()
)

# Fill calendar gaps with 0 so the row-based rolling window and shift(12)
# below really look 3 and 12 MONTHS back, not 3 and 12 rows back.
if not visits_by_month.empty:
    all_months = pd.date_range(
        visits_by_month.index.min(), visits_by_month.index.max(), freq="MS"
    )
    visits_by_month = visits_by_month.reindex(all_months, fill_value=0)

monthly = (
    visits_by_month
      .rename("visits")
      .rename_axis("month")
      .reset_index()
      .sort_values("month")
)

# Feature engineering (same as SQL)
monthly["t"] = range(1, len(monthly) + 1)

# Mean of the three months BEFORE the current one (shift keeps the current
# month out of its own forecast)
monthly["rolling_avg_3m"] = (
    monthly["visits"]
    .rolling(window=3)
    .mean()
    .shift(1)
)

monthly["visits_last_year"] = monthly["visits"].shift(12)

# Forecast logic: NaN for the first 12 months, which have no prior-year value
monthly["forecast_visits"] = (
    0.6 * monthly["rolling_avg_3m"] +
    0.4 * monthly["visits_last_year"]
).round()

monthly

# 7. Sentiment Analysis for Unstructured Data


## Python based Sentiment Analysis (vaderSentiment)

In [7]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load data
df = pd.read_csv("support_emails.csv", parse_dates=["created_at"])

# Initialize sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Treat missing email bodies as empty text so that neither the scorer nor the
# keyword tagger ever receives NaN.
email_text = df["email_text"].fillna("").astype(str)

# Compute sentiment score (-1 to +1)
df["sentiment_score"] = email_text.apply(
    lambda x: analyzer.polarity_scores(x)["compound"]
)

# Normalize text (lower-cased copy used for keyword matching)
df["text"] = email_text.str.lower()

# Issue tagging (rule-based, transparent)
def tag_issue(text):
    """Return the issue category for an email body based on keyword matches.

    Rules are checked in order and the first match wins; matching is a plain
    substring test, so callers should pass lower-cased text.

    Args:
        text: Email body. Non-string values (e.g. ``None`` or NaN from a
            missing cell) are treated as having no keywords.

    Returns:
        One of "cleanliness", "delivery", "billing", "support" or "other".
    """
    if not isinstance(text, str):
        return "other"

    # (tag, keywords) pairs in priority order
    rules = (
        ("cleanliness", ("clean",)),
        ("delivery", ("delay", "late")),
        ("billing", ("refund", "charge")),
        ("support", ("support", "agent")),
    )
    for tag, keywords in rules:
        if any(keyword in text for keyword in keywords):
            return tag
    return "other"

# Tag every email using the lower-cased text column.
df["issue_tag"] = df["text"].map(tag_issue)

# Per-tag volume, mean sentiment and number of strongly negative emails
# (compound score below -0.3), most strongly-negative tags first.
strongly_negative = df["sentiment_score"].lt(-0.3)
per_tag = df.assign(_strongly_negative=strongly_negative).groupby("issue_tag")
summary = per_tag.agg(
    email_count=("email_id", "count"),
    avg_sentiment=("sentiment_score", "mean"),
    strongly_negative_count=("_strongly_negative", "sum"),
).reset_index()
summary = summary.sort_values(by="strongly_negative_count", ascending=False)

summary

## SQL based Sentiment Analysis (Keyword + Scoring)
```sql
-- Per-issue email volume, mean sentiment and count of strongly negative
-- emails (score below -0.3). Reads a sentiment_score column already stored
-- on support_emails; issue tags come from keyword matches on the lower-cased
-- body, first matching rule wins.
WITH tagged AS (
  SELECT
    e.sentiment_score,           -- range: -1 to +1
    CASE
      WHEN e.text LIKE '%clean%'                            THEN 'cleanliness'
      WHEN e.text LIKE '%delay%'   OR e.text LIKE '%late%'   THEN 'delivery'
      WHEN e.text LIKE '%refund%'  OR e.text LIKE '%charge%' THEN 'billing'
      WHEN e.text LIKE '%support%' OR e.text LIKE '%agent%'  THEN 'support'
      ELSE 'other'
    END AS issue_tag
  FROM (
    SELECT
      sentiment_score,
      LOWER(email_text) AS text
    FROM support_emails
  ) AS e
)
SELECT
  issue_tag,
  COUNT(*)                                         AS email_count,
  ROUND(AVG(sentiment_score), 3)                   AS avg_sentiment,
  COUNT(*) FILTER (WHERE sentiment_score < -0.3)   AS strongly_negative_count
FROM tagged
GROUP BY issue_tag
ORDER BY strongly_negative_count DESC;
```

# 8. Regression Analysis to Isolate Key Drivers

## Python based Regression (Linear Regression via Aggregation)

In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Read transaction-level sales rows.
df = pd.read_csv("sales_data.csv", parse_dates=["sale_date"])

# Roll up to one row per calendar month (same aggregates as the SQL version).
month_start = df["sale_date"].dt.to_period("M").dt.to_timestamp()
features = (
    df.assign(month=month_start)
      .groupby("month")
      .agg(
          ad_spend=("ad_spend", "sum"),
          discounts=("discount_amount", "sum"),
          holiday_ratio=("is_holiday", "mean"),
          sales=("revenue", "sum"),
      )
      .reset_index()
)

# Z-score the predictors so each coefficient reads as "change in sales per
# one standard deviation of the driver"; the target stays in original units.
predictors = ["ad_spend", "discounts", "holiday_ratio"]
raw = features[predictors]
z_scores = (raw - raw.mean()) / raw.std()
features = features.join(z_scores.add_suffix("_z"))

# Design matrix with an intercept column, and the response.
X = sm.add_constant(features[["ad_spend_z", "discounts_z", "holiday_ratio_z"]])
y = features["sales"]

# Ordinary least squares on all three drivers jointly.
model = sm.OLS(y, X).fit()

model.summary()

## SQL based Regression (Linear Regression via Aggregation)
```sql
-- Monthly driver analysis using SQL regression aggregates.
-- NOTE: every REGR_* call below takes exactly ONE predictor, so each beta is
-- the slope of a separate one-variable regression of sales on that driver.
-- These are not the coefficients of a joint model fitted on all three
-- predictors at once.
WITH features AS (
  -- One row per calendar month
  SELECT
    DATE_TRUNC('month', sale_date) AS month,
    SUM(ad_spend)                  AS ad_spend,
    SUM(discount_amount)           AS discounts,
    AVG(is_holiday::int)           AS holiday_ratio,
    SUM(revenue)                   AS sales
  FROM sales_data
  GROUP BY 1
),
normalized AS (
  -- Z-score each driver over all months (empty OVER () = whole result set);
  -- sales is left in its original units
  SELECT
    month,
    (ad_spend - AVG(ad_spend) OVER ()) / STDDEV(ad_spend) OVER () AS ad_spend_z,
    (discounts - AVG(discounts) OVER ()) / STDDEV(discounts) OVER () AS discounts_z,
    (holiday_ratio - AVG(holiday_ratio) OVER ()) / STDDEV(holiday_ratio) OVER () AS holiday_z,
    sales
  FROM features
)
SELECT
  REGR_SLOPE(sales, ad_spend_z)     AS beta_ad_spend,
  REGR_SLOPE(sales, discounts_z)    AS beta_discounts,
  REGR_SLOPE(sales, holiday_z)      AS beta_holidays,
  -- intercept and r_squared describe the ad_spend_z-only fit
  REGR_INTERCEPT(sales, ad_spend_z) AS intercept,
  REGR_R2(sales, ad_spend_z)        AS r_squared
FROM normalized;
```

# 9. Cohort Analysis for Hidden Trends

## SQL Cohort Analysis (Retention)
```sql
-- Monthly retention by signup cohort: for every (cohort month, months since
-- signup) pair, the distinct active users and their share of the cohort.
WITH user_cohorts AS (
  SELECT
    user_id,
    DATE_TRUNC('month', signup_date) AS cohort_month
  FROM users
),
cohort_sizes AS (
  -- Denominator: users who signed up in each month
  SELECT
    cohort_month,
    COUNT(DISTINCT user_id) AS cohort_size
  FROM user_cohorts
  GROUP BY cohort_month
),
activity AS (
  -- Every activity row tagged with the acting user's cohort
  SELECT
    uc.user_id,
    uc.cohort_month,
    DATE_TRUNC('month', ua.activity_date) AS activity_month
  FROM user_activity ua
  JOIN user_cohorts uc
    ON uc.user_id = ua.user_id
),
cohort_metrics AS (
  SELECT
    cohort_month,
    activity_month,
    -- Whole months between the cohort month and the activity month
    12 * (DATE_PART('year', activity_month) - DATE_PART('year', cohort_month))
      + (DATE_PART('month', activity_month) - DATE_PART('month', cohort_month))
      AS cohort_age,
    COUNT(DISTINCT user_id) AS active_users
  FROM activity
  GROUP BY cohort_month, activity_month
)
SELECT
  m.cohort_month,
  m.cohort_age,
  m.active_users,
  ROUND(m.active_users * 1.0 / s.cohort_size, 3) AS retention_rate
FROM cohort_sizes s
JOIN cohort_metrics m
  ON s.cohort_month = m.cohort_month
ORDER BY cohort_month, cohort_age;
```

## Python Cohort Analysis (Retention)

In [9]:
import pandas as pd
import numpy as np

# Read signups and activity events.
users = pd.read_csv("users.csv", parse_dates=["signup_date"])
activity = pd.read_csv("user_activity.csv", parse_dates=["activity_date"])

# A user's cohort is the first day of their signup month.
users["cohort_month"] = users["signup_date"].dt.to_period("M").dt.to_timestamp()

# Attach the cohort to every activity row; activity from unknown users is dropped.
df = pd.merge(
    activity,
    users[["user_id", "cohort_month"]],
    how="inner",
    on="user_id",
)

# First day of the month in which the activity happened.
df["activity_month"] = df["activity_date"].dt.to_period("M").dt.to_timestamp()

# Whole months elapsed between signup month and activity month.
year_gap = df["activity_month"].dt.year - df["cohort_month"].dt.year
month_gap = df["activity_month"].dt.month - df["cohort_month"].dt.month
df["cohort_age"] = year_gap * 12 + month_gap

# Distinct active users for each (cohort, age) pair.
cohort_metrics = (
    df.groupby(["cohort_month", "cohort_age"])["user_id"]
      .nunique()
      .reset_index(name="active_users")
)

# Distinct users who signed up in each cohort month.
cohort_sizes = (
    users.groupby("cohort_month")["user_id"]
         .nunique()
         .reset_index(name="cohort_size")
)

# Retention = active users / cohort size, rounded to 3 decimals.
retention = pd.merge(cohort_metrics, cohort_sizes, how="left", on="cohort_month")
retention["retention_rate"] = (
    retention["active_users"].div(retention["cohort_size"]).round(3)
)

retention.sort_values(["cohort_month", "cohort_age"])

# 10. Exploratory Data Analysis (EDA) Before Anything Else

## SQL based EDA
```sql
-- Per-district sanity profile of crime_data: record counts, missing ids,
-- distribution summary, and the number of rows whose crime_count exceeds the
-- district's mean + 3 standard deviations.
WITH base_stats AS (
  SELECT
    district,
    COUNT(*)                      AS total_records,
    COUNT(crime_id)               AS non_null_ids,
    COUNT(*) - COUNT(crime_id)    AS missing_ids,
    AVG(crime_count)              AS avg_crime,
    PERCENTILE_CONT(0.5)
      WITHIN GROUP (ORDER BY crime_count) AS median_crime,
    STDDEV(crime_count)           AS stddev_crime,
    MIN(crime_count)              AS min_crime,
    MAX(crime_count)              AS max_crime
  FROM crime_data
  GROUP BY district
),
scored AS (
  -- Every row alongside its district's 3-sigma threshold
  SELECT
    district,
    crime_date,
    crime_count,
    AVG(crime_count) OVER by_district +
    3 * STDDEV(crime_count) OVER by_district AS outlier_threshold
  FROM crime_data
  WINDOW by_district AS (PARTITION BY district)
),
outlier_days AS (
  SELECT
    district,
    COUNT(crime_date) AS extreme_outlier_days
  FROM scored
  WHERE crime_count > outlier_threshold
  GROUP BY district
)
SELECT
  b.*,
  COALESCE(o.extreme_outlier_days, 0) AS extreme_outlier_days
FROM base_stats b
LEFT JOIN outlier_days o
  ON b.district = o.district
ORDER BY extreme_outlier_days DESC;
```

## Python based EDA


In [10]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("crime_data.csv")

# Basic sanity checks, one row per district (same columns as the SQL version)
eda_summary = (
    df.groupby("district")
      .agg(
          total_records=("crime_id", "size"),
          non_null_ids=("crime_id", "count"),
          missing_ids=("crime_id", lambda x: x.isna().sum()),
          avg_crime=("crime_count", "mean"),
          median_crime=("crime_count", "median"),
          stddev_crime=("crime_count", "std"),
          min_crime=("crime_count", "min"),
          max_crime=("crime_count", "max"),
      )
      .reset_index()
)

# Outlier detection: 3-sigma rule per district
df["outlier_threshold"] = (
    df.groupby("district")["crime_count"]
      .transform(lambda x: x.mean() + 3 * x.std())
)

df["is_outlier"] = df["crime_count"] > df["outlier_threshold"]

outlier_counts = (
    df[df["is_outlier"]]
    .groupby("district")
    .size()
    .rename("extreme_outlier_days")
    .reset_index()
)

# Combine results. Districts without outliers come out of the left merge as
# NaN, which makes the column float; fill with 0 and restore an integer count.
final_eda = (
    eda_summary.merge(outlier_counts, on="district", how="left")
    .fillna({"extreme_outlier_days": 0})
    .astype({"extreme_outlier_days": int})
)

final_eda.sort_values("extreme_outlier_days", ascending=False)