# 📓 Notebook 3: Insight Extraction (Customer Voice)

**Goal:** Extract actionable insights:
1.  **Tweet Issue Classification (Zero-Shot):** Identify Shipping vs Billing vs Technical issues.
2.  **Review Sizing Analysis (Rule-Based):** Detect "Running Small" vs "Running Large".
3.  **Sentiment Mapping:** Correlate rating with text sentiment.

In [None]:
!pip install -q transformers torch pandas loguru plotly

In [None]:
import os
import re

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import pipeline

# Read from the attached Kaggle dataset when it is mounted; otherwise fall
# back to the notebook's working directory.
INPUT_DIR = "/kaggle/input/customer-voice-processed-nb1"
if not os.path.exists(INPUT_DIR):
    INPUT_DIR = "/kaggle/working"

# Load the chunk table used by the cells below.
df = pd.read_parquet(f"{INPUT_DIR}/all_chunks.parquet")
print(f"Loaded {len(df)} docs total")

In [None]:
# ─── 1. SUPPORT TICKET CLASSIFICATION ───
# Run zero-shot only on tweets
tweets = df[df['source'] == 'twitter_support'].copy()

# Batch process 1000 for demo speed (remove slice for full run)
TARGET_TWEETS = tweets.head(1000)
# Report the number actually classified, not the size of the full tweet set.
print(f"Classifying {len(TARGET_TWEETS)} of {len(tweets)} tweets...")

SUPPORT_LABELS = ["shipping delay", "billing issue", "app technical error", "general inquiry", "complaint", "praise"]

# Load Classifier (first CUDA device when available, otherwise CPU)
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

results = []
failures = []  # (row index, error repr) for tweets the classifier rejected

for i, row in tqdm(TARGET_TWEETS.iterrows(), total=len(TARGET_TWEETS)):
    txt = row['text']
    try:
        pred = classifier(txt, candidate_labels=SUPPORT_LABELS)
    except Exception as e:
        # Best-effort run: one bad tweet must not abort the loop, but keep a
        # record so skipped rows are visible instead of silently dropped.
        failures.append((i, repr(e)))
        continue
    else:
        # Labels/scores come back sorted by score, so index 0 is the top guess.
        results.append({
            "text": txt,
            "author": row.get('author', 'unknown'),
            "issue_type": pred['labels'][0],
            "confidence": pred['scores'][0]
        })

if failures:
    print(f"Skipped {len(failures)} of {len(TARGET_TWEETS)} tweets due to classifier errors "
          f"(first: row {failures[0][0]}: {failures[0][1]})")

# Explicit columns keep the CSV header present even when nothing was classified.
pd.DataFrame(
    results, columns=["text", "author", "issue_type", "confidence"]
).to_csv("/kaggle/working/tweet_issues.csv", index=False)
print("Tweet classification complete.")

In [None]:
# ─── 2. PRODUCT SIZING ANALYSIS (REGEX) ───
# Heuristic analysis on reviews: work on an independent copy of the
# clothing-review rows so new columns do not touch `df`.
reviews = df.loc[df['source'].eq('clothing_reviews')].copy()
print(f"Analyzing {len(reviews)} reviews for sizing feedback...")

# Whole-word patterns (with common inflections) so that product names and
# unrelated words such as "tights", "ambiguous" or "enlarge" do not trigger a
# sizing label the way a raw substring test would.
_RUNS_SMALL_RE = re.compile(r"\b(?:small(?:er|est|ish)?|tight(?:er|est|ly|ness)?)\b")
_RUNS_LARGE_RE = re.compile(r"\b(?:large(?:r|st)?|loose(?:r|st|ly|ness)?|big(?:ger|gest)?)\b")
_TRUE_TO_SIZE_RE = re.compile(r"\b(?:perfect(?:ly|ion)?|fits well)\b")


def analyze_sizing(text):
    """Classify a review's sizing feedback with keyword rules.

    The input is coerced with ``str()`` and lower-cased, so missing values
    (``None``, ``NaN``) are accepted and fall through to ``"Neutral"``.

    Rules are checked in priority order and the first hit wins:
    "Runs Small" (small / tight), then "Runs Large" (large / loose / big),
    then "True to Size" (perfect / "fits well"); otherwise "Neutral".

    Args:
        text: Review text; any object is accepted.

    Returns:
        One of "Runs Small", "Runs Large", "True to Size" or "Neutral".
    """
    t = str(text).lower()
    if _RUNS_SMALL_RE.search(t):
        return "Runs Small"
    if _RUNS_LARGE_RE.search(t):
        return "Runs Large"
    if _TRUE_TO_SIZE_RE.search(t):
        return "True to Size"
    return "Neutral"

# Label every review with the heuristic sizing category.
reviews['sizing_feedback'] = reviews['text'].map(analyze_sizing)

# Save findings: review text, its label, and the columns exported alongside it.
reviews.loc[
    :, ['text', 'sizing_feedback', 'rating', 'category', 'age']
].to_csv("/kaggle/working/sizing_analysis.csv", index=False)

# Show how many reviews fell into each sizing category.
print("Sizing breakdown:", reviews['sizing_feedback'].value_counts(), sep="\n")