# NLP Feedback Modeling Framework (NLP + Survey Analytics) — **Colab-ready + src modular**
**Autor:** David José Parales Araujo  

Este notebook está preparado para ejecutarse en **Google Colab** sin romper nada:

✅ Si `src/` **no existe**, lo crea automáticamente (módulos: preprocessing/targets/modeling/analytics).  
✅ Si ejecutas desde tu repo clonado, también funciona (solo reutiliza `src/`).  

Incluye:
- Índice 0–100 desde Likert (a partir del promedio Likert normalizado a [0, 1])
- Clasificación binaria + multiclase
- Desbalance (class_weight + threshold tuning + SMOTE opcional)
- Agregaciones listas para Power BI / Looker
- Interpretabilidad (top tokens por clase)

## 1) Install & Imports

In [None]:
# --- Colab: instalar dependencias (silencioso) ---
!pip -q install pandas numpy scikit-learn nltk matplotlib joblib imbalanced-learn

import os, sys, re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    classification_report, ConfusionMatrixDisplay,
    f1_score
)
import matplotlib.pyplot as plt

import nltk
# Tokenizer models and stop-word lists needed by the text preprocessing step.
# NLTK >= 3.8.2 loads the Punkt models from the "punkt_tab" resource, while
# older releases read the pickled "punkt" one. The pip install above does not
# pin a version, so both are requested; asking only for "punkt" makes
# word_tokenize fail with a LookupError on current releases.
for _nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_nltk_resource, quiet=True)

# One seed for NumPy's global RNG; the same value is passed as random_state
# to the scikit-learn splitters and estimators further down.
SEED = 42
np.random.seed(SEED)

## 2) Ensure `src/` modules exist (auto-create if missing)

In [None]:
# When this notebook is run on its own in Colab there is no src/ folder.
# This cell creates it automatically so the notebook stays fully executable.

os.makedirs("src", exist_ok=True)

# Source text for src/preprocessing.py. It is written to disk further down,
# and only when that file is missing or empty.
preprocessing_py = r'''
"""Text normalisation helpers for Spanish free-text feedback."""
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK >= 3.8.2 reads the Punkt models from "punkt_tab"; older releases use
# the pickled "punkt" resource, so both are requested.
for _resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_resource, quiet=True)

STOPWORDS_ES = set(stopwords.words("spanish"))


def preprocess_text(text: str) -> str:
    """Return *text* lower-cased, reduced to Spanish letters and filtered.

    Every character that is not a lowercase Spanish letter or whitespace is
    replaced by a space, whitespace runs are collapsed, and the result is
    tokenised. Spanish stop words and tokens of two characters or fewer are
    dropped. The surviving tokens are joined with single spaces.

    Anything that is not a string (None, NaN from a CSV with missing
    feedback, numbers) is treated as empty text and yields "".
    """
    # pandas represents missing text as float NaN, which is truthy and has no
    # .lower(); a plain `text or ""` guard would let it through and crash.
    if not isinstance(text, str):
        text = ""
    text = text.lower()
    text = re.sub(r"[^a-záéíóúñü\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = word_tokenize(text, language="spanish")
    tokens = [t for t in tokens if t not in STOPWORDS_ES and len(t) > 2]
    return " ".join(tokens)
'''

# Source text for src/targets.py (written to disk further down, only when that
# file is missing or empty). It defines three helpers:
#   - performance_index_from_likert: multiplies the Likert average by 100 and
#     rounds to an int. No clamping is applied, so a 0-100 result presumes the
#     input is already normalised to [0, 1], as the sample data in this
#     notebook is.
#   - multiclass_from_index: "Negativa" below 50, "Neutral" from 50 to 75
#     inclusive, "Positiva" above 75.
#   - binary_from_index: 1 when the index is at least thr_ok (default 60),
#     otherwise 0.
targets_py = r'''
def performance_index_from_likert(likert_avg: float) -> int:
    return int(round(likert_avg * 100))

def multiclass_from_index(idx: int) -> str:
    if idx < 50:
        return "Negativa"
    if idx <= 75:
        return "Neutral"
    return "Positiva"

def binary_from_index(idx: int, thr_ok: int = 60) -> int:
    return int(idx >= thr_ok)
'''

# Source text for src/modeling.py. It is written to disk further down, and
# only when that file is missing or empty.
modeling_py = r'''
"""Model factories and threshold search for the binary risk/OK classifier."""
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score


def make_binary_lr(seed: int = 42):
    """Return an unfitted TF-IDF + class-balanced LogisticRegression pipeline.

    The vectoriser uses unigrams and bigrams capped at 5000 features; *seed*
    is forwarded as the classifier's random_state.
    """
    return Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
        ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", random_state=seed)),
    ])


def make_binary_svm(seed: int = 42):
    """Return an unfitted TF-IDF + class-balanced LinearSVC pipeline.

    Same vectoriser settings as make_binary_lr; *seed* is forwarded as the
    classifier's random_state.
    """
    return Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
        ("clf", LinearSVC(class_weight="balanced", random_state=seed)),
    ])


def best_threshold(y_true, prob_pos, thresholds=None):
    """Return ``(threshold, macro_f1)`` for the cut-off with the best macro F1.

    Args:
        y_true: true binary labels.
        prob_pos: positive-class probabilities; any array-like (list, NumPy
            array, pandas Series) is accepted.
        thresholds: candidate cut-offs; defaults to 13 evenly spaced values
            from 0.2 to 0.8.

    Returns:
        The first candidate reaching the highest macro F1 (ties keep the
        earliest one) and that F1 value. With no candidates the result is
        ``(None, -1.0)``.
    """
    if thresholds is None:
        thresholds = np.linspace(0.2, 0.8, 13)
    # A plain list cannot be compared element-wise with a scalar, so coerce
    # once before the loop.
    prob_pos = np.asarray(prob_pos, dtype=float)
    best_thr, best_f1 = None, -1.0
    for thr in thresholds:
        pred = (prob_pos >= thr).astype(int)
        f1 = f1_score(y_true, pred, average="macro")
        if f1 > best_f1:
            best_thr, best_f1 = float(thr), float(f1)
    return best_thr, best_f1
'''

# Source text for src/analytics.py (written to disk further down, only when
# that file is missing or empty). It defines:
#   - composite_index: weighted sum of the Likert average and the text-model
#     probability (default weights 0.6 and 0.4), multiplied by 100 and rounded
#     to an int. The weights are used as given; nothing checks that they add
#     up to 1.
#   - make_aggregations: groups the frame by year, level and role_evaluator
#     and returns one row per group with the non-null count of text_feedback
#     plus the mean, 25th and 75th percentile of performance_index_composite.
analytics_py = r'''
import numpy as np

def composite_index(likert_avg, prob_ok_text, w_likert: float = 0.6, w_text: float = 0.4) -> int:
    score = (w_likert * likert_avg + w_text * prob_ok_text) * 100
    return int(round(score))

def make_aggregations(df):
    return (df.groupby(["year", "level", "role_evaluator"], as_index=False)
            .agg(
                n=("text_feedback", "count"),
                index_mean=("performance_index_composite", "mean"),
                index_p25=("performance_index_composite", lambda x: np.percentile(x, 25)),
                index_p75=("performance_index_composite", lambda x: np.percentile(x, 75)),
            ))
'''

import importlib

# Target path -> module source text. The empty __init__.py only marks src/ as
# a package.
files = {
    "src/preprocessing.py": preprocessing_py,
    "src/targets.py": targets_py,
    "src/modeling.py": modeling_py,
    "src/analytics.py": analytics_py,
    "src/__init__.py": ""
}

for path, content in files.items():
    # A populated file is never overwritten, so a cloned repository keeps its
    # own src/ modules; only missing or empty files are (re)generated.
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        with open(path, "w", encoding="utf-8") as f:
            f.write(content.strip() + "\n")

# Make the working directory importable. It is added at the front so the local
# src/ package wins over any same-named installed package, and only when it is
# absent so re-running the cell does not stack duplicate entries.
project_root = os.path.abspath(".")
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Modules created after the interpreter started may be missed by the import
# system's cached directory listings; refresh them before importing.
importlib.invalidate_caches()

from src.preprocessing import preprocess_text
from src.targets import performance_index_from_likert, multiclass_from_index, binary_from_index
from src.modeling import make_binary_lr, make_binary_svm, best_threshold
from src.analytics import composite_index, make_aggregations

print("✅ src/ listo e importado correctamente.")

## 3) Load data (sample) — replace with your anonymized CSV

In [None]:
# Replace with: df = pd.read_csv("data/tu_archivo.csv")

# Template sentences for the synthetic sample, one list per sentiment bucket.
positive_texts = [
    "Explica con claridad y responde dudas con paciencia.",
    "Las clases son dinámicas y se nota dominio del tema.",
    "Motiva al curso y brinda material útil.",
    "Evalúa de forma justa y retroalimenta con detalle."
]
neutral_texts = [
    "Algunas clases son buenas, otras podrían mejorar.",
    "A veces explica claro, a veces rápido.",
    "El ritmo es variable, en general cumple."
]
negative_texts = [
    "Las explicaciones son confusas y desorganizadas.",
    "No responde bien a las preguntas y llega tarde.",
    "Las clases son aburridas y poco productivas.",
    "Las evaluaciones no reflejan lo visto en clase."
]


def make_rows(n=900, p_pos=0.65, p_neu=0.20, p_neg=0.15):
    """Build a synthetic feedback DataFrame with *n* rows.

    Each row picks a sentiment bucket (negative, neutral or positive), a
    template sentence from that bucket and a Likert average drawn uniformly
    from the bucket's range (0.15-0.55, 0.45-0.75 and 0.65-0.95 respectively;
    the ranges overlap on purpose, so text and score do not match perfectly).
    Source, year and level are drawn independently of the bucket.

    Randomness comes from NumPy's global RNG, so call ``np.random.seed``
    beforehand for reproducible output.

    Args:
        n: number of rows to generate.
        p_pos: probability of the positive bucket.
        p_neu: probability of the neutral bucket.
        p_neg: probability of the negative bucket.

    Returns:
        DataFrame with columns source, year, level, role_evaluator,
        likert_avg and text_feedback.

    Raises:
        ValueError: if a probability is negative or the three do not add up
            to 1. Without this check p_pos would be ignored silently, because
            the positive bucket simply receives whatever mass is left.
    """
    bucket_probs = (p_pos, p_neu, p_neg)
    if min(bucket_probs) < 0 or not np.isclose(sum(bucket_probs), 1.0):
        raise ValueError(
            "p_pos, p_neu and p_neg must be non-negative and sum to 1; "
            f"got {bucket_probs}"
        )

    # Constant mapping, built once instead of on every iteration.
    role_by_source = {"student": "student", "leadership": "director", "self": "self"}

    rows = []
    for _ in range(n):
        # The order of the random draws is kept stable so that a given seed
        # keeps producing the same sample.
        r = np.random.rand()
        if r < p_neg:
            txt = np.random.choice(negative_texts)
            likert = np.random.uniform(0.15, 0.55)
        elif r < p_neg + p_neu:
            txt = np.random.choice(neutral_texts)
            likert = np.random.uniform(0.45, 0.75)
        else:
            txt = np.random.choice(positive_texts)
            likert = np.random.uniform(0.65, 0.95)

        source = np.random.choice(["student", "leadership", "self"], p=[0.7, 0.2, 0.1])
        role = role_by_source[source]
        year = np.random.choice([2023, 2024, 2025])
        level = np.random.choice([1, 2, 3, 4, 5])
        rows.append([source, year, level, role, float(likert), txt])

    return pd.DataFrame(
        rows,
        columns=["source", "year", "level", "role_evaluator", "likert_avg", "text_feedback"],
    )

df = make_rows()

# (new column, source column, element-wise function), applied in this order:
# the two targets are derived from performance_index, which itself comes from
# the Likert average rather than from the text, so the labels never see the
# model's input features.
derived_columns = [
    ("text_clean", "text_feedback", preprocess_text),
    ("performance_index", "likert_avg", performance_index_from_likert),
    ("target_multiclass", "performance_index", multiclass_from_index),
    ("target_binary", "performance_index", binary_from_index),
]
for new_col, src_col, transform in derived_columns:
    df[new_col] = df[src_col].apply(transform)

df.head()

## 4) Binary classification (Risk vs OK)

In [None]:
# Features are the cleaned texts; labels are the Likert-derived binary target
# (0 = risk, 1 = ok).
X = df["text_clean"].values
y_bin = df["target_binary"].values

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_bin,
    test_size=0.2,
    stratify=y_bin,
    random_state=SEED,
)

binary_lr = make_binary_lr(seed=SEED)
binary_svm = make_binary_svm(seed=SEED)

# Fit both candidates on the same training split, logistic regression first.
pred_lr = binary_lr.fit(X_train, y_train).predict(X_test)
pred_svm = binary_svm.fit(X_train, y_train).predict(X_test)

binary_labels = ["risk(0)", "ok(1)"]
for report_title, predictions in (
    ("=== Logistic Regression (Binary) ===", pred_lr),
    ("=== Linear SVM (Binary) ===", pred_svm),
):
    print(report_title)
    print(classification_report(y_test, predictions, target_names=binary_labels))

# The confusion matrix is drawn for the logistic regression only.
ConfusionMatrixDisplay.from_predictions(y_test, pred_lr, display_labels=["risk", "ok"])
plt.title("Binary - Logistic Regression")
plt.show()

## 5) Cross-validation + Threshold tuning

In [None]:
from sklearn.model_selection import cross_val_predict

# Overall estimate of the default (0.5 cut-off) model: 5-fold stratified CV on
# the full sample. cross_val_score clones the pipeline, so binary_lr itself
# stays fitted on X_train only.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
scores = cross_val_score(binary_lr, X, y_bin, cv=cv, scoring="f1_macro")
print("Macro F1 (5-fold):", round(scores.mean(), 4), "+/-", round(scores.std(), 4))

# Choose the decision threshold on out-of-fold probabilities of the TRAINING
# split. Selecting it on the test set and then scoring that same test set
# would leak test labels into the model choice and inflate the report below.
oof_proba = cross_val_predict(
    binary_lr, X_train, y_train, cv=cv, method="predict_proba"
)[:, 1]
thr, f1 = best_threshold(y_train, oof_proba)
print("Best threshold:", thr, "Macro F1 (train, out-of-fold):", round(f1, 4))

# Untouched hold-out evaluation with the threshold chosen above.
proba = binary_lr.predict_proba(X_test)[:, 1]
pred_thr = (proba >= thr).astype(int)
print(classification_report(y_test, pred_thr, target_names=["risk(0)","ok(1)"]))

## 6) Multiclass classification (Negativa / Neutral / Positiva)

In [None]:
# Three-class target (Negativa / Neutral / Positiva) over the same cleaned
# texts, with its own stratified 80/20 split.
y_multi = df["target_multiclass"].values
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    X,
    y_multi,
    test_size=0.2,
    stratify=y_multi,
    random_state=SEED,
)

# Same recipe as the binary model, with a larger vocabulary cap and more
# solver iterations.
multi_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=8000)
multi_clf = LogisticRegression(max_iter=3000, class_weight="balanced", random_state=SEED)
multi_lr = Pipeline(steps=[("tfidf", multi_tfidf), ("clf", multi_clf)])

pred_m = multi_lr.fit(Xm_train, ym_train).predict(Xm_test)

print(classification_report(ym_test, pred_m))
ConfusionMatrixDisplay.from_predictions(ym_test, pred_m, xticks_rotation=45)
plt.title("Multiclass - Logistic Regression")
plt.show()

## 7) Optional SMOTE (oversampling)

In [None]:
# imbalanced-learn is optional: if it cannot be imported the cell is skipped
# rather than failing, but the reason is kept and reported so that a broken
# install is not mistaken for a missing one.
try:
    from imblearn.over_sampling import SMOTE
except Exception as exc:
    smote_ok = False
    smote_error = f"{type(exc).__name__}: {exc}"
else:
    smote_ok = True

if smote_ok:
    # The vectoriser is fitted on the training split only and SMOTE resamples
    # only the training matrix; the test split is transformed, never resampled.
    vec = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
    Xv = vec.fit_transform(X_train)
    sm = SMOTE(random_state=SEED)
    X_res, y_res = sm.fit_resample(Xv, y_train)
    # No class_weight here: the resampled training set is already balanced.
    clf = LogisticRegression(max_iter=2000, random_state=SEED)
    clf.fit(X_res, y_res)

    Xt = vec.transform(X_test)
    pred = clf.predict(Xt)
    print(classification_report(y_test, pred, target_names=["risk(0)","ok(1)"]))
else:
    print(f"SMOTE no disponible ({smote_error}).")

## 8) Composite Performance Index (0–100) + Dashboard-ready aggregations

In [None]:
# Probability of the "ok" class for every row, from the binary LR pipeline.
prob_ok_text = binary_lr.predict_proba(df["text_clean"])[:, 1]

# Blend each row's Likert average with its text probability, row by row.
df["performance_index_composite"] = list(
    map(composite_index, df["likert_avg"], prob_ok_text)
)

# Grouped summary table for dashboards.
agg = make_aggregations(df)
agg.head(10)

## 9) Interpretability — top weighted tokens (Binary LR)

In [None]:
# Pull the fitted vectoriser and classifier out of the binary pipeline.
tfidf = binary_lr.named_steps["tfidf"]
clf = binary_lr.named_steps["clf"]

feature_names = np.array(tfidf.get_feature_names_out())
coefs = clf.coef_.ravel()

# Sort the coefficients once. The most negative weights push towards class 0
# (risk) and the most positive towards class 1 (ok); the ok list is reversed
# so the strongest token comes first.
ascending = np.argsort(coefs)
top_risk = feature_names[ascending[:15]]
top_ok = feature_names[ascending[-15:]][::-1]

print("Top tokens que empujan a OK (1):\n", top_ok)
print("\nTop tokens que empujan a RISK (0):\n", top_risk)

## 10) Export outputs (optional)

In [None]:
# Columns kept in the row-level export, in output order.
row_level_columns = [
    "source",
    "year",
    "level",
    "role_evaluator",
    "likert_avg",
    "performance_index_composite",
    "target_binary",
    "target_multiclass",
    "text_feedback",
]

# Both files are written to the working directory as UTF-8 without the index.
agg.to_csv("dashboard_aggregations.csv", index=False, encoding="utf-8")
df[row_level_columns].to_csv("row_level_scored_feedback.csv", index=False, encoding="utf-8")

print("✅ Exportados: dashboard_aggregations.csv y row_level_scored_feedback.csv")