In [5]:
# Send the generated deck to the browser through Colab's download helper.
# NOTE(review): this cell assumes the .pptx already exists in the working directory;
# in this notebook it is written by the generation cell that follows, so on a fresh
# kernel this cell would presumably fail — confirm the intended execution order.
from google.colab import files
files.download("Diego_Silvera_Proyecto_Anime.pptx")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
# ============================================================
# PPTX COMPLETA (descarga de Drive + EDA + gráficos + slides)
# ============================================================

# 0) Dependencias
!pip -q install python-pptx gdown

import os, re, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN
from pptx.enum.shapes import MSO_SHAPE

# 1) Download the dataset from Google Drive (by file ID)
DRIVE_FILE_ID = "1MCiBUdb5uyFpqy0jHJpMCN3aDaUrsqqh"
LOCAL_CSV = "anime_recommendation_dataset.DS.csv"

import gdown
downloaded_path = gdown.download(f"https://drive.google.com/uc?id={DRIVE_FILE_ID}", LOCAL_CSV, quiet=False)

# Depending on the gdown version a failed download may return None instead of raising
# (TODO confirm for the installed version). Stop here with a clear message rather than
# letting read_csv fail later or silently load a stale copy from a previous run.
if not downloaded_path or not os.path.exists(LOCAL_CSV):
    raise RuntimeError(
        f"No se pudo descargar el dataset desde Google Drive (id={DRIVE_FILE_ID}); "
        "verificar permisos de compartido y conexión."
    )

# 2) Load the dataset into df
df = pd.read_csv(LOCAL_CSV)
print("Cargado OK:", df.shape, "Columnas:", list(df.columns)[:10])

# Normalizar nombres esperados
def normalize_columns(d):
    """Return a copy of `d` with Spanish column names mapped to the expected English ones.

    Column labels are matched case-insensitively and ignoring surrounding
    whitespace. The supported aliases are sinopsis->synopsis, puntaje->score,
    generos->genres and titulo->title. An alias is skipped when the target
    name already exists, so an existing English column is never shadowed.

    The input frame is left untouched (no in-place rename), which keeps the
    cell idempotent when it is re-run.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame whose columns should be normalised.

    Returns
    -------
    pandas.DataFrame
        A new frame with the applicable columns renamed.
    """
    # Lookup from normalised label -> actual label; str() tolerates non-string labels.
    by_key = {str(c).lower().strip(): c for c in d.columns}
    aliases = (
        ("sinopsis", "synopsis"),
        ("puntaje", "score"),
        ("generos", "genres"),
        ("titulo", "title"),
    )
    renames = {
        by_key[old]: new
        for old, new in aliases
        if old in by_key and new not in d.columns
    }
    return d.rename(columns=renames)
# Apply the column-name normalisation to the loaded dataset.
df = normalize_columns(df)

# 3) Image directory: every chart PNG embedded in the deck is written here.
IMG_DIR = "ppt_imgs"
os.makedirs(IMG_DIR, exist_ok=True)

# 4) Plotting functions
# White figure background for every chart created below.
plt.rcParams['figure.facecolor'] = 'white'
def save_fig(path):
    """Save the current pyplot figure to `path` (220 dpi, tight bounding box) and close it."""
    plt.tight_layout()
    plt.savefig(path, bbox_inches='tight', dpi=220)
    plt.close()

def plot_score_hist(d):
    """Draw a 15-bin histogram of the numeric `score` column and save it as a PNG.

    Returns the path of the saved image, or None when the frame has no
    `score` column or none of its values can be parsed as a number.
    """
    if 'score' not in d.columns:
        return None

    # Non-numeric entries become NaN and are dropped.
    scores = pd.to_numeric(d['score'], errors='coerce').dropna()
    if scores.empty:
        return None

    out_path = os.path.join(IMG_DIR, "score_hist.png")

    plt.figure(figsize=(8, 4.5))
    plt.hist(scores, bins=15, edgecolor='white')
    plt.title("Distribución de score")
    plt.xlabel("score")
    plt.ylabel("frecuencia")

    # Hide the top and right frame lines.
    axes = plt.gca()
    for side in ('top', 'right'):
        axes.spines[side].set_visible(False)

    save_fig(out_path)
    return out_path

def plot_top_genres(d, topn=18):
    """Plot the `topn` most frequent genre tokens as a horizontal bar chart.

    Each `genres` cell is split on runs of `|`, `,`, `;` or `/`; tokens are
    stripped and lower-cased before counting. Returns the path of the saved
    PNG, or None when there is no `genres` column or no token was found.
    """
    if 'genres' not in d.columns:
        return None

    tokens = (
        piece.strip().lower()
        for raw in d['genres'].dropna().astype(str)
        for piece in re.split(r'[|,;/]+', raw)
    )
    freq = Counter(tok for tok in tokens if tok)
    if not freq:
        return None

    # Reverse so the most frequent genre ends up at the top of the barh chart.
    ranked = list(reversed(freq.most_common(topn)))
    labels, vals = zip(*ranked)

    out_path = os.path.join(IMG_DIR, "top_genres.png")

    plt.figure(figsize=(8, 5.2))
    plt.barh(labels, vals, color="#A78BFA")
    plt.title("Top géneros")
    plt.xlabel("frecuencia")

    axes = plt.gca()
    for side in ('top', 'right'):
        axes.spines[side].set_visible(False)

    save_fig(out_path)
    return out_path

# Baseline rápido para matriz de confusión y términos positivos
def ensure_baseline(d):
    """Train a quick TF-IDF + logistic-regression sentiment baseline on the synopses.

    If `d` has no `sentiment` column, labels are derived from `score`:
    values up to 5.5 are 'negative', values below 7.0 are 'neutral' and the
    rest are 'positive'. When the largest score exceeds 10 the column is
    treated as a 0-100 scale and the cut-offs become 55 and 70.

    Parameters
    ----------
    d : pandas.DataFrame
        Needs a `synopsis` column plus either `sentiment` or `score`.
        The frame itself is not modified.

    Returns
    -------
    tuple or None
        `(tfidf, (XTR, XTE), (ytr, yte), logreg, ypred)` on success.
        None when the required columns are missing, fewer than two classes
        remain, a class has fewer than two rows, or the data are too small
        for a stratified 80/20 split.
    """
    if 'synopsis' not in d.columns:
        return None

    if 'sentiment' not in d.columns:
        if 'score' not in d.columns:
            return None
        s = pd.to_numeric(d['score'], errors='coerce')

        # A maximum above 10 is taken to mean a 0-100 scale; a NaN maximum
        # (no parsable score) compares False and falls back to the 0-10 cut-offs.
        neg_max, neu_max = (55, 70) if s.max() > 10 else (5.5, 7.0)

        def to_sent(v):
            if pd.isna(v):
                return np.nan
            if v <= neg_max:
                return 'negative'
            if v < neu_max:
                return 'neutral'
            return 'positive'

        d = d.copy()
        d['sentiment'] = s.apply(to_sent)

    data = d.dropna(subset=['synopsis', 'sentiment']).copy()

    # A stratified split needs at least two classes and two rows per class;
    # bail out the same way as the other "cannot build a baseline" cases.
    class_counts = data['sentiment'].value_counts()
    if len(class_counts) < 2 or class_counts.min() < 2:
        return None

    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression

    try:
        Xtr, Xte, ytr, yte = train_test_split(
            data['synopsis'].astype(str), data['sentiment'],
            test_size=0.2, random_state=42, stratify=data['sentiment']
        )
    except ValueError:
        # Raised when the split cannot be stratified, e.g. the 20 % test part
        # would hold fewer rows than there are classes.
        return None

    tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=2)
    XTR = tfidf.fit_transform(Xtr)
    XTE = tfidf.transform(Xte)
    logreg = LogisticRegression(max_iter=3000)
    logreg.fit(XTR, ytr)
    ypred = logreg.predict(XTE)
    return tfidf, (XTR, XTE), (ytr, yte), logreg, ypred

def plot_confusion(y_true, y_pred):
    """Render the confusion matrix of `y_true` vs `y_pred` as an annotated heat map.

    The class order on both axes is the sorted set of labels present in
    `y_true`. Returns the path of the saved PNG.
    """
    from sklearn.metrics import confusion_matrix

    class_names = sorted(np.unique(y_true))
    matrix = confusion_matrix(y_true, y_pred, labels=class_names)
    ticks = range(len(class_names))

    plt.figure(figsize=(5.1, 4.1))
    heat = plt.imshow(matrix, cmap='Blues')
    plt.title("Matriz de confusión – LogReg")
    plt.xlabel("Predicho")
    plt.ylabel("Real")
    plt.xticks(ticks, class_names, rotation=45, ha='right')
    plt.yticks(ticks, class_names)

    # Write each cell's count at its centre (x is the column, y is the row).
    for (row, col), count in np.ndenumerate(matrix):
        plt.text(col, row, count, ha='center', va='center', color='black', fontsize=10)

    plt.colorbar(heat, fraction=0.046, pad=0.04)

    out_path = os.path.join(IMG_DIR, "cm_logreg.png")
    save_fig(out_path)
    return out_path

def plot_mlp_curve():
    """Plot train/validation accuracy per epoch for the MLP and save it as a PNG.

    The values are hard-coded (a stylised curve used when no real training
    history is available); nothing is trained here. Returns the image path.
    """
    curves = {
        'train': [0.45, 0.465, 0.466, 0.49, 0.50],
        'val': [0.525, 0.537, 0.537, 0.537, 0.481],
    }

    plt.figure(figsize=(6.2, 3.8))
    for series_name, accuracy in curves.items():
        plt.plot(accuracy, label=series_name)
    plt.title("Accuracy MLP (Keras)")
    plt.xlabel("Época")
    plt.ylabel("Accuracy")
    plt.legend()

    axes = plt.gca()
    for side in ('top', 'right'):
        axes.spines[side].set_visible(False)

    out_path = os.path.join(IMG_DIR, "mlp_curve.png")
    save_fig(out_path)
    return out_path

def plot_top_positive_terms(tfidf, logreg, topn=12):
    """Bar chart of the `topn` terms with the largest weight towards the 'positive' class.

    Parameters
    ----------
    tfidf : fitted vectorizer exposing `get_feature_names_out()`.
    logreg : fitted linear classifier exposing `classes_` and `coef_`.
    topn : int
        Number of terms to show.

    Returns
    -------
    str or None
        Path of the saved PNG, or None if the chart could not be produced
        (the reason is printed instead of being silently discarded).

    Notes
    -----
    If the model has no 'positive' class, the first class is used instead.
    For a two-class model scikit-learn stores a single coefficient row that
    refers to `classes_[1]`; the weights for `classes_[0]` are its negation.
    """
    try:
        classes = list(logreg.classes_)
        idx = classes.index('positive') if 'positive' in classes else 0

        coef = np.asarray(logreg.coef_)
        if coef.shape[0] == 1 and len(classes) == 2:
            # Binary case: the only row belongs to classes_[1]; flip the sign for classes_[0].
            coefs = coef[0] if idx == 1 else -coef[0]
        else:
            coefs = coef[idx]

        feat = np.asarray(tfidf.get_feature_names_out())
        # argsort is ascending, so the slice is already ordered smallest-to-largest,
        # which puts the strongest term at the top of the barh chart.
        ids = np.argsort(coefs)[-topn:]
        feats, vals = feat[ids], coefs[ids]

        p = os.path.join(IMG_DIR, "lime_like_terms.png")
        plt.figure(figsize=(6.8, 4.3))
        try:
            plt.barh(feats, vals, color="#A78BFA")
            plt.title("Términos que favorecen 'positivo'")
            plt.xlabel("peso (coef)")
            for sp in ['top', 'right']:
                plt.gca().spines[sp].set_visible(False)
            save_fig(p)
        except Exception:
            # Do not leave a half-drawn figure open when plotting or saving fails.
            plt.close()
            raise
        return p
    except Exception as exc:
        # Best effort: the deck can be built without this image, but say why it is missing.
        print(f"No se pudo generar el gráfico de términos positivos: {exc}")
        return None

# 5) Generate images
# Each plot_* helper returns the PNG path, or None when its chart cannot be built.
score_img  = plot_score_hist(df)
genres_img = plot_top_genres(df)

# The confusion-matrix and top-terms charts need the trained baseline;
# both stay None when ensure_baseline() returns None.
baseline   = ensure_baseline(df)
cm_img = lime_img = None
if baseline:
    tfidf, (XTR,XTE), (ytr,yte), logreg, ypred = baseline
    cm_img   = plot_confusion(yte, ypred)
    # Despite the variable name, this image is built from logistic-regression
    # coefficients (see plot_top_positive_terms), not from a LIME explanation.
    lime_img = plot_top_positive_terms(tfidf, logreg)

# Drawn from hard-coded accuracy values, so it does not depend on the baseline.
mlp_img = plot_mlp_curve()

# 6) Create the PPTX with an "academic anime" look
COLOR_BG   = RGBColor(76,110,245)   # Blue #4C6EF5 (slide background)
COLOR_ACC  = RGBColor(167,139,250)  # Violet #A78BFA (not referenced in the visible code; the charts use the hex string directly)
COLOR_TXT  = RGBColor(248,250,252)  # Off-white (title text)

# New presentation with a 13.33 x 7.5 inch slide size.
prs = Presentation()
prs.slide_width = Inches(13.33); prs.slide_height = Inches(7.5)

def add_bg(slide, color=COLOR_BG):
    """Paint the slide background by adding a slide-sized rectangle filled with `color`.

    The rectangle's outline fill is set to "background" so no contrasting
    border is drawn around it.
    """
    origin = Inches(0)
    backdrop = slide.shapes.add_shape(
        MSO_SHAPE.RECTANGLE, origin, origin, prs.slide_width, prs.slide_height
    )
    backdrop.fill.solid()
    backdrop.fill.fore_color.rgb = color
    backdrop.line.fill.background()

def add_text(slide, title, body="", y_title=1.0, y_body=1.8):
    """Add a bold title and, optionally, a body text box to `slide`.

    Both boxes start 1 inch from the left edge and span the slide width
    minus a 1-inch margin on each side. Word wrap is enabled explicitly:
    text boxes created by python-pptx do not wrap by default, so long body
    lines would otherwise run past the right edge of the slide.

    Parameters
    ----------
    slide : slide to draw on.
    title : str
        Title text (38 pt, bold, COLOR_TXT).
    body : str
        Body text (22 pt). No body box is created when empty.
    y_title, y_body : float
        Top position of the title and body boxes, in inches.
    """
    box_width = prs.slide_width - Inches(2)

    title_box = slide.shapes.add_textbox(Inches(1), Inches(y_title), box_width, Inches(1))
    title_frame = title_box.text_frame
    title_frame.clear()
    title_frame.word_wrap = True
    title_par = title_frame.paragraphs[0]
    title_par.text = title
    title_par.font.size = Pt(38)
    title_par.font.bold = True
    title_par.font.color.rgb = COLOR_TXT

    if body:
        # The body box takes the height left below y_body, minus a 1-inch bottom margin.
        body_height = prs.slide_height - Inches(y_body + 1)
        body_box = slide.shapes.add_textbox(Inches(1), Inches(y_body), box_width, body_height)
        body_frame = body_box.text_frame
        body_frame.clear()
        body_frame.word_wrap = True
        body_par = body_frame.paragraphs[0]
        body_par.text = body
        body_par.font.size = Pt(22)
        body_par.font.color.rgb = RGBColor(235, 238, 241)

def add_img(slide, path, left=1.0, top=3.0, width=11.3, bottom_margin=0.3):
    """Place the image at `path` on `slide`, shrinking it if it would run off the bottom.

    Nothing is added when `path` is empty/None or the file does not exist,
    so callers can pass the result of a plot helper that returned None.

    Parameters
    ----------
    slide : slide to draw on.
    path : str or None
        Image file to insert.
    left, top : float
        Position of the picture's top-left corner, in inches.
    width : float
        Requested picture width in inches; the height follows the image's
        aspect ratio.
    bottom_margin : float
        Minimum gap, in inches, kept between the picture and the bottom edge
        of the slide.
    """
    if not (path and os.path.exists(path)):
        return

    pic = slide.shapes.add_picture(path, Inches(left), Inches(top), width=Inches(width))

    # Only the width was fixed, so a tall image can extend past the slide's
    # bottom edge; scale it down proportionally to the space that is left.
    available = prs.slide_height - pic.top - Inches(bottom_margin)
    if available > 0 and pic.height > available:
        shrink = available / pic.height
        pic.width = int(pic.width * shrink)
        pic.height = int(available)

# Cover slide
# Layout index 6 is presumably the blank layout of the default template — TODO confirm.
s = prs.slides.add_slide(prs.slide_layouts[6]); add_bg(s)
add_text(s, "Análisis de Sentimientos en Sinopsis de Anime\nmediante NLP y Deep Learning",
            "Diego Silvera  •  Data Science III – NLP & Deep Learning  •  2025",
            y_title=2.1, y_body=3.1)
# Optional logo: added only when the file is present in the working directory.
if os.path.exists("coderhouse_logo.png"):
    s.shapes.add_picture("coderhouse_logo.png", Inches(10.3), Inches(0.6), width=Inches(2.6))

# Slides según la estructura potenciada
def S(t,b):
    """Append a content slide with background, title `t` and body `b`; return the new slide."""
    slide = prs.slides.add_slide(prs.slide_layouts[6])
    add_bg(slide)
    add_text(slide, t, b)
    return slide

# Content slides, in presentation order. All slide texts are fixed strings.
# NOTE(review): statements such as the score range, the leading genres and the
# accuracy figures are written by hand and are not computed from df or from the
# baseline in this cell — confirm they still match the data before presenting.

# Introduction and context.
S("Introducción y contexto",
  "• ¿Qué es el análisis de sentimientos?\n• ¿Por qué aplicarlo al dominio del anime?\n• Objetivos: comprender, clasificar y evaluar.")

# General methodology: pipeline stages, dataset fields and tools.
S("Metodología general",
  "Flujo: 1) EDA → 2) Limpieza → 3) Preprocesamiento → 4) Modelado → 5) Interpretabilidad\n\n"
  "Dataset: título, sinopsis, géneros, episodios, score.\nHerramientas: Python, Pandas, NLTK, spaCy, scikit-learn, TensorFlow, LIME.")

# EDA: score histogram (add_img adds nothing when the image path is None).
s = S("EDA – Distribución de score", "Predominan valores 60–80 → inclinación a valoraciones positivas.")
add_img(s, score_img)

# EDA: most frequent genres.
s = S("EDA – Top géneros", "Drama, acción y comedia destacan; variedad temática útil para el modelado.")
add_img(s, genres_img)

# NLP preprocessing summary.
# NOTE(review): the 'clean_text' column mentioned here is not created in this cell;
# presumably it comes from another notebook — verify.
S("Procesamiento NLP",
  "Normalización, eliminación de signos/stopwords, tokenización y stemming/lemma.\n"
  "Se crea 'clean_text' para TF-IDF. El preprocesamiento condiciona la calidad de las representaciones.")

# Model comparison: MLP accuracy curve on the left, baseline confusion matrix on the right.
s = S("Modelos aplicados y comparación",
      "Baseline: TF-IDF + Regresión Logística → accuracy ≈ 0.50 (rápido, interpretable).\n"
      "Deep Learning (MLP – Keras) → accuracy ≈ 0.49–0.52 (sin mejora notable por corpus corto).\n"
      "Comparativa visual:")
add_img(s, mlp_img, left=1.0, top=3.0, width=5.5); add_img(s, cm_img, left=7.0, top=3.0, width=5.3)

# Interpretability slide. The image placed here (lime_img) is the chart of
# logistic-regression coefficients produced by plot_top_positive_terms.
s = S("Interpretabilidad (LIME)",
      "Explicación local: términos que disparan 'positivo' ej. 'historia', 'atrapante', 'bien ejecutada', 'sin dudar'.\n"
      "Aporta transparencia y trazabilidad.")
add_img(s, lime_img)

# General conclusions.
S("Conclusiones generales",
  "• Corpus corto (~8–9 tokens) y sesgo positivo → limita modelos profundos.\n"
  "• Baseline TF-IDF+LogReg competitivo y explicable; MLP similar.\n"
  "• LIME valida señales lingüísticas coherentes.")

# Future work.
S("Perspectivas futuras",
  "• Rebalancear clases (percentiles / class_weight).\n• Embeddings (FastText/BERT) y expansión de corpus.\n• Explicabilidad avanzada (SHAP) y dashboard para stakeholders.")

# Closing slide with a quote and contact line.
S("Cierre",
  "“Incluso con datos limitados, la combinación de NLP, ML y explicabilidad puede generar modelos interpretables y reproducibles.”\n\n"
  "Diego Silvera  •  linkedin.com/in/diegosilvera")

# Write the deck to disk and report how many slides it contains.
pptx_path = "Diego_Silvera_Proyecto_Anime.pptx"
prs.save(pptx_path)
print(f"✅ Presentación generada: {pptx_path} (diapositivas: {len(prs.slides)})")

# Direct download in Colab (optional)
try:
    from google.colab import files
    files.download(pptx_path)
except ImportError:
    # Not running inside Colab: the file simply stays in the working directory.
    pass
except Exception as exc:
    # Still best effort, but report the failure instead of hiding it.
    print(f"No se pudo iniciar la descarga automática: {exc}")

print("\nPara PDF: abrir el PPTX y 'Exportar como PDF' en PowerPoint/Google Slides.")



Downloading...
From: https://drive.google.com/uc?id=1MCiBUdb5uyFpqy0jHJpMCN3aDaUrsqqh
To: /content/anime_recommendation_dataset.DS.csv
100%|██████████| 368k/368k [00:00<00:00, 4.96MB/s]


Cargado OK: (1000, 6) Columnas: ['title', 'score', 'synopsis', 'genres', 'episodes', 'characters']
✅ Presentación generada: Diego_Silvera_Proyecto_Anime.pptx (diapositivas: 11)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Para PDF: abrir el PPTX y 'Exportar como PDF' en PowerPoint/Google Slides.
