In [1]:
import os
import sys
from pathlib import Path

# Project root. Defaults to the original local checkout; set the
# YT_PROJECT_ROOT environment variable to run the notebook from another
# machine or location without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get(
        "YT_PROJECT_ROOT",
        r"C:\Users\luigu\OneDrive\Escritorio\ProyectoML_YouTube",
    )
)

# Make the project's `src` package importable. The membership check keeps the
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# CSV read by the data-loading cell.
DATA_PATH = PROJECT_ROOT / "data" / "youtube_data.csv"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.preprocessing import (
    build_model_frame,
    make_preprocessor_prepub,
    make_preprocessor_unsupervised,
    make_tfidf_only,
    prepare_graph_for_greedy,
    make_target_hit_er
)

In [2]:
# Read the raw CSV, then derive the modelling frame from it.
df_raw = pd.read_csv(DATA_PATH, low_memory=False)
df = build_model_frame(df_raw)

print("Shape raw:", df_raw.shape, "| Shape model-frame:", df.shape)

# Preview the title, category and the numeric title/tag/duration columns.
preview_cols = [
    "title",
    "category",
    "duration_sec",
    "title_len",
    "title_words",
    "tag_count",
]
df[preview_cols].head(5)

Shape raw: (17589, 17) | Shape model-frame: (17589, 25)


Unnamed: 0,title,category,duration_sec,title_len,title_words,tag_count
0,«السودان ينتفض» أمام السفارة بالقاهرة,News & Politics,180.0,37,5,47
1,Pokemon Tower Defense Episodio 2,Gaming,930.0,34,5,5
2,New Hip Hop - Kemo Treats - Pancakes,Comedy,233.0,36,8,37
3,Sister's Keeper Impression,People & Blogs,562.0,26,3,6
4,JBHAMMER777 (AKUMA) VS KtossPol Rank 1 (Vega)...,Gaming,300.0,76,14,55


In [3]:
# Supervised preprocessor.
# make_preprocessor_prepub returns a pair: the preprocessor object and a
# manifest describing the columns it uses (displayed below).
# NOTE(review): use_hashtags=False presumably leaves hashtag text out of the
# features — confirm in src.preprocessing.
pre_sup, manifest = make_preprocessor_prepub(use_hashtags=False)
manifest

{'numericas': ['duration_sec', 'title_len', 'title_words', 'tag_count'],
 'categoricas': ['category',
  'duration_bucket',
  'title_has_question',
  'title_has_exclaim'],
 'texto': ['title']}

In [4]:
# Fit the supervised preprocessor on the full model frame and transform it,
# then report the resulting matrix shape and container type.
X_sup = pre_sup.fit_transform(df)
print("X_sup shape:", X_sup.shape, "| tipo:", type(X_sup))

X_sup shape: (17589, 2731) | tipo: <class 'scipy.sparse._csr.csr_matrix'>


In [5]:
# Build the target with make_target_hit_er.
# NOTE(review): p=0.90 is presumably a percentile cut-off — confirm in
# src.preprocessing.
y = make_target_hit_er(df, p=0.90)

# Class balance expressed as proportions.
class_share = y.value_counts(normalize=True)
class_share.rename("class_ratio")

er
0    0.898459
1    0.101541
Name: class_ratio, dtype: float64

In [6]:
# Preprocessor for clustering (metadata + title TF-IDF).
unsup_options = dict(
    include_numeric=True,
    include_categorical=True,
    include_title_tfidf=True,
    include_hashtags_tfidf=False,
    pca_components=None,
)
pre_unsup = make_preprocessor_unsupervised(**unsup_options)

# Fit on the full model frame and report the resulting matrix.
X_unsup = pre_unsup.fit_transform(df)
print("X_unsup shape:", X_unsup.shape, "| tipo:", type(X_unsup))

X_unsup shape: (17589, 2731) | tipo: <class 'scipy.sparse._csr.csr_matrix'>


In [7]:
# Quick K-Means run to check that the matrix is usable for clustering.
from sklearn.cluster import KMeans

k = 8
km = KMeans(
    n_clusters=k,
    n_init=10,
    random_state=42,
)
labels_km = km.fit_predict(X_unsup)

# Cluster sizes, ordered by cluster id.
cluster_sizes = pd.Series(labels_km).value_counts()
cluster_sizes.sort_index()

[WinError 2] The system cannot find the file specified
  File "c:\Users\luigu\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\luigu\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\luigu\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^

0     182
1    2820
2    3355
3     763
4    4191
5       3
6    1641
7    4634
Name: count, dtype: int64

In [8]:
# K-Means using only the TF-IDF representation of the titles.
vec = make_tfidf_only(column="title", min_df=5, ngram_range=(1,2))

# Missing titles become empty strings so the vectorizer only sees text.
titles = df["title"].fillna("").astype(str)
X_tfidf = vec.fit_transform(titles)

km2 = KMeans(
    n_clusters=8,
    n_init=10,
    random_state=42,
)
labels_km2 = km2.fit_predict(X_tfidf)

# Cluster sizes, ordered by cluster id.
tfidf_cluster_sizes = pd.Series(labels_km2).value_counts()
tfidf_cluster_sizes.sort_index()

0    14144
1      386
2      933
3      827
4      601
5      325
6      273
7      100
Name: count, dtype: int64

In [None]:
# Build the graph edges (cosine kNN) from the TF-IDF of the titles.
graph_options = dict(
    df=df,
    text_col="title",
    min_df=5,
    ngram_range=(1,2),
    k=15,               # neighbours per node
    sim_threshold=0.2,  # minimum similarity required to create an edge
)
edges, idx_map = prepare_graph_for_greedy(**graph_options)

# Edge count plus a small sample of (node_i, node_j, weight) triples.
edge_sample = list(edges)[:5]
len(edges), edge_sample

(201348,
 [(1, 7440, 0.5776657280893153),
  (1, 8329, 0.5165201695660847),
  (1, 14140, 0.5165201695660847),
  (1, 3142, 0.5165201695660847),
  (1, 11583, 0.43045497113940256)])

In [10]:
# Greedy modularity community detection on the title-similarity graph.
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities, modularity

# One node per row of df, so rows without any edge still appear as isolated
# nodes; each (i, j, w) triple becomes an edge whose "weight" attribute is w.
G = nx.Graph()
G.add_nodes_from(range(len(df)))
G.add_weighted_edges_from(edges, weight="weight")

comms = list(greedy_modularity_communities(G, weight="weight"))
Q = modularity(G, comms, weight="weight")

print(f"#Comunidades: {len(comms)} | Modularity Q: {Q:.3f}")
# Quick look at the sizes of the ten largest communities.
sorted(map(len, comms), reverse=True)[:10]

#Comunidades: 3527 | Modularity Q: 0.666


[4023, 2246, 1961, 1630, 705, 606, 511, 436, 188, 181]

In [11]:
# End-to-end check (no metrics yet): preprocessing pipeline + logistic
# regression on a stratified 80/20 split.
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Split the model frame (not a pre-transformed matrix) so the preprocessor is
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, stratify=y, random_state=42)

# Pipeline.fit() fits its steps in place. Passing `pre_sup` directly would
# refit the shared object on X_train and leave it out of sync with the X_sup
# matrix computed earlier from the full frame. An unfitted clone with the same
# parameters keeps `pre_sup` untouched and makes this cell safe to re-run.
clf = Pipeline([("pre", clone(pre_sup)), ("logreg", LogisticRegression(max_iter=200, class_weight="balanced"))])
clf.fit(X_train, y_train)

# Predicted probability of the positive class for the held-out rows.
probs = clf.predict_proba(X_test)[:,1]
print("Probs sample:", probs[:10])

Probs sample: [0.3359413  0.22637526 0.25152632 0.47635937 0.61859327 0.43622538
 0.34007941 0.55724238 0.43941374 0.25594   ]
