In [2]:
import os, json, re, math, string, unicodedata, collections, itertools, warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter, defaultdict
from unidecode import unidecode

import nltk
from nltk.corpus import stopwords

import networkx as nx
import community as community_louvain  # paquete python-louvain

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from wordcloud import WordCloud

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Global matplotlib defaults shared by every figure in the notebook
plt.rcParams.update({
    "figure.figsize": (8, 4),
    "axes.grid": True,
})

# Reproducibility: a single seed for NumPy's legacy global RNG
SEED = 42
np.random.seed(SEED)

# Location of the raw JSONL dump; adjust if your copy lives elsewhere
DATA_PATH = "data/traficogt.txt"

In [None]:
# Column order of the returned frame. Passing it to the DataFrame constructor
# keeps the schema stable even when no record could be parsed.
_TWEET_COLUMNS = (
    "id", "date", "lang", "rawContent",
    "replyCount", "retweetCount", "likeCount", "quoteCount", "viewCount",
    "sourceLabel",
    "user_id", "username", "displayname", "user_followers", "user_friends",
    "mentions", "hashtags",
    "is_retweet", "is_quote", "is_reply",
)


def _lower_text(value):
    """Return ``value`` lower-cased when it is a string, otherwise ``""``."""
    return value.lower() if isinstance(value, str) else ""


def _hashtag_text(tag):
    """Return the lower-cased text of one hashtag entry, or ``""`` if unusable."""
    if isinstance(tag, dict):
        # NOTE(review): dict-shaped hashtags ("text" / "tag" keys) are an
        # assumption about alternative dumps -- confirm against the real data.
        tag = tag.get("text") or tag.get("tag")
    return _lower_text(tag)


def _tweet_to_row(obj):
    """Flatten one decoded tweet object (a dict) into a flat row dict."""
    raw = obj.get("rawContent") or ""
    if not isinstance(raw, str):
        # The reply regex below needs text; coerce unexpected scalar payloads.
        raw = str(raw)

    user = obj.get("user")
    if not isinstance(user, dict):
        user = {}

    mentioned = obj.get("mentionedUsers")
    if not isinstance(mentioned, (list, tuple)):
        mentioned = []
    mention_names = (
        _lower_text(mu.get("username")) for mu in mentioned if isinstance(mu, dict)
    )

    hashtags = obj.get("hashtags")
    if not isinstance(hashtags, (list, tuple)):
        hashtags = []

    # Reply detection: prefer explicit metadata, otherwise fall back to the
    # heuristic "text starts with an @mention".
    has_reply_metadata = bool(obj.get("inReplyToTweetId") or obj.get("inReplyToUser"))
    starts_with_mention = bool(re.match(r"^\s*@\w+", raw))

    return {
        # Main tweet fields
        "id": obj.get("id"),
        "date": obj.get("date"),
        "lang": obj.get("lang"),
        "rawContent": raw,
        "replyCount": obj.get("replyCount"),
        "retweetCount": obj.get("retweetCount"),
        "likeCount": obj.get("likeCount"),
        "quoteCount": obj.get("quoteCount"),
        "viewCount": obj.get("viewCount"),
        "sourceLabel": (obj.get("sourceLabel") or obj.get("source")),
        # Author
        "user_id": user.get("id"),
        "username": _lower_text(user.get("username")),
        "displayname": user.get("displayname") or "",
        "user_followers": user.get("followersCount"),
        "user_friends": user.get("friendsCount"),
        # Lower-cased usernames / hashtag texts; unusable entries are dropped
        "mentions": [name for name in mention_names if name],
        "hashtags": [text for text in map(_hashtag_text, hashtags) if text],
        # Retweet / quote / reply flags (heuristics)
        "is_retweet": obj.get("retweetedTweet") is not None,
        "is_quote": obj.get("quotedTweet") is not None,
        "is_reply": has_reply_metadata or starts_with_mention,
    }


def read_jsonl_in_chunks(path, max_rows=None, chunk_size=10000):
    """
    Read a JSONL tweet dump line by line and return a flat DataFrame.

    Parameters
    ----------
    path : str or path-like
        File with one JSON object per line. It is decoded as UTF-8 and a
        leading byte-order mark, if present, is ignored.
    max_rows : int or None
        Maximum number of *parsed* rows to return (handy while developing,
        e.g. 50_000). Blank, malformed, or non-object lines do not count
        towards the limit. ``None`` reads the whole file.
    chunk_size : int
        Accepted for backward compatibility; currently unused because the
        file is streamed one line at a time.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns listed in ``_TWEET_COLUMNS``.
        ``date`` is parsed to timezone-aware UTC datetimes (unparseable
        values become ``NaT``). The columns are present even when no row
        could be read.

    Notes
    -----
    Lines that are not valid JSON, or that decode to something other than an
    object, are skipped on purpose (best-effort loading).
    """
    rows = []
    # "utf-8-sig" behaves like "utf-8" but strips a BOM, which would otherwise
    # make the first record fail to decode and be silently dropped.
    with open(path, "r", encoding="utf-8-sig") as f:
        for line in f:
            # Limit on collected rows, not on physical lines read.
            if max_rows is not None and len(rows) >= max_rows:
                break
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(obj, dict):
                # Valid JSON but not a tweet object (e.g. a list or number).
                continue
            rows.append(_tweet_to_row(obj))

    df = pd.DataFrame(rows, columns=list(_TWEET_COLUMNS))
    df["date"] = pd.to_datetime(df["date"], errors="coerce", utc=True)
    return df

# Development tip: pass a small max_rows to iterate quickly, then set it back
# to None for the final full run.
df = read_jsonl_in_chunks(path=DATA_PATH, max_rows=None)
df.info()
df.head(3)
