## Consolidación y features de User_Tweet

Este notebook:
- Recorre `PhDBotnetsDB/User_Tweet/**` (archivos .txt en formato JSON-lines)
- Extrae campos clave de cada tweet (incluyendo `created_at`) y del usuario
- Deriva features temporales y de contenido relevantes para detección de bots/redes
- Genera un dataset enriquecido a CSV y muestra `head()`

Notas:
- Los archivos son grandes; se incluye modo muestra y modo completo por streaming a CSV.
- Se agrega `carpeta_origen` (subcarpeta de donde proviene el archivo) para etiquetar procedencia.


In [None]:
import os
import json
import gzip
from pathlib import Path
from typing import Iterator, Dict, Any, List, Optional

import pandas as pd
import numpy as np

BASE_DIR = Path(r"C:/Users/felip/OneDrive/Escritorio/BotNetsCode")
USER_TWEET_DIR = BASE_DIR / "PhDBotnetsDB/User_Tweet"
OUTPUT_DIR = USER_TWEET_DIR / "_outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

SAMPLE_ROWS = 20000 
RANDOM_SEED = 42

TWEET_COLS = ["tweet_id", "tweet_id_str", "tweet_text", "tweet_lang", "tweet_source","created_at","is_retweet", "retweet_count", "favorite_count","has_hashtags", "has_urls", "has_mentions",]
USER_COLS = ["user_id", "user_id_str", "screen_name", "user_name", "user_location","user_followers_count", "user_friends_count", "user_statuses_count","user_favourites_count", "user_verified", "user_protected", "user_created_at",]
AUX_COLS = ["carpeta_origen", "archivo_origen"]

ALL_COLS = TWEET_COLS + USER_COLS + AUX_COLS

np.random.seed(RANDOM_SEED)

def iter_json_lines(path: Path) -> Iterator[Dict[str, Any]]:
    """Itera registros JSON por linea, soporta .txt, .jsonl y .gz."""
    open_fn = gzip.open if path.suffix == ".gz" else open
    mode = "rt" if path.suffix == ".gz" else "r"
    with open_fn(path, mode, encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except Exception:
                continue

def safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
    return d.get(key, default)

def parse_tweet_record(rec: Dict[str, Any], carpeta: str, archivo: str) -> Optional[Dict[str, Any]]:
    try:
        user = rec.get("user") or {}
        entities = rec.get("entities") or {}
        hashtags = entities.get("hashtags") or []
        urls = entities.get("urls") or []
        mentions = entities.get("user_mentions") or []

        out = {
            # tweet
            "tweet_id": safe_get(rec, "id"),
            "tweet_id_str": safe_get(rec, "id_str"),
            "tweet_text": safe_get(rec, "text"),
            "tweet_lang": safe_get(rec, "lang"),
            "tweet_source": safe_get(rec, "source"),
            "created_at": safe_get(rec, "created_at"),
            "is_retweet": bool(rec.get("retweeted", False)) or (str(safe_get(rec, "text", "")).startswith("RT ")),
            "retweet_count": safe_get(rec, "retweet_count"),
            "favorite_count": safe_get(rec, "favorite_count"),
            "has_hashtags": len(hashtags) > 0,
            "has_urls": len(urls) > 0,
            "has_mentions": len(mentions) > 0,
            # user
            "user_id": safe_get(user, "id"),
            "user_id_str": safe_get(user, "id_str"),
            "screen_name": safe_get(user, "screen_name"),
            "user_name": safe_get(user, "name"),
            "user_location": safe_get(user, "location"),
            "user_followers_count": safe_get(user, "followers_count"),
            "user_friends_count": safe_get(user, "friends_count"),
            "user_statuses_count": safe_get(user, "statuses_count"),
            "user_favourites_count": safe_get(user, "favourites_count"),
            "user_verified": safe_get(user, "verified"),
            "user_protected": safe_get(user, "protected"),
            "user_created_at": safe_get(user, "created_at"),
            # aux
            "carpeta_origen": carpeta,
            "archivo_origen": archivo,
        }
        return out
    except Exception:
        return None


def find_all_files(base_dir: Path) -> List[Path]:
    exts = {".txt", ".jsonl", ".gz"}
    files = []
    for root, _, filenames in os.walk(base_dir):
        for fn in filenames:
            p = Path(root) / fn
            if p.suffix.lower() in exts:
                files.append(p)
    return files


all_files = find_all_files(USER_TWEET_DIR)
len(all_files), str(USER_TWEET_DIR)


(6588,
 'C:\\Users\\felip\\OneDrive\\Escritorio\\BotNetsCode\\PhDBotnetsDB\\User_Tweet')

In [None]:
from datetime import datetime

def parse_twitter_datetime(dt: Any) -> Optional[pd.Timestamp]:
    if pd.isna(dt) or dt is None:
        return None
    if isinstance(dt, (int, float)):
        try:
            return pd.to_datetime(int(dt), unit="s", utc=True)
        except Exception:
            return None
    if isinstance(dt, (pd.Timestamp, np.datetime64)):
        return pd.to_datetime(dt, utc=True, errors="coerce")
    try:
        return pd.to_datetime(dt, utc=True, errors="coerce")
    except Exception:
        return None


def derive_time_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["created_at_ts"] = df["created_at"].apply(parse_twitter_datetime)
    df["tweet_date"] = df["created_at_ts"].dt.date
    df["tweet_hour"] = df["created_at_ts"].dt.hour
    df["tweet_wday"] = df["created_at_ts"].dt.weekday  # 0=Lunes
    df["tweet_month"] = df["created_at_ts"].dt.month
    df["tweet_year"] = df["created_at_ts"].dt.year
    bins = [0, 6, 12, 18, 24]
    labels = ["noche", "mañana", "tarde", "noche_tarde"]
    df["day_part"] = pd.cut(df["tweet_hour"], bins=bins, labels=labels, include_lowest=True, right=False)
    df["tweet_len"] = df["tweet_text"].fillna("").str.len()
    df["num_upper"] = df["tweet_text"].fillna("").str.count(r"[A-ZÁÉÍÓÚÑ]")
    df["num_exclaim"] = df["tweet_text"].fillna("").str.count(r"!")
    df["num_question"] = df["tweet_text"].fillna("").str.count(r"\?")
    return df


In [None]:
SAMPLE_FILES = 60
ROWS_PER_FILE = 1000

if len(all_files) == 0:
    raise RuntimeError("No se encontraron archivos en User_Tweet")

sample_files = list(np.random.choice(all_files, size=min(SAMPLE_FILES, len(all_files)), replace=False))

rows = []
for p in sample_files:
    carpeta = p.parent.name
    archivo = p.name
    count = 0
    for rec in iter_json_lines(p):
        parsed = parse_tweet_record(rec, carpeta, archivo)
        if parsed is None:
            continue
        rows.append(parsed)
        count += 1
        if count >= ROWS_PER_FILE:
            break

sample_df = pd.DataFrame(rows, columns=ALL_COLS)
sample_df = derive_time_features(sample_df)
print(sample_df.shape)
sample_df.head()

(11857, 37)


Unnamed: 0,tweet_id,tweet_id_str,tweet_text,tweet_lang,tweet_source,created_at,is_retweet,retweet_count,favorite_count,has_hashtags,...,tweet_date,tweet_hour,tweet_wday,tweet_month,tweet_year,day_part,tweet_len,num_upper,num_exclaim,num_question
0,995336558990409731,995336558990409731,Aftermarket Forecasting Analyst: Req ID : 4256...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 16:15:02 +0000 2018,False,0,0,False,...,2018-05-12,16,5,5,2018,tarde,140,22,0,0
1,995224830499667970,995224830499667970,Hybris Tech Lead: Hybris Tech Lead Ref No.:18-...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 08:51:04 +0000 2018,False,0,0,False,...,2018-05-12,8,5,5,2018,mañana,140,20,0,0
2,995209223653879810,995209223653879810,Principal Software Engineer: The Big Willow is...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 07:49:03 +0000 2018,False,0,0,False,...,2018-05-12,7,5,5,2018,mañana,139,11,0,0
3,995185827595485184,995185827595485184,Senior Business Systems Analyst: Title Senior ...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 06:16:05 +0000 2018,False,0,0,False,...,2018-05-12,6,5,5,2018,mañana,140,16,0,0
4,995170019842473984,995170019842473984,Test Automation Engineer (Automation Tester (H...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 05:13:16 +0000 2018,False,0,0,False,...,2018-05-12,5,5,5,2018,noche,139,17,0,0


In [None]:
SAMPLE_CSV = OUTPUT_DIR / "user_tweets_sample_enriched.csv"
sample_df.to_csv(SAMPLE_CSV, index=False)
print("Guardado:", SAMPLE_CSV)
sample_df.head(20)


Guardado: C:\Users\felip\OneDrive\Escritorio\BotNetsCode\PhDBotnetsDB\User_Tweet\_outputs\user_tweets_sample_enriched.csv


Unnamed: 0,tweet_id,tweet_id_str,tweet_text,tweet_lang,tweet_source,created_at,is_retweet,retweet_count,favorite_count,has_hashtags,...,tweet_date,tweet_hour,tweet_wday,tweet_month,tweet_year,day_part,tweet_len,num_upper,num_exclaim,num_question
0,995336558990409731,995336558990409731,Aftermarket Forecasting Analyst: Req ID : 4256...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 16:15:02 +0000 2018,False,0,0,False,...,2018-05-12,16,5,5,2018,tarde,140,22,0,0
1,995224830499667970,995224830499667970,Hybris Tech Lead: Hybris Tech Lead Ref No.:18-...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 08:51:04 +0000 2018,False,0,0,False,...,2018-05-12,8,5,5,2018,mañana,140,20,0,0
2,995209223653879810,995209223653879810,Principal Software Engineer: The Big Willow is...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 07:49:03 +0000 2018,False,0,0,False,...,2018-05-12,7,5,5,2018,mañana,139,11,0,0
3,995185827595485184,995185827595485184,Senior Business Systems Analyst: Title Senior ...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 06:16:05 +0000 2018,False,0,0,False,...,2018-05-12,6,5,5,2018,mañana,140,16,0,0
4,995170019842473984,995170019842473984,Test Automation Engineer (Automation Tester (H...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 05:13:16 +0000 2018,False,0,0,False,...,2018-05-12,5,5,5,2018,noche,139,17,0,0
5,995138506040135682,995138506040135682,Electronic-Systems-Architect: Location Wilton ...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Sat May 12 03:08:03 +0000 2018,False,0,0,False,...,2018-05-12,3,5,5,2018,noche,143,15,0,0
6,995075354359300096,995075354359300096,Node.js Developer: Node.js Developer Ref No.:1...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Fri May 11 22:57:06 +0000 2018,False,0,1,False,...,2018-05-11,22,4,5,2018,noche_tarde,140,19,0,0
7,995051739505016833,995051739505016833,Optical Data Analyst: Job Description Location...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Fri May 11 21:23:16 +0000 2018,False,0,0,False,...,2018-05-11,21,4,5,2018,noche_tarde,140,18,0,0
8,995036133783289856,995036133783289856,Cisco Enterprise Solutions Architect: The Cisc...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Fri May 11 20:21:15 +0000 2018,False,0,0,False,...,2018-05-11,20,4,5,2018,noche_tarde,139,16,0,0
9,995012142347644929,995012142347644929,Civil Engineer/Project Manager: Civil Engineer...,en,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",Fri May 11 18:45:55 +0000 2018,False,0,0,False,...,2018-05-11,18,4,5,2018,noche_tarde,140,18,0,0


In [None]:
FULL_CSV = OUTPUT_DIR / "user_tweets_full_enriched.csv"

if FULL_CSV.exists():
    FULL_CSV.unlink()

HEADER_WRITTEN = False

for p in all_files:
    carpeta = p.parent.name
    archivo = p.name
    chunk_rows = []
    for rec in iter_json_lines(p):
        parsed = parse_tweet_record(rec, carpeta, archivo)
        if parsed is None:
            continue
        chunk_rows.append(parsed)
        if len(chunk_rows) >= 20000:
            df_chunk = pd.DataFrame(chunk_rows, columns=ALL_COLS)
            df_chunk = derive_time_features(df_chunk)
            df_chunk.to_csv(FULL_CSV, mode='a', header=not HEADER_WRITTEN, index=False)
            HEADER_WRITTEN = True
            chunk_rows = []
    if chunk_rows:
        df_chunk = pd.DataFrame(chunk_rows, columns=ALL_COLS)
        df_chunk = derive_time_features(df_chunk)
        df_chunk.to_csv(FULL_CSV, mode='a', header=not HEADER_WRITTEN, index=False)
        HEADER_WRITTEN = True

print("CSV completo escrito en:", FULL_CSV)

CSV completo escrito en: C:\Users\felip\OneDrive\Escritorio\BotNetsCode\PhDBotnetsDB\User_Tweet\_outputs\user_tweets_full_enriched.csv


In [None]:
if FULL_CSV.exists():
    full_head = pd.read_csv(FULL_CSV, nrows=20)
    display(full_head)
else:
    print("Aun no se genero el CSV completo.")

Unnamed: 0,tweet_id,tweet_id_str,tweet_text,tweet_lang,tweet_source,created_at,is_retweet,retweet_count,favorite_count,has_hashtags,...,tweet_date,tweet_hour,tweet_wday,tweet_month,tweet_year,day_part,tweet_len,num_upper,num_exclaim,num_question
0,340849848343470081,340849848343470081,Id:85539984945315840,und,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Sat Jun 01 15:18:31 +0000 2013,False,0,3,False,...,2013-06-01,15,5,6,2013,tarde,20,1,0,0
1,328417577720246272,328417577720246272,Oh my god. I can't stand it when people embarr...,en,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Sun Apr 28 07:57:06 +0000 2013,False,0,0,False,...,2013-04-28,7,6,4,2013,mañana,71,3,0,0
2,324262096617496576,324262096617496576,hiks T.T tak bisa mem follow lg.. huhuhu.. fol...,in,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Tue Apr 16 20:44:42 +0000 2013,False,0,0,False,...,2013-04-16,20,1,4,2013,noche_tarde,61,2,0,0
3,323414192055001089,323414192055001089,Semua bs d usahakan,in,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Sun Apr 14 12:35:26 +0000 2013,False,0,0,False,...,2013-04-14,12,6,4,2013,tarde,19,1,0,0
4,322756431642234880,322756431642234880,On our way to an A-Rod signing,en,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Fri Apr 12 17:01:44 +0000 2013,False,0,0,False,...,2013-04-12,17,4,4,2013,tarde,30,3,0,0
5,320783778542936064,320783778542936064,movies\nChanning Tatum has been offered the le...,en,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Sun Apr 07 06:23:07 +0000 2013,False,0,0,False,...,2013-04-07,6,6,4,2013,mañana,108,11,0,0
6,320145054612213760,320145054612213760,saindo\nsaindo pra me arrumar....bjsssss,pt,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Fri Apr 05 12:05:03 +0000 2013,False,0,0,False,...,2013-04-05,12,4,4,2013,tarde,39,0,0,0
7,317849550872264704,317849550872264704,Bon tu le sais dj xD. &lt;3.,fr,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Sat Mar 30 04:03:32 +0000 2013,False,0,0,False,...,2013-03-30,4,5,3,2013,noche,28,2,0,0
8,317255901327261696,317255901327261696,akhirnya bisa beli trispokes telah lama ku ida...,in,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Thu Mar 28 12:44:35 +0000 2013,False,0,0,False,...,2013-03-28,12,3,3,2013,tarde,58,0,0,0
9,316175021171171328,316175021171171328,Just got Logic Pro 9! Now to get to learn it c...,en,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Mon Mar 25 13:09:33 +0000 2013,False,0,0,False,...,2013-03-25,13,0,3,2013,tarde,77,4,1,0


In [10]:
csvcompleto = pd.read_csv(FULL_CSV)
csvcompleto.shape

  csvcompleto = pd.read_csv(FULL_CSV)


(4206867, 37)

### Variables clave del dataset generado
- created_at, created_at_ts, tweet_date, tweet_hour, tweet_wday, tweet_month, tweet_year, day_part
- señales de engagement: retweet_count, favorite_count, is_retweet
- señales de contenido: tweet_len, num_upper, num_exclaim, num_question, has_hashtags, has_urls, has_mentions
- metadatos de usuario: verified, followers/friends/statuses/favourites counts, protected, user_created_at
- procedencia: carpeta_origen, archivo_origen

Estas variables permiten:
- Perfiles temporales por usuario (distribución horaria, días, estacionalidad)
- Señales de coordinación entre cuentas (similitud temporal/textual)
- Etiquetado por origen para análisis por familia de botnet
