In [8]:
# Environment setup: upgrade pip, then install pinned TensorFlow / tf-keras together with
# the data-science packages used by the later cells.
!pip -q install -U pip
!pip -q install "tensorflow==2.19.0" "tf-keras==2.19.0" pandas scikit-learn matplotlib
# KerasNLP and KerasHub are pinned to the same release number.
# NOTE(review): the recorded output of this cell reports 0.21.1 for both packages rather
# than the pinned 0.17.0 — confirm the pin actually took effect (a kernel restart may be needed).
!pip -q install "keras-nlp==0.17.0" "keras-hub==0.17.0"

# Import the libraries and print the versions that were actually loaded into the kernel.
import tensorflow as tf, keras, keras_nlp, keras_hub
print("TF:", tf.__version__)
print("Keras:", keras.__version__)
print("KerasNLP:", keras_nlp.__version__)
print("KerasHub:", keras_hub.__version__)


TF: 2.19.0
Keras: 3.10.0
KerasNLP: 0.21.1
KerasHub: 0.21.1


In [9]:

# Install/upgrade the two Kaggle clients that the download fallbacks in this cell rely on.
!pip -q install -U kaggle kagglehub

import os, glob, zipfile, shutil, json, sys, subprocess
from pathlib import Path

# Local folder that holds the dataset CSV(s); created when missing, left alone when present.
data_dir = Path("data"); data_dir.mkdir(exist_ok=True)

def list_csv():
    """Return every ``*.csv`` found recursively under ``data_dir``, smallest file first."""
    def _size(path):
        # A path that no longer exists is ranked as zero bytes instead of raising.
        return path.stat().st_size if path.exists() else 0

    found = list(data_dir.glob("**/*.csv"))
    found.sort(key=_size)
    return found

def have_csv():
    """Tell whether ``list_csv()`` finds any CSV; when it does, print the file names.

    Returns:
        bool: True if at least one CSV was found, False otherwise.
    """
    found = list_csv()
    if not found:
        return False
    print("CSV(s) em data/:", [p.name for p in found])
    return True


# Dataset acquisition. Sources are tried in order until a CSV exists under data/:
#   1) a CSV already on disk, 2) KaggleHub, 3) the Kaggle CLI (needs kaggle.json),
#   4) a manual CSV/ZIP upload through Colab.
# `found` caches the result of have_csv() so the file list is not printed repeatedly.
found = have_csv()
if found:
    print("Pule para a Célula 3 (treino).")
else:

    # --- 2) KaggleHub ---
    try:
        import kagglehub
        print("Tentando KaggleHub...")
        path = kagglehub.dataset_download("goyaladi/twitter-bot-detection-dataset")
        for p in Path(path).glob("*.csv"):
            shutil.copy(p, data_dir/p.name)
        found = have_csv()
        if found:
            print("Sucesso com KaggleHub. Pule para a Célula 3 (treino).")
    except Exception as e:
        # Deliberate best effort: report the failure and fall through to the next source.
        print("KaggleHub falhou:", e)

    # --- 3) Kaggle CLI ---
    if not found:
        # Only the command-line tool is used below, so look for the executable instead of
        # importing the `kaggle` package.
        # NOTE(review): some kaggle releases authenticate at import time and fail when
        # kaggle.json is absent, which would abort before the upload prompt — confirm.
        if shutil.which("kaggle") is None:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "kaggle"])

        # Credentials file in the user's home directory (/root/.kaggle on Colab).
        kaggle_json = Path.home() / ".kaggle" / "kaggle.json"
        if not kaggle_json.exists():
            from google.colab import files
            print("📤 Envie o seu kaggle.json (Kaggle > Account > Create New API Token)")
            up = files.upload()
            assert "kaggle.json" in up, "Faltou o kaggle.json"
            kaggle_json.parent.mkdir(parents=True, exist_ok=True)
            kaggle_json.write_bytes(up["kaggle.json"])
            # Restrict the token to the owner only.
            os.chmod(kaggle_json, 0o600)

        print("➜ Tentando Kaggle CLI...")
        # Argument list (no shell). The long option is spelled `--unzip`.
        cmd = ["kaggle", "datasets", "download",
               "-d", "goyaladi/twitter-bot-detection-dataset",
               "-p", str(data_dir), "--unzip"]
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True)
            code = proc.returncode
            # Surface the CLI output in the notebook instead of losing it to the terminal.
            cli_output = (proc.stdout + proc.stderr).strip()
            if cli_output:
                print(cli_output)
        except FileNotFoundError:
            # Executable still missing: treat as a failed attempt so the upload fallback runs.
            code = 127
        if code != 0:
            print(f"⚠️ Kaggle CLI retornou código {code}. Provável termos não aceitos no site.")
        found = have_csv()

    # --- 4) Manual upload (Colab) ---
    if not found:
        from google.colab import files
        print("📤 Envie um CSV (ex.: bot_detection_data.csv) ou um ZIP contendo CSV(s).")
        up = files.upload()
        for name, content in up.items():
            fn = Path(name)
            # Keep only the base name so uploads always land directly inside data/.
            out = data_dir / fn.name
            with open(out, "wb") as f:
                f.write(content)
            if fn.suffix.lower() == ".zip":
                try:
                    with zipfile.ZipFile(out, "r") as z:
                        z.extractall(data_dir)
                except zipfile.BadZipFile:
                    print(f"ZIP inválido: {fn.name}")
        assert have_csv(), "Não foi possível localizar CSV em data/ após o upload."



CSV(s) em data/: ['bot_detection_data.csv']
Pule para a Célula 3 (treino).


In [10]:

import os, glob, re, json, numpy as np, pandas as pd, tensorflow as tf, keras, keras_nlp
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt


# --- Run configuration (small values: the training below is deliberately light and capped) ---
# Sequence length handed to the BERT preprocessor and used as the backbone's max length.
SEQ_LEN = 32
# Batch size of both tf.data pipelines.
BATCH   = 2
# Keep at most this many examples (the first ones); a falsy value disables the cap.
MAX_N   = 3000
# Number of training batches taken from the training pipeline.
CAP_TRAIN_STEPS = 40
# Number of validation batches taken for validation and for the reported metrics.
CAP_VAL_STEPS   = 10
# Number of epochs passed to model.fit.
EPOCHS          = 2


# Locate the dataset. The search is recursive so that CSVs sitting in a sub-folder of data/
# (accepted by the recursive check in the download cell) are found here as well.
csvs = sorted(glob.glob("data/**/*.csv", recursive=True), key=os.path.getsize)
assert csvs, "Nenhum CSV em data/. Rode a célula A2 para baixar/subir o dataset."
# Sorted by size, so the last entry is the largest file.
csv_path = csvs[-1]
print("CSV:", os.path.basename(csv_path))

df = pd.read_csv(csv_path)

# Column detection: first known name present in the frame wins; otherwise fall back to the
# first column for text and the last column for the label. The choice is printed so a wrong
# fallback is visible to the reader.
text_candidates  = ["Tweet","tweet","text","Text","content","description","bio"]
label_candidates = ["Bot Label","bot","label","target","bot_label","is_bot"]
text_col  = next((c for c in text_candidates  if c in df.columns), df.columns[0])
label_col = next((c for c in label_candidates if c in df.columns), df.columns[-1])
print("cols -> text:", text_col, "| label:", label_col)

def clean(s:str)->str:
    """Normalise a tweet for the model.

    The input is coerced to ``str``; URLs become a space, every ``@mention`` becomes
    ``@user``, the ``#`` of a hashtag is dropped (the word is kept), and whitespace runs
    are collapsed to single spaces with the ends trimmed.
    """
    text = str(s)
    substitutions = (
        (r"http\S+|www\.\S+", " "),   # links
        (r"@\w+", "@user"),           # mentions
        (r"#(\w+)", r"\1"),           # hashtags -> bare word
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return " ".join(text.split())
def to01(x)->int:
    """Map a raw label value to 1 (bot) or 0 (human).

    Args:
        x: Any label value; it is compared through its stripped, lower-cased ``str`` form.

    Returns:
        1 when the value is numerically equal to 1 (``1``, ``"1"``, ``1.0``, ``"1.0"``),
        is one of ``true`` / ``yes`` / ``y``, or contains the substring ``bot``;
        0 for everything else.
    """
    s = str(x).strip().lower()
    try:
        # Numeric labels: str(1.0) is "1.0", which a plain string comparison against "1"
        # would miss and silently turn every float-typed positive label into 0.
        return 1 if float(s) == 1 else 0
    except ValueError:
        pass  # not a number: fall through to the textual forms
    return 1 if (s in {"true","yes","y"} or "bot" in s) else 0

# Keep only the text and label columns and drop rows missing either value.
df = df[[text_col, label_col]].dropna()

# Cleaned texts and 0/1 labels as plain Python lists.
text_series = df[text_col].astype(str).apply(clean)
label_series = df[label_col].apply(to01).astype(int)
X = text_series.tolist()
y = label_series.tolist()

# Optional cap: keep the first MAX_N examples.
if MAX_N and len(X) > MAX_N:
    X = X[:MAX_N]
    y = y[:MAX_N]
print(f"tamanho do conjunto usado: {len(X)}")

# Stratified 80/20 split with a fixed random state.
Xtr, Xte, ytr, yte = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Seed the Python, NumPy and backend random generators before any weights are created,
# so that initialisation (and the shuffling done later) is repeatable between runs.
# 42 matches the random_state used for the train/test split.
keras.utils.set_random_seed(42)

# Preprocessor taken from the pretrained preset; pads/truncates to SEQ_LEN.
preproc = keras_nlp.models.BertPreprocessor.from_preset(
    "bert_base_en_uncased",
    sequence_length=SEQ_LEN
)

# Small BERT backbone built from explicit sizes (no preset weights are loaded here).
# Its vocabulary size is read from the preprocessor's tokenizer so the two stay in sync.
mini_backbone = keras_nlp.models.BertBackbone(
    vocabulary_size=preproc.tokenizer.vocabulary_size(),
    num_layers=2,
    num_heads=2,
    hidden_dim=128,
    intermediate_dim=256,
    max_sequence_length=SEQ_LEN,
)

# Two-class classifier that bundles the preprocessor with the backbone.
model = keras_nlp.models.BertClassifier(
    backbone=mini_backbone,
    preprocessor=preproc,
    num_classes=2,
)

AUTOTUNE = tf.data.AUTOTUNE

def _as_dataset(texts, labels):
    """Wrap parallel text/label lists in an unbatched tf.data.Dataset (labels as int64)."""
    label_array = np.array(labels, dtype=np.int64)
    return tf.data.Dataset.from_tensor_slices((tf.constant(texts), tf.constant(label_array)))

# Training pipeline: shuffle (buffer capped at 2048), batch, prefetch.
shuffle_buffer = min(2048, len(Xtr))
train_ds = _as_dataset(Xtr, ytr).shuffle(shuffle_buffer).batch(BATCH).prefetch(AUTOTUNE)

# Validation pipeline: same as above but without shuffling.
val_ds = _as_dataset(Xte, yte).batch(BATCH).prefetch(AUTOTUNE)


# Capped views: only the first CAP_* batches of each pipeline are consumed.
train_ds_fast = train_ds.take(CAP_TRAIN_STEPS)
val_ds_fast = val_ds.take(CAP_VAL_STEPS)


# Optimiser, loss (the classifier outputs logits) and the accuracy metric reported as "acc".
opt = keras.optimizers.Adam(learning_rate=2e-4)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [keras.metrics.SparseCategoricalAccuracy(name="acc")]

model.compile(
    optimizer=opt,
    loss=loss,
    metrics=metrics,
    run_eagerly=True,
    jit_compile=False,
)

# Train on the capped pipelines, one log line per epoch.
print("Treinando (leve e capado)…")
history = model.fit(
    train_ds_fast,
    validation_data=val_ds_fast,
    epochs=EPOCHS,
    verbose=2,
)


# Predict on the capped validation batches and take the highest-scoring class per row.
logits = model.predict(val_ds_fast, verbose=0)
yp = np.argmax(logits, axis=1)

# Reference labels gathered from the same (unshuffled) batches, in the same order.
yref = np.concatenate(
    [batch_labels.numpy() for _, batch_labels in val_ds_fast],
    axis=0,
)

# Accuracy plus precision / recall / F1 for the positive class (1).
acc = accuracy_score(yref, yp)
prec, rec, f1, _support = precision_recall_fscore_support(
    yref,
    yp,
    average="binary",
    pos_label=1,
    zero_division=0,
)
print(f"VAL(capado)  acc={acc:.4f}  prec={prec:.4f}  rec={rec:.4f}  f1={f1:.4f}")


# Persist the trained model in the native .keras format.
os.makedirs("artifacts", exist_ok=True)
model.save("artifacts/model_keras.keras")


def _write_json(path, payload):
    """Write ``payload`` to ``path`` as JSON indented by two spaces."""
    with open(path, "w") as f:
        json.dump(payload, f, indent=2)


# Side-car files: preprocessing/backbone settings, label names and the measured metrics.
_write_json(
    "artifacts/tokenizer_info.json",
    {"preset_vocab": "bert_base_en_uncased", "sequence_length": SEQ_LEN,
     "backbone_config": {"layers":2,"heads":2,"hidden":128,"intermediate":256}},
)
_write_json(
    "artifacts/label_mapping.json",
    {"id2label": {0:"human", 1:"bot"}},
)
_write_json(
    "artifacts/metrics_test.json",
    {"accuracy":float(acc),"precision":float(prec),"recall":float(rec),"f1":float(f1)},
)


# Confusion matrix of the capped validation predictions, saved as a PNG.
# Rows of `cm` are the reference labels and columns the predictions, hence the axis labels.
cm = confusion_matrix(yref, yp, labels=[0,1])
fig, ax = plt.subplots(figsize=(3,3))
ax.imshow(cm)
ax.set_title("Confusão (val capado)")
ax.set_xticks([0,1]); ax.set_xticklabels(["human","bot"])
ax.set_yticks([0,1]); ax.set_yticklabels(["human","bot"])
ax.set_xlabel("Predito")
ax.set_ylabel("Real")
# Write each count in the centre of its cell (x = column, y = row).
for i in range(2):
    for j in range(2):
        ax.text(j, i, str(cm[i,j]), ha="center", va="center")
fig.tight_layout()
fig.savefig("artifacts/confusion_matrix.png")
plt.close(fig)

# List the files this cell actually wrote (the model is a single .keras file, not a folder).
print("Entregáveis em artifacts/: model_keras.keras, tokenizer_info.json, label_mapping.json, metrics_test.json, confusion_matrix.png")


CSV: bot_detection_data.csv
cols -> text: Tweet | label: Bot Label
tamanho do conjunto usado: 3000
Treinando (leve e capado)…
Epoch 1/2
40/40 - 25s - 620ms/step - acc: 0.5375 - loss: 0.7065 - val_acc: 0.7000 - val_loss: 0.6577
Epoch 2/2
40/40 - 38s - 960ms/step - acc: 0.5250 - loss: 0.6955 - val_acc: 0.3500 - val_loss: 0.6933
VAL(capado)  acc=0.3500  prec=1.0000  rec=0.0714  f1=0.1333
Entregáveis em artifacts/: model_keras/, tokenizer_info.json, label_mapping.json, metrics_test.json


In [11]:

import os, json, shutil, datetime


# Fail fast with a clear message when the training cell has not populated the kernel:
# merely referencing the names raises NameError if any of them is undefined.
try:
    _ = model, acc, prec, rec, f1, SEQ_LEN
except NameError as missing:
    raise RuntimeError(
        "Rode a célula de TREINO antes desta (model/metrics não estão na memória)."
    ) from missing

os.makedirs("artifacts", exist_ok=True)

# Native Keras archive.
model_path = "artifacts/model_keras.keras"
model.save(model_path)
print("✔ Modelo salvo em:", model_path)

# Optional SavedModel export: a failure is reported but does not stop the cell.
# `saved_export` records whether the export succeeded.
try:
    model.export("artifacts/model_savedmodel")
except Exception as e:
    saved_export = False
    print("Export SavedModel (opcional) não feita:", e)
else:
    saved_export = True
    print("SavedModel exportado em artifacts/model_savedmodel/")


# JSON side-car files, keyed by destination path and written in this order:
# preprocessing/backbone settings, label names, measured metrics.
_json_artifacts = {
    "artifacts/tokenizer_info.json": {
        "preset_vocab": "bert_base_en_uncased",
        "sequence_length": int(SEQ_LEN),
        "backbone_config": {"layers":2,"heads":2,"hidden":128,"intermediate":256},
    },
    "artifacts/label_mapping.json": {
        "id2label": {0:"human", 1:"bot"},
    },
    "artifacts/metrics_test.json": {
        "accuracy": float(acc),
        "precision": float(prec),
        "recall": float(rec),
        "f1": float(f1),
    },
}
for _artifact_path, _payload in _json_artifacts.items():
    with open(_artifact_path, "w") as f:
        json.dump(_payload, f, indent=2)

# Report: a Markdown summary of the run, written to artifacts/Relatorio_Projeto.md.
# Column names and the epoch count are taken from the variables set by the training cell
# (text_col, label_col, EPOCHS) so the report describes the run that actually happened.
hoje = datetime.date.today().isoformat()
lines = [
"# Detecção de Bots no Twitter com BERT (Keras)",
f"**Data:** {hoje}",
"",
"## 1. Objetivo",
"Treinar uma rede neural **BERT** no **Keras** para detectar bots no Twitter utilizando o dataset **Twitter-Bot Detection** (Kaggle).",
"",
"## 2. Dados (Kaggle)",
"- Conjunto: *Twitter-Bot Detection Dataset*.",
f"- Colunas detectadas no CSV: texto = `{text_col}` | rótulo = `{label_col}`.",
"- Rotulagem binária mapeada para 0: human, 1: bot.",
"",
"## 3. Preparação",
"- Limpezas: remoção de URLs, normalização de menções para `@user`, remoção de `#` mantendo a palavra.",
"- Divisão: treino 80% / teste 20% (stratify).",
f"- Tamanho de sequência para o modelo: **{int(SEQ_LEN)}** tokens.",
"",
"## 4. Modelo",
"- **BERT (KerasNLP)** *mini* treinado do zero (leve para Colab):",
"  - camadas = 2, cabeças = 2, dimensão oculta = 128, intermediária = 256.",
"- Perda: `SparseCategoricalCrossentropy(from_logits=True)`.",
"- Otimizador: Adam.",
"- Métricas: Acurácia + Precisão/Recall/F1 (classe bot).",
"",
"## 5. Treinamento",
"- Subamostragem do dataset para caber na RAM.",
"- Limite de passos por época (CAP_TRAIN_STEPS/CAP_VAL_STEPS).",
f"- Épocas: {int(EPOCHS)}.",
"",
"## 6. Resultados (val capado)",
f"- **Accuracy:** {acc:.4f}",
f"- **Precision (bot):** {prec:.4f}",
f"- **Recall (bot):** {rec:.4f}",
f"- **F1 (bot):** {f1:.4f}",
"",
"## 7. Discussão",
"- Desempenho condizente com BERT pequeno sem pré-treino e orçamento de passos reduzido.",
"- Melhorias possíveis: usar preset pré-treinado, mais épocas, ajuste de `sequence_length`/`batch`, class weights, e pequenas augments de texto.",
"",
"## 8. Reprodutibilidade",
"1) Instalar dependências (TF 2.19, Keras 3, KerasNLP).",
"2) Colocar o CSV do Kaggle em `data/`.",
"3) Executar a célula de treino (mini-BERT).",
"4) Executar esta célula de salvamento/relatório.",
"",
"## 9. Como carregar o modelo para inferência",
"```python",
# keras_nlp must be imported in the loading process so its model classes are registered
# for deserialization; without it load_model cannot resolve the classifier class.
"import keras, keras_nlp, numpy as np",
'm = keras.models.load_model("artifacts/model_keras.keras")',
'logits = m.predict(["This account tweets the same link every hour"])',
"pred = int(np.argmax(logits, axis=1)[0])",
'print("bot" if pred==1 else "human")',
"```",
"",
"## 10. Arquivos Entregues",
"- `artifacts/model_keras.keras` (modelo treinado)",
"- `artifacts/tokenizer_info.json` (config de pré-processamento/seq_len)",
"- `artifacts/label_mapping.json`",
"- `artifacts/metrics_test.json`",
"- `artifacts/confusion_matrix.png`",
"- `artifacts/Relatorio_Projeto.md`",
]
with open("artifacts/Relatorio_Projeto.md","w", encoding="utf-8") as f:
    f.write("\n".join(lines))


# Bundle everything under artifacts/ into ENTREGA_FINAL.zip in the working directory.
zip_path = shutil.make_archive(
    base_name="ENTREGA_FINAL",
    format="zip",
    root_dir="artifacts",
)
print("ENTREGA_FINAL gerado:", zip_path)


✔ Modelo salvo em: artifacts/model_keras.keras
Saved artifact at 'artifacts/model_savedmodel'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): Dict[['token_ids', TensorSpec(shape=(None, None), dtype=tf.int32, name='token_ids')], ['segment_ids', TensorSpec(shape=(None, None), dtype=tf.int32, name='segment_ids')], ['padding_mask', TensorSpec(shape=(None, None), dtype=tf.int32, name='padding_mask')]]
Output Type:
  TensorSpec(shape=(None, 2), dtype=tf.float32, name=None)
Captures:
  138323966703184: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138323966699152: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138323966691472: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138323966704528: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138323966692048: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138323966690896: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138323966691280: TensorSpec(shape=(), dtype=tf.resource,