In [1]:
# ==========================================
# 0. CRITICAL CONFIGURATION (LEGACY KERAS)
# ==========================================
import os
# These two lines force the use of Keras 2 for HuggingFace compatibility.
# They are set here, before TensorFlow is imported further down in this cell.
os.environ["TF_USE_LEGACY_KERAS"] = "1"
# NOTE(review): "TF_keras_1_compat" does not look like a documented TensorFlow
# switch -- confirm it has any effect, otherwise it can be dropped.
os.environ["TF_keras_1_compat"] = "1"

# --- INSTALLATION ---
# Notebook shell magic (not plain Python): quietly installs the dependencies,
# including tf-keras, the package that provides the legacy Keras 2 implementation.
!pip install -q transformers tensorflow pandas numpy scikit-learn tf-keras

import pandas as pd
import numpy as np
import re
import tensorflow as tf
from transformers import DistilBertTokenizerFast, TFDistilBertModel
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import StandardScaler
from pandas import json_normalize

# ==========================================
# 1. FEATURE ENGINEERING
# ==========================================
def advanced_feature_engineering(df):
    """Add engineered numeric and text columns to a flattened tweet DataFrame.

    The frame is modified in place and also returned, so both
    ``df = advanced_feature_engineering(df)`` and a bare call work.

    Columns added:
      * ``log_followers``, ``log_friends``, ``log_listed``, ``log_statuses``:
        ``log1p`` of the corresponding user counters.
      * ``ratio_log``: ``log_followers - log_friends``.
      * ``account_age_days``: whole days between the account creation date and
        the tweet date (0 when either date is missing or unparseable).
      * ``final_text``: the extended tweet text when present, otherwise the
        plain ``text``; an empty string when both are missing.
      * ``user_desc``, ``desc_len``, ``desc_has_http``: the user description,
        its length, and a 0/1 flag for whether it contains ``http``.
      * ``caps_ratio``: upper-case characters in ``final_text`` divided by
        ``len(final_text) + 1``.

    Missing counter columns are created and filled with 0.

    Args:
        df: DataFrame with dotted column names such as
            ``user.followers_count`` (e.g. the output of ``json_normalize``).

    Returns:
        The same DataFrame object, with the columns above added.
    """
    count_cols = ['user.listed_count', 'user.favourites_count', 'user.statuses_count',
                  'quote_count', 'favorite_count', 'retweet_count', 'reply_count',
                  'user.followers_count', 'user.friends_count']
    for col in count_cols:
        if col not in df.columns:
            df[col] = 0
        df[col] = df[col].fillna(0)

    # log1p keeps zero counts at 0 and compresses heavy-tailed counters.
    df['log_followers'] = np.log1p(df['user.followers_count'])
    df['log_friends'] = np.log1p(df['user.friends_count'])
    df['ratio_log'] = df['log_followers'] - df['log_friends']
    df['log_listed'] = np.log1p(df['user.listed_count'])
    df['log_statuses'] = np.log1p(df['user.statuses_count'])

    if 'created_at' in df.columns and 'user.created_at' in df.columns:
        # Parse as UTC, then drop the timezone so the two columns subtract cleanly.
        # Unparseable values become NaT and end up as an age of 0 below.
        df['tweet_date'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True).dt.tz_localize(None)
        df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)
        df['account_age_days'] = (df['tweet_date'] - df['user_date']).dt.days
        df['account_age_days'] = df['account_age_days'].fillna(0)
    else:
        df['account_age_days'] = 0

    # Vectorised text selection: prefer the extended text, fall back to `text`.
    # Missing values become "" rather than the literal strings "nan"/"None",
    # and this path also works on an empty frame (row-wise apply does not).
    if 'text' in df.columns:
        text = df['text'].astype(object)
    else:
        text = pd.Series("", index=df.index, dtype=object)
    if 'extended_tweet.full_text' in df.columns:
        full_text = df['extended_tweet.full_text'].astype(object)
        text = full_text.where(full_text.notna(), text)
    df['final_text'] = text.where(text.notna(), "").astype(str)

    if 'user.description' not in df.columns:
        df['user.description'] = ""
    df['user_desc'] = df['user.description'].fillna("")
    df['desc_len'] = df['user_desc'].apply(len)
    # Plain substring match; na=False maps any non-string entry to 0.
    df['desc_has_http'] = df['user_desc'].str.contains('http', regex=False, na=False).astype(int)

    # +1 in the denominator avoids division by zero for empty text.
    df['caps_ratio'] = df['final_text'].apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x) + 1))
    return df

# ==========================================
# 2. DATA PREPARATION
# ==========================================
def _load_flat_jsonl(path):
    """Read a JSON-lines file and flatten nested records into dotted columns."""
    records = pd.read_json(path, lines=True).to_dict(orient='records')
    return json_normalize(records)


print("--- Chargement des données ---")
train_df = advanced_feature_engineering(_load_flat_jsonl('train.jsonl'))
test_df = advanced_feature_engineering(_load_flat_jsonl('kaggle_test.jsonl'))

# Metadata columns fed to the dense branch of the model.
numeric_features = [
    'log_listed',
    'log_statuses',
    'ratio_log',
    'quote_count',
    'favorite_count',
    'retweet_count',
    'caps_ratio',
    'desc_len',
    'desc_has_http',
    'account_age_days',
]

# Scaling statistics come from the training split only and are reused on test.
scaler = StandardScaler().fit(train_df[numeric_features])
X_train_meta = scaler.transform(train_df[numeric_features])
X_test_meta = scaler.transform(test_df[numeric_features])
y_train = train_df['label'].to_numpy()

print("--- Tokenization BERT ---")
# Sequence length used for padding/truncation and for the model's input shape.
MAX_LEN = 64
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def batch_encode(tokenizer, texts, batch_size=256, max_length=MAX_LEN):
    """Tokenize ``texts`` in slices and return stacked id and mask tensors.

    Each slice of at most ``batch_size`` texts is padded/truncated to
    ``max_length`` tokens, then the per-slice tensors are concatenated along
    the first axis.

    Args:
        tokenizer: callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` entries.
        texts: list of strings to encode.
        batch_size: number of texts passed to the tokenizer per call.
        max_length: target token length for padding and truncation.

    Returns:
        A ``(input_ids, attention_mask)`` tuple of concatenated tensors.
    """
    encoded_chunks = [
        tokenizer(
            texts[start:start + batch_size],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='tf',
        )
        for start in range(0, len(texts), batch_size)
    ]
    id_parts = [chunk['input_ids'] for chunk in encoded_chunks]
    mask_parts = [chunk['attention_mask'] for chunk in encoded_chunks]
    return tf.concat(id_parts, 0), tf.concat(mask_parts, 0)

# Tokenize the cleaned tweet text of both splits into id / attention-mask tensors.
train_texts = train_df['final_text'].tolist()
test_texts = test_df['final_text'].tolist()
X_train_ids, X_train_mask = batch_encode(tokenizer, train_texts)
X_test_ids, X_test_mask = batch_encode(tokenizer, test_texts)

# ==========================================
# 3. HYBRID MODEL
# ==========================================
print("--- Construction du Modèle Hybride ---")

def build_model():
    """Build and compile the text + metadata binary classifier.

    The text branch runs DistilBERT on the token ids and attention mask and
    keeps the hidden state at sequence position 0. The metadata branch is a
    small dense layer over the scaled numeric features (its width is read from
    the module-level ``X_train_meta``). Both branches are concatenated and fed
    through a dense head ending in a single sigmoid unit.

    Returns:
        A compiled Keras ``Model`` taking ``[input_ids, input_mask, input_meta]``.
    """
    # Token inputs
    ids_in = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    mask_in = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')

    # DistilBERT encoder, fine-tuned end to end.
    # The original author flags use_safetensors=False as important; the reason
    # is not visible in this file.
    encoder = TFDistilBertModel.from_pretrained('distilbert-base-uncased', use_safetensors=False)
    encoder.trainable = True

    hidden_states = encoder(ids_in, attention_mask=mask_in)[0]
    first_token = hidden_states[:, 0, :]  # position 0 of every sequence
    text_branch = Dropout(0.2)(first_token)

    # Metadata branch
    meta_in = Input(shape=(X_train_meta.shape[1],), name='input_meta')
    meta_branch = Dense(32, activation='relu')(meta_in)
    meta_branch = Dropout(0.2)(meta_branch)

    # Fusion head
    fused = Concatenate()([text_branch, meta_branch])
    head = Dense(64, activation='relu')(fused)
    head = Dropout(0.2)(head)
    probability = Dense(1, activation='sigmoid')(head)

    hybrid = Model(inputs=[ids_in, mask_in, meta_in], outputs=probability)
    hybrid.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=2e-5),
        metrics=['accuracy'],
    )
    return hybrid

model = build_model()
model.summary()

# ==========================================
# 4. TRAINING & PREDICTION
# ==========================================
print("--- Début de l'entraînement ---")

# Single source of truth for the checkpoint location (written during fit,
# read back before predicting).
CHECKPOINT_PATH = "bert_hybrid.h5"

# ModelCheckpoint bug fix: 'save_format' was removed, it is not needed in legacy mode.
checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_accuracy', save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_accuracy', patience=3, mode='max', restore_best_weights=True)

history = model.fit(
    [X_train_ids, X_train_mask, X_train_meta],
    y_train,
    validation_split=0.2,
    epochs=4,
    batch_size=16,
    callbacks=[checkpoint, early_stop]
)

# With patience=3 and only 4 epochs, EarlyStopping usually never triggers, and
# in legacy Keras `restore_best_weights` only takes effect when the callback
# actually stops training. Without this reload the predictions below would use
# the last-epoch weights even when an earlier epoch scored better on validation.
if os.path.exists(CHECKPOINT_PATH):
    model.load_weights(CHECKPOINT_PATH)

print("--- Prédiction Finale ---")
preds = model.predict([X_test_ids, X_test_mask, X_test_meta], batch_size=16)
# Threshold the sigmoid output at 0.5 and flatten to a 1-D vector of 0/1 labels.
final_preds = (preds > 0.5).astype(int).reshape(-1)

submission = pd.DataFrame({'ID': test_df['challenge_id'], 'Prediction': final_preds})
submission.to_csv('submission_BERT_Hybrid.csv', index=False)

print("Fichier 'submission_BERT_Hybrid.csv' généré avec succès !")

--- Chargement des données ---


  df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)
  df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)


--- Tokenization BERT ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


--- Construction du Modèle Hybride ---


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 64)]                 0         []                            
                                                                                                  
 input_mask (InputLayer)     [(None, 64)]                 0         []                            
                                                                                                  
 tf_distil_bert_model (TFDi  TFBaseModelOutput(last_hid   6636288   ['input_ids[0][0]',           
 stilBertModel)              den_state=(None, 64, 768),   0          'input_mask[0][0]']          
                              hidden_states=None, atten                                           
                             tions=None)                                                      

  saving_api.save_model(


Epoch 2/4
Epoch 3/4
Epoch 4/4
--- Prédiction Finale ---
Fichier 'submission_BERT_Hybrid.csv' généré avec succès !
