In [1]:
# ==========================================
# 0. PACKAGE IMPORTS
# ==========================================

In [2]:
import pandas as pd
import numpy as np
import re
import json

# Deep Learning Imports
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Bidirectional, LSTM, GlobalMaxPool1D, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Sklearn pour la préparation
from sklearn.preprocessing import StandardScaler
from pandas import json_normalize

In [3]:
# ==========================================
# 1. FEATURE ENGINEERING
# ==========================================

In [4]:
def extract_source(source_html):
    """Pull the visible label out of an HTML-like ``source`` value.

    The text between the first ``>`` and the next ``<`` is returned, e.g.
    ``'<a href="...">Twitter for iPhone</a>'`` gives ``'Twitter for iPhone'``.

    Args:
        source_html: Raw ``source`` value; non-null values are converted
            with ``str`` before matching.

    Returns:
        The captured text (possibly empty), or ``"Unknown"`` when the value
        is null or contains no ``>...<`` pair.
    """
    if pd.isna(source_html):
        return "Unknown"

    found = re.search(r'>(.*?)<', str(source_html))
    if found is None:
        return "Unknown"
    return found.group(1)

In [5]:
def feature_engineering(df):
    """Add the numeric, temporal, text and profile features used downstream.

    The frame is modified in place and the same object is returned.

    Columns created: ``log_followers``, ``log_friends``, ``ratio_log``,
    ``log_listed``, ``log_statuses``, ``account_age_days`` (plus the helper
    columns ``tweet_date``/``user_date`` when both date columns exist),
    ``final_text``, ``lexical_diversity``, ``caps_ratio``,
    ``exclamation_count``, ``length_char``, ``user_desc``, ``desc_len``,
    ``desc_has_email``, ``desc_has_http``, ``desc_is_pro``, ``source_clean``
    and ``source_is_top``. Missing input columns are created with neutral
    defaults (``0`` for counts, ``""`` for text-like columns).

    Args:
        df: DataFrame with flattened (dot-separated) tweet column names.

    Returns:
        The same DataFrame, enriched with the columns listed above.
    """
    target_cols = ['user.listed_count', 'user.favourites_count', 'user.statuses_count',
                   'quote_count', 'favorite_count', 'retweet_count', 'reply_count',
                   'user.followers_count', 'user.friends_count']
    for col in target_cols:
        if col in df.columns:
            # Counts may arrive as strings or nulls; coerce them so np.log1p
            # and the scaler never receive object values.
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        else:
            df[col] = 0

    df['log_followers'] = np.log1p(df['user.followers_count'])
    df['log_friends'] = np.log1p(df['user.friends_count'])
    # Difference of logs == log of the (smoothed) followers/friends ratio.
    df['ratio_log'] = df['log_followers'] - df['log_friends']
    df['log_listed'] = np.log1p(df['user.listed_count'])
    df['log_statuses'] = np.log1p(df['user.statuses_count'])

    # Time: account age (in days) at the moment the tweet was posted.
    if 'created_at' in df.columns and 'user.created_at' in df.columns:
        df['tweet_date'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True).dt.tz_localize(None)
        df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)
        df['account_age_days'] = (df['tweet_date'] - df['user_date']).dt.days
        df['account_age_days'] = df['account_age_days'].fillna(0)
    else:
        df['account_age_days'] = 0

    # Text selection: prefer the extended full text, fall back to ``text``.
    # A missing ``text`` becomes "" (not the literal string "nan"), so it does
    # not pollute the vocabulary or the text statistics below.
    def _text_or_empty(value):
        return "" if pd.isna(value) else str(value)

    n_rows = len(df)
    short_texts = df['text'] if 'text' in df.columns else [""] * n_rows
    if 'extended_tweet.full_text' in df.columns:
        full_texts = df['extended_tweet.full_text']
    else:
        full_texts = [None] * n_rows
    # Single pass over two columns instead of a row-wise DataFrame.apply.
    df['final_text'] = [
        _text_or_empty(short) if pd.isna(full) else str(full)
        for short, full in zip(short_texts, full_texts)
    ]

    # Text stats
    def calc_diversity(text):
        """Share of distinct whitespace-separated tokens (0 for empty text)."""
        words = text.split()
        if len(words) == 0:
            return 0
        return len(set(words)) / len(words)

    df['lexical_diversity'] = df['final_text'].apply(calc_diversity)
    # +1 in the denominator avoids dividing by zero on empty text.
    df['caps_ratio'] = df['final_text'].apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x) + 1))
    df['exclamation_count'] = df['final_text'].apply(lambda x: x.count('!'))
    df['length_char'] = df['final_text'].apply(len)

    # Bio
    if 'user.description' not in df.columns:
        df['user.description'] = ""
    df['user_desc'] = df['user.description'].fillna("").map(str)
    df['desc_len'] = df['user_desc'].apply(len)
    df['desc_has_email'] = df['user_desc'].str.contains(r'[\w\.-]+@[\w\.-]+', regex=True, na=False).astype(int)
    df['desc_has_http'] = df['user_desc'].str.contains(r'http', regex=True, na=False).astype(int)
    pro_keywords = ['official', 'officiel', 'media', 'news', 'presse', 'journaliste']
    df['desc_is_pro'] = df['user_desc'].apply(lambda x: 1 if any(w in x.lower() for w in pro_keywords) else 0)

    # Source encoding (simple binary flag for the NN)
    if 'source' not in df.columns:
        df['source'] = ""
    df['source_clean'] = df['source'].apply(extract_source)
    top_sources = ['Twitter for iPhone', 'Twitter for Android', 'Twitter Web App', 'TweetDeck']
    df['source_is_top'] = df['source_clean'].apply(lambda x: 1 if x in top_sources else 0)

    return df

In [6]:
# ==========================================
# 2. PIPELINE
# ==========================================

In [11]:
def _build_hybrid_model(max_words, max_len, n_meta_features):
    """Build and compile the two-branch (text BiLSTM + metadata dense) classifier.

    Args:
        max_words: Vocabulary size used as the embedding ``input_dim``.
        max_len: Length of the padded token sequences.
        n_meta_features: Number of scaled numeric features per sample.

    Returns:
        A compiled Keras ``Model`` taking ``[text_sequences, meta_features]``
        and producing one sigmoid probability per sample.
    """
    # --- BRANCH A: TEXT (LSTM) ---
    input_text = Input(shape=(max_len,), name="input_text")
    # Embedding: maps token ids to dense vectors. The sequence length comes
    # from the Input layer, so the deprecated `input_length` argument is not passed.
    x_text = Embedding(input_dim=max_words, output_dim=128)(input_text)
    # SpatialDropout drops whole embedding channels to limit over-fitting on specific words.
    x_text = tf.keras.layers.SpatialDropout1D(0.2)(x_text)
    # Bidirectional LSTM reads the sequence in both directions.
    x_text = Bidirectional(LSTM(64, return_sequences=True))(x_text)
    # Keep the strongest activation per channel across time steps.
    x_text = GlobalMaxPool1D()(x_text)

    # --- BRANCH B: METADATA (DENSE) ---
    input_meta = Input(shape=(n_meta_features,), name="input_meta")
    x_meta = Dense(64, activation='relu')(input_meta)
    x_meta = BatchNormalization()(x_meta)
    x_meta = Dropout(0.3)(x_meta)
    x_meta = Dense(32, activation='relu')(x_meta)

    # --- FUSION ---
    x = Concatenate()([x_text, x_meta])
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(32, activation='relu')(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input_text, input_meta], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model


def run_dl(seed=42):
    """Run the full pipeline: load, featurize, train the hybrid model, predict.

    Reads ``train.jsonl`` and ``kaggle_test.jsonl`` from the working directory,
    writes the best checkpoint to ``best_model_dl.h5`` and the binary
    predictions to ``Prediction_dl.csv``.

    Args:
        seed: Seed applied to Python, NumPy and TensorFlow RNGs before the
            model is built, so runs are repeatable. Pass ``None`` to keep the
            previous unseeded behaviour.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    if seed is not None:
        tf.keras.utils.set_random_seed(seed)

    print("--- 1. Chargement & Feature Engineering ---")
    train_df = pd.read_json('train.jsonl', lines=True)
    test_df = pd.read_json('kaggle_test.jsonl', lines=True)

    # Flatten nested objects into dot-separated columns (e.g. 'user.followers_count').
    train_df = json_normalize(train_df.to_dict(orient='records'))
    test_df = json_normalize(test_df.to_dict(orient='records'))

    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # --- NUMERIC FEATURE SELECTION ---
    numeric_features = [
        'user.listed_count', 'user.favourites_count', 'user.statuses_count',
        'log_listed', 'log_statuses', 'ratio_log',
        'quote_count', 'favorite_count', 'retweet_count', 'reply_count',
        'caps_ratio', 'exclamation_count', 'length_char', 'lexical_diversity',
        'desc_len', 'desc_has_email', 'desc_has_http', 'desc_is_pro',
        'account_age_days', 'source_is_top'
    ]

    # Standardisation (critical for neural networks); fitted on train only
    # so no test statistics leak into the scaler.
    scaler = StandardScaler()
    X_train_meta = scaler.fit_transform(train_df[numeric_features])
    X_test_meta = scaler.transform(test_df[numeric_features])

    y_train = train_df['label'].values

    # --- TEXT PREPARATION (TOKENIZATION) ---
    print("--- 2. Tokenization du Texte (NLP) ---")
    MAX_WORDS = 20000     # Vocabulary size
    MAX_LEN = 100         # Maximum number of tokens kept per tweet

    tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_df['final_text'])

    X_train_text = tokenizer.texts_to_sequences(train_df['final_text'])
    X_test_text = tokenizer.texts_to_sequences(test_df['final_text'])

    # Padding so that every sequence has the same length
    X_train_text = pad_sequences(X_train_text, maxlen=MAX_LEN, padding='post', truncating='post')
    X_test_text = pad_sequences(X_test_text, maxlen=MAX_LEN, padding='post', truncating='post')

    print(f"Vocabulaire : {len(tokenizer.word_index)} mots uniques")
    print(f"Shape Meta : {X_train_meta.shape} | Shape Text : {X_train_text.shape}")

    # ==========================================
    # 3. HYBRID MODEL ARCHITECTURE
    # ==========================================

    print("--- 3. Construction du Cerveau Hybride ---")
    model = _build_hybrid_model(MAX_WORDS, MAX_LEN, X_train_meta.shape[1])
    model.summary()

    # ==========================================
    # 4. TRAINING
    # ==========================================

    print("--- 4. Entraînement ---")

    # Callbacks: keep the best model on disk and stop when validation stalls
    checkpoint = ModelCheckpoint("best_model_dl.h5", monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)
    early_stop = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)

    # 20% of the training set is held out for validation during training.
    # NOTE(review): Keras takes the *last* 20% of the arrays (before shuffling);
    # confirm the training file is not ordered by label or time.
    history = model.fit(
        [X_train_text, X_train_meta], y_train,
        batch_size=32,
        epochs=20,  # Upper bound; early_stop ends training sooner if needed
        validation_split=0.2,
        callbacks=[checkpoint, early_stop, reduce_lr]
    )

    # ==========================================
    # 5. PREDICTION
    # ==========================================

    print("--- 5. Prédiction Finale ---")
    # Reload the best checkpoint to be sure the best weights are used
    model.load_weights("best_model_dl.h5")

    preds = model.predict([X_test_text, X_test_meta], batch_size=32)
    # The sigmoid outputs a probability in [0, 1]; threshold at 0.5
    preds_binary = (preds > 0.5).astype(int).reshape(-1)

    submission = pd.DataFrame({'ID': test_df['challenge_id'], 'Prediction': preds_binary})
    submission.to_csv('Prediction_dl.csv', index=False)
    print("Fichier 'Prediction_dl.csv' généré avec succès !")

In [12]:
# Entry point: launch the full pipeline when this cell (or the exported
# script) is executed directly rather than imported as a module.
if __name__ == "__main__":
    run_dl()

--- 1. Chargement & Feature Engineering ---


  df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)
  df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)


--- 2. Tokenization du Texte (NLP) ---
Vocabulaire : 221685 mots uniques
Shape Meta : (154914, 20) | Shape Text : (154914, 100)
--- 3. Construction du Cerveau Hybride ---




--- 4. Entraînement ---
Epoch 1/20
[1m3873/3873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 0.7960 - loss: 0.4561
Epoch 1: val_accuracy improved from -inf to 0.82910, saving model to best_model_dl.h5




[1m3873/3873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m587s[0m 150ms/step - accuracy: 0.7960 - loss: 0.4561 - val_accuracy: 0.8291 - val_loss: 0.3910 - learning_rate: 0.0010
Epoch 2/20
[1m3873/3873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.8335 - loss: 0.3815
Epoch 2: val_accuracy did not improve from 0.82910
[1m3873/3873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m595s[0m 143ms/step - accuracy: 0.8335 - loss: 0.3815 - val_accuracy: 0.8277 - val_loss: 0.3936 - learning_rate: 0.0010
Epoch 3/20
[1m3873/3873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.8561 - loss: 0.3365
Epoch 3: val_accuracy did not improve from 0.82910

Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m3873/3873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m555s[0m 143ms/step - accuracy: 0.8561 - loss: 0.3365 - val_accuracy: 0