In [3]:
# ==========================================
# 0. PACKAGE IMPORTS
# ==========================================

In [4]:
import pandas as pd
import numpy as np
import re
import joblib
from pandas import json_normalize

# Les 3 Rois du Boosting
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [5]:
# ==========================================
# 1. FEATURE ENGINEERING (ULTIME)
# ==========================================

In [6]:
def extract_source(source_html):
    """Return the text found between the first ``>`` and the next ``<``.

    Intended for the HTML anchor stored in a tweet's ``source`` field: the
    text captured is the anchor's visible label.

    Parameters
    ----------
    source_html : object
        Raw ``source`` value; anything that is not missing is converted with
        ``str`` before matching.

    Returns
    -------
    str
        The captured text, or ``"Unknown"`` when the value is missing or
        contains no ``>...<`` span.
    """
    fallback = "Unknown"

    # Missing values (None / NaN) carry no client information.
    if pd.isna(source_html):
        return fallback

    # Non-greedy capture so only the first tag's content is taken.
    found = re.search(r'>(.*?)<', str(source_html))
    if found is None:
        return fallback
    return found.group(1)

In [7]:
def feature_engineering(df):
    """Append the engineered user/tweet features to ``df`` and return it.

    The frame is modified in place: missing input columns are created with
    neutral defaults, derived columns are added, and the same object is
    returned so the call can be used as ``df = feature_engineering(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Flattened tweet records. Every input column is optional; the ones
        read here are the count columns listed in ``count_cols``,
        ``created_at``, ``user.created_at``, ``text``,
        ``extended_tweet.full_text``, ``user.description``,
        ``user.default_profile_image``, ``user.default_profile`` and
        ``source``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the added columns (log counts, ratios, account
        age, text statistics, profile flags and ``source_category``).
    """
    print("... Génération des Features ...")

    # 1. Count columns: create when absent, coerce to numbers, missing -> 0.
    #    Non-numeric garbage becomes 0 as well, and negative values are
    #    clipped to 0 so that log1p below can never produce -inf / NaN.
    count_cols = ['user.listed_count', 'user.favourites_count', 'user.statuses_count',
                  'quote_count', 'favorite_count', 'retweet_count', 'reply_count',
                  'user.followers_count', 'user.friends_count']
    for col in count_cols:
        if col not in df.columns:
            df[col] = 0
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).clip(lower=0)

    # 2. Log-scaled counts and ratios.
    df['log_followers'] = np.log1p(df['user.followers_count'])
    df['log_friends'] = np.log1p(df['user.friends_count'])
    df['ratio_log'] = df['log_followers'] - df['log_friends']
    df['log_listed'] = np.log1p(df['user.listed_count'])
    df['log_statuses'] = np.log1p(df['user.statuses_count'])

    # Activity ratio (likes given / tweets posted); +1 on both sides avoids
    # a division by zero.
    df['activity_ratio'] = (df['user.favourites_count'] + 1) / (df['user.statuses_count'] + 1)

    # 3. Account age in days at the time of the tweet.
    if 'created_at' in df.columns and 'user.created_at' in df.columns:
        # Unparseable dates become NaT; both sides are made timezone-naive
        # so they can be subtracted.
        df['tweet_date'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True).dt.tz_localize(None)
        df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)
        age_days = (df['tweet_date'] - df['user_date']).dt.days
        # A tweet dated before the account creation would give a negative
        # age; -1 in particular makes the growth-rate denominator zero.
        df['account_age_days'] = age_days.fillna(0).clip(lower=0)
    else:
        df['account_age_days'] = 0

    # Followers gained per day of existence (+1 avoids a division by zero).
    df['growth_rate'] = df['user.followers_count'] / (df['account_age_days'] + 1)

    # 4. Tweet text: prefer the extended full text, fall back to ``text``.
    #    Missing values become the empty string (not the literal "nan").
    if 'text' in df.columns:
        tweet_text = df['text'].astype(object)
    else:
        tweet_text = pd.Series("", index=df.index, dtype=object)
    if 'extended_tweet.full_text' in df.columns:
        full_text = df['extended_tweet.full_text'].astype(object)
        tweet_text = full_text.where(full_text.notna(), tweet_text)
    df['final_text'] = tweet_text.where(tweet_text.notna(), "").astype(str)

    def calc_diversity(text):
        # Share of distinct whitespace-separated tokens; 0 for empty text.
        words = text.split()
        if len(words) == 0:
            return 0
        return len(set(words)) / len(words)

    def calc_caps_ratio(text):
        # Upper-case characters over length (+1 avoids a division by zero).
        return sum(1 for c in text if c.isupper()) / (len(text) + 1)

    final_text = df['final_text']
    df['lexical_diversity'] = final_text.apply(calc_diversity)
    df['caps_ratio'] = final_text.apply(calc_caps_ratio)
    df['length_char'] = final_text.apply(len)
    df['num_hashtags'] = final_text.apply(lambda x: x.count('#'))
    df['num_mentions'] = final_text.apply(lambda x: x.count('@'))

    # 5. Profile description; coerced to str so len()/lower() cannot fail
    #    on a non-string value.
    if 'user.description' not in df.columns:
        df['user.description'] = ""
    description = df['user.description'].astype(object)
    df['user_desc'] = description.where(description.notna(), "").astype(str)
    df['desc_len'] = df['user_desc'].apply(len)

    # Substring match (case-insensitive) against "professional" keywords.
    pro_keywords = ('official', 'officiel', 'media', 'news', 'presse', 'journaliste')

    def is_pro(desc):
        lowered = desc.lower()
        return 1 if any(word in lowered for word in pro_keywords) else 0

    df['desc_is_pro'] = df['user_desc'].apply(is_pro)

    # "Default profile" indicators; an absent column is treated as "not default".
    for raw_col, flag_col in (('user.default_profile_image', 'is_default_image'),
                              ('user.default_profile', 'is_default_profile')):
        if raw_col in df.columns:
            df[flag_col] = df[raw_col].fillna(False).astype(int)
        else:
            df[flag_col] = 0

    # Posting client: keep a fixed list of frequent sources, bucket the rest.
    if 'source' not in df.columns:
        df['source'] = ""
    df['source_clean'] = df['source'].apply(extract_source)
    top_sources = ['Twitter for iPhone', 'Twitter for Android', 'Twitter Web App',
                   'TweetDeck', 'Hootsuite', 'Buffer']
    source_clean = df['source_clean']
    df['source_category'] = source_clean.where(source_clean.isin(top_sources), 'Other')

    return df

In [8]:
# ==========================================
# 2. PRÉPARATION DU PIPELINE
# ==========================================

In [9]:
def _load_flat_jsonl(path):
    """Read a JSON-lines file and flatten nested objects into dotted columns."""
    records = pd.read_json(path, lines=True).to_dict(orient='records')
    return json_normalize(records)


def run_stacking(train_path='train.jsonl', test_path='kaggle_test.jsonl',
                 output_path='Prediction_stacking.csv'):
    """Train the stacked classifier and write the test-set predictions to CSV.

    Steps: load and flatten both JSON-lines files, run
    ``feature_engineering`` on each, fit a preprocessing + stacking pipeline
    on the training frame (target column ``label``), predict on the test
    frame and write a two-column CSV (``ID`` taken from ``challenge_id``,
    ``Prediction``).

    Parameters
    ----------
    train_path : str, default 'train.jsonl'
        JSON-lines file with the labelled training records.
    test_path : str, default 'kaggle_test.jsonl'
        JSON-lines file with the records to predict.
    output_path : str, default 'Prediction_stacking.csv'
        Destination of the submission file.

    Raises
    ------
    KeyError
        If the training data has no ``label`` column or the test data has no
        ``challenge_id`` column. Checked right after loading, before the
        expensive fit.
    """
    print("--- 1. Chargement des données ---")
    train_df = _load_flat_jsonl(train_path)
    test_df = _load_flat_jsonl(test_path)

    # Fail fast: without this check a missing ID column would only surface
    # after the whole stack has been trained.
    if 'label' not in train_df.columns:
        raise KeyError(f"'label' column not found in training data ({train_path})")
    if 'challenge_id' not in test_df.columns:
        raise KeyError(f"'challenge_id' column not found in test data ({test_path})")

    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # Columns fed to the model; everything else in the frames (including
    # 'label') is dropped by the ColumnTransformer's default remainder.
    numeric_features = [
        # User statistics
        'user.listed_count', 'user.favourites_count', 'user.statuses_count',
        'log_listed', 'log_statuses', 'ratio_log', 'activity_ratio', 'growth_rate',
        # Tweet statistics
        'quote_count', 'favorite_count', 'retweet_count', 'reply_count',
        # Text analysis
        'caps_ratio', 'length_char', 'lexical_diversity', 'num_hashtags', 'num_mentions',
        # Bio & profile
        'desc_len', 'desc_is_pro', 'is_default_image', 'is_default_profile',
        'account_age_days'
    ]

    categorical_features = ['source_category']
    tweet_text_col = 'final_text'
    desc_text_col = 'user_desc'

    # --- Transformers ---
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Other')),
        # Categories unseen during fit are encoded as -1 at predict time.
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # TF-IDF followed by SVD: dense low-dimensional text features.
    tweet_text_transformer = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))),
        ('svd', TruncatedSVD(n_components=50, random_state=42))
    ])

    desc_text_transformer = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),
        ('svd', TruncatedSVD(n_components=10, random_state=42))
    ])

    # The text columns are passed as plain strings (not lists) so the
    # vectorizers receive a 1-D sequence of documents.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
            ('tweet_txt', tweet_text_transformer, tweet_text_col),
            ('desc_txt', desc_text_transformer, desc_text_col)
        ]
    )

    # ==========================================
    # 3. STACKING
    # ==========================================
    print("--- 2. Configuration du Stacking ---")

    # Level 1: base learners (three boosting libraries plus a bagging model).
    # NOTE(review): the stack and most base learners all request n_jobs=-1;
    # this may oversubscribe the CPU - confirm on the target machine.
    estimators = [
        ('xgb', XGBClassifier(n_estimators=300, max_depth=8, learning_rate=0.05,
                              eval_metric='logloss', random_state=42, n_jobs=-1)),

        ('lgbm', LGBMClassifier(n_estimators=300, num_leaves=31, learning_rate=0.05,
                                random_state=42, n_jobs=-1, verbose=-1)),

        ('cat', CatBoostClassifier(iterations=300, depth=8, learning_rate=0.05,
                                   verbose=0, random_seed=42)),

        ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1))
    ]

    # Level 2: logistic-regression meta-learner combining the four base
    # models, trained with 5-fold internal cross-validation.
    stacking_clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(),
        cv=5,
        n_jobs=-1
    )

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', stacking_clf)
    ])

    # ==========================================
    # 4. TRAINING & PREDICTION
    # ==========================================
    X = train_df
    y = train_df['label']

    print("--- 3. Entraînement du Stack (Cela peut prendre 2-3 min) ---")
    # Slow step: every base model is refitted for the internal CV folds.
    model.fit(X, y)

    print("--- 4. Prédiction ---")
    predictions = model.predict(test_df)

    submission = pd.DataFrame({'ID': test_df['challenge_id'], 'Prediction': predictions})
    submission.to_csv(output_path, index=False)

    print(f"Fichier '{output_path}' généré.")



In [10]:
# Entry point: run the full load -> feature engineering -> training ->
# prediction pipeline when this code is executed as the main program.
if __name__ == "__main__":
    run_stacking()

--- 1. Chargement des données ---
... Génération des Features ...


  df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)


... Génération des Features ...


  df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)


--- 2. Configuration du Stacking ---
--- 3. Entraînement du Stack (Cela peut prendre 2-3 min) ---




--- 4. Prédiction ---




Fichier 'Prediction_stacking.csv' généré.
