In [19]:
# ==========================================
# 0. PACKAGE IMPORTS
# ==========================================

In [20]:
import pandas as pd
import numpy as np
import json
import re
from pandas import json_normalize
from datetime import datetime
import joblib

# Scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [21]:
# ==========================================
# 1. FEATURE ENGINEERING (VERSION 5 - TIME & LEXICAL)
# ==========================================

In [22]:
def extract_source(source_html):
    """Extract the client name from a tweet's `source` HTML snippet.

    The value is stringified and the text between the first '>' and the
    following '<' is returned (e.g. the anchor text of an `<a ...>` tag).

    Args:
        source_html: Raw `source` value; may be a missing value.

    Returns:
        The captured text (possibly an empty string), or "Unknown" when the
        value is missing or contains no '>...<' span.
    """
    if not pd.isna(source_html):
        found = re.search(r'>(.*?)<', str(source_html))
        if found:
            return found.group(1)
    return "Unknown"

In [23]:
# Timestamp layout of Twitter API dates, e.g. "Fri Nov 06 10:58:12 +0000 2015".
_TWITTER_DATE_FORMAT = '%a %b %d %H:%M:%S %z %Y'


def _to_naive_utc(values):
    """Parse a date column into timezone-naive UTC timestamps (unparseable -> NaT)."""
    is_textual = pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)
    if is_textual:
        # Fast path: an explicit format avoids per-element format guessing
        # (and the associated pandas warning) on Twitter-style strings.
        parsed = pd.to_datetime(values, format=_TWITTER_DATE_FORMAT, errors='coerce', utc=True)
        if not (parsed.isna() & values.notna()).any():
            return parsed.dt.tz_localize(None)
        # At least one non-null value uses another layout: fall back to
        # pandas' generic parsing for the whole column.
    parsed = pd.to_datetime(values, errors='coerce', utc=True)
    # Drop the timezone so the two date columns can be subtracted as naive values.
    return parsed.dt.tz_localize(None)


def _tweet_text(df):
    """Pick each tweet's text: extended full text when present, else `text`, else ''."""
    if 'text' in df.columns:
        text = df['text'].astype(object)
    else:
        text = pd.Series('', index=df.index, dtype=object)
    if 'extended_tweet.full_text' in df.columns:
        full = df['extended_tweet.full_text'].astype(object)
        text = full.where(full.notna(), text)
    # Missing text becomes an empty string rather than the literal 'nan'/'None'.
    return text.where(text.notna(), '').map(str)


def _lexical_diversity(text):
    """Return unique-token / total-token ratio of whitespace-split text (0 if empty)."""
    words = text.split()
    if len(words) == 0:
        return 0
    return len(set(words)) / len(words)


def feature_engineering(df):
    """Add the model's hand-crafted features to a flattened tweet DataFrame.

    The frame is modified in place and also returned. Missing input columns
    are created with neutral defaults (0 for counts, "" for text fields).

    Columns added:
        - log_followers, log_friends, ratio_log, log_listed, log_statuses
        - tweet_date, user_date (only when both date columns exist),
          account_age_days (0 when unknown)
        - final_text, lexical_diversity, caps_ratio, exclamation_count,
          length_char
        - user_desc, desc_len, desc_has_email, desc_has_http, desc_is_pro
        - source_clean, source_category

    Args:
        df: DataFrame with dotted column names such as 'user.followers_count'.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    print("--- Génération des features ---")

    # --- A. Numeric clean-up: make sure every count column exists and has no NaN ---
    target_cols = [
        'user.listed_count', 'user.favourites_count', 'user.statuses_count',
        'quote_count', 'favorite_count', 'retweet_count', 'reply_count',
        'user.followers_count', 'user.friends_count'
    ]
    for col in target_cols:
        if col not in df.columns:
            df[col] = 0
        df[col] = df[col].fillna(0)

    # --- B. Log-scaled counts and followers/friends log-ratio ---
    df['log_followers'] = np.log1p(df['user.followers_count'])
    df['log_friends'] = np.log1p(df['user.friends_count'])
    df['ratio_log'] = df['log_followers'] - df['log_friends']
    df['log_listed'] = np.log1p(df['user.listed_count'])
    df['log_statuses'] = np.log1p(df['user.statuses_count'])

    # --- C. Time: account age (in days) at the moment of the tweet ---
    if 'created_at' in df.columns and 'user.created_at' in df.columns:
        df['tweet_date'] = _to_naive_utc(df['created_at'])
        df['user_date'] = _to_naive_utc(df['user.created_at'])

        df['account_age_days'] = (df['tweet_date'] - df['user_date']).dt.days
        # Unknown age (either date missing/unparseable) is encoded as 0.
        df['account_age_days'] = df['account_age_days'].fillna(0)
    else:
        df['account_age_days'] = 0

    # --- D. Tweet text statistics ---
    df['final_text'] = _tweet_text(df)

    df['lexical_diversity'] = df['final_text'].apply(_lexical_diversity)
    # +1 in the denominator avoids a division by zero on empty text.
    df['caps_ratio'] = df['final_text'].apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x)+1))
    df['exclamation_count'] = df['final_text'].apply(lambda x: x.count('!'))
    df['length_char'] = df['final_text'].apply(len)

    # --- E. User bio & posting client ---
    if 'user.description' not in df.columns:
        df['user.description'] = ""
    df['user_desc'] = df['user.description'].fillna("")

    df['desc_len'] = df['user_desc'].apply(len)
    df['desc_has_email'] = df['user_desc'].str.contains(r'[\w\.-]+@[\w\.-]+', regex=True).fillna(0).astype(int)
    df['desc_has_http'] = df['user_desc'].str.contains(r'http', regex=True).fillna(0).astype(int)

    # Flag bios containing any of these keywords (case-insensitive substring match).
    pro_keywords = ['official', 'officiel', 'media', 'news', 'presse', 'journaliste']
    df['desc_is_pro'] = df['user_desc'].apply(lambda x: 1 if any(w in x.lower() for w in pro_keywords) else 0)

    if 'source' not in df.columns:
        df['source'] = ""
    df['source_clean'] = df['source'].apply(extract_source)
    # Every client outside this short list is bucketed as 'Other'.
    top_sources = ['Twitter for iPhone', 'Twitter for Android', 'Twitter Web App', 'TweetDeck']
    df['source_category'] = df['source_clean'].apply(lambda x: x if x in top_sources else 'Other')

    return df

In [24]:
# ==========================================
# 2. PIPELINE DE VOTING
# ==========================================

In [25]:
def run():
    """Train the soft-voting ensemble end to end and write its artefacts.

    Steps, in order:
        1. Read 'train.jsonl' and 'kaggle_test.jsonl' (JSON lines) from the
           working directory and flatten nested records into dotted columns.
        2. Apply `feature_engineering` to both frames.
        3. Build a preprocessing + VotingClassifier pipeline.
        4. Report 5-fold stratified cross-validation accuracy on the train set.
        5. Fit on the full train set, save the pipeline to
           'model_detector_bots_v5.pkl', and write test predictions to
           'submission_v5_ensemble.csv'.

    Returns:
        None. Progress and scores are printed to stdout.
    """
    print("Chargement des données...")
    raw_train = pd.read_json('train.jsonl', lines=True)
    raw_test = pd.read_json('kaggle_test.jsonl', lines=True)

    print("Feature Engineering...")
    # Round-trip through records so json_normalize can expand nested objects
    # into dotted column names such as 'user.listed_count'.
    flat_train = json_normalize(raw_train.to_dict(orient='records'))
    flat_test = json_normalize(raw_test.to_dict(orient='records'))

    train_df = feature_engineering(flat_train)
    test_df = feature_engineering(flat_test)

    # --- Feature groups (concatenation order defines the model's column order) ---
    user_counts = ['user.listed_count', 'user.favourites_count', 'user.statuses_count']
    log_stats = ['log_listed', 'log_statuses', 'ratio_log']
    engagement = ['quote_count', 'favorite_count', 'retweet_count', 'reply_count']
    text_stats = ['caps_ratio', 'exclamation_count', 'length_char', 'lexical_diversity']
    bio_stats = ['desc_len', 'desc_has_email', 'desc_has_http', 'desc_is_pro']
    time_stats = ['account_age_days']
    numeric_features = user_counts + log_stats + engagement + text_stats + bio_stats + time_stats

    categorical_features = ['source_category']
    tweet_text_col = 'final_text'
    desc_text_col = 'user_desc'

    # --- Per-group preprocessing ---
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='Other')),
        # Categories unseen at fit time are encoded as -1 instead of raising.
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ]
    # Text columns: TF-IDF followed by SVD to get a small dense representation.
    tweet_text_steps = [
        ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))),
        ('svd', TruncatedSVD(n_components=50, random_state=42)),
    ]
    desc_text_steps = [
        ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),
        ('svd', TruncatedSVD(n_components=10, random_state=42)),
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numeric_features),
            ('cat', Pipeline(steps=categorical_steps), categorical_features),
            ('tweet_txt', Pipeline(steps=tweet_text_steps), tweet_text_col),
            ('desc_txt', Pipeline(steps=desc_text_steps), desc_text_col),
        ]
    )

    # --- Ensemble: three tree-based classifiers combined by soft voting ---
    forest_params = dict(n_estimators=300, max_depth=15, random_state=42, n_jobs=-1)
    ensemble_members = [
        ('hgb', HistGradientBoostingClassifier(max_iter=300, learning_rate=0.05, max_depth=10, random_state=42)),
        ('rf', RandomForestClassifier(**forest_params)),
        ('et', ExtraTreesClassifier(**forest_params)),
    ]
    # 'soft' averages the members' predicted probabilities rather than their votes.
    voting_model = VotingClassifier(estimators=ensemble_members, voting='soft')

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', voting_model),
    ])

    # --- Cross-validation ---
    # The whole frame is passed as X; the ColumnTransformer only picks the
    # columns listed above, so the remaining columns are not used as features.
    labels = train_df['label']

    separator = "-" * 30
    print(separator)
    print("Validation Croisée (5-Fold) sur l'Ensemble...")
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, train_df, labels, cv=folds, scoring='accuracy', n_jobs=-1)

    print(f"Scores CV : {scores}")
    print(f"Moyenne CV : {np.mean(scores)*100:.2f}% (+/- {np.std(scores)*100:.2f}%)")
    print(separator)

    # --- Final fit on all training rows ---
    print("Entraînement Final & Prédiction...")
    model.fit(train_df, labels)

    # --- Persist the fitted pipeline ---
    print("Sauvegarde du modèle en cours...")
    joblib.dump(model, 'model_detector_bots_v5.pkl')
    print("Modèle sauvegardé sous 'model_detector_bots_v5.pkl'")

    # --- Predict on the test set and write the submission file ---
    predictions = model.predict(test_df)

    submission = pd.DataFrame({'ID': test_df['challenge_id'], 'Prediction': predictions})
    submission.to_csv('submission_v5_ensemble.csv', index=False)
    print("Fichier 'submission_v5_ensemble.csv' prêt. Bonne chance !")

In [26]:
# Entry point: execute the full pipeline (load, featurize, cross-validate,
# fit, save, predict) only when this code runs as the main program.
if __name__ == "__main__":
    run()

Chargement des données...
Feature Engineering V5...
--- Génération des features (Time & Lexical - CORRIGÉ) ---


  df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)


--- Génération des features (Time & Lexical - CORRIGÉ) ---


  df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)


------------------------------
Validation Croisée (5-Fold) sur l'Ensemble...


Python(29475) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29476) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29478) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29481) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29483) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29486) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29487) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29489) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29491) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29493) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29495) Malloc

Scores CV : [0.84923991 0.84659329 0.84930446 0.84730336 0.84336066]
Moyenne CV : 84.72% (+/- 0.22%)
------------------------------
Entraînement Final & Prédiction...
Sauvegarde du modèle en cours...
Modèle sauvegardé sous 'model_detector_bots_v5.pkl'
Fichier 'submission_v5_ensemble.csv' prêt. Bonne chance !
