In [2]:
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, cross_val_score

print("=== R√âPARATION : ENTRA√éNEMENT DU MOD√àLE ARBRES (V5) SUR COLAB ===")

# 1. Make sure the data is ready. It is normally still in memory from earlier
#    cells; if either frame is gone, both are rebuilt from the raw JSONL files.
#    Both names are checked because later steps need train_df AND test_df; a
#    missing test_df would otherwise only surface as a NameError further down.
#    This cell runs at top level, so its variables live in globals().
if 'train_df' in globals() and 'test_df' in globals():
    print("Donn√©es trouv√©es en m√©moire.")
else:
    print("Rechargement des donn√©es...")
    train_df = pd.read_json('train.jsonl', lines=True)
    test_df = pd.read_json('kaggle_test.jsonl', lines=True)
    # Flatten nested records into dotted columns (e.g. 'user.listed_count').
    # pd.json_normalize is used instead of a bare json_normalize so this
    # fallback does not depend on an import made in another cell.
    train_df = pd.json_normalize(train_df.to_dict(orient='records'))
    test_df = pd.json_normalize(test_df.to_dict(orient='records'))
    # NOTE(review): advanced_feature_engineering is not defined in this cell;
    # it is presumably provided by an earlier cell -- confirm it has been run.
    train_df = advanced_feature_engineering(train_df)
    test_df = advanced_feature_engineering(test_df)

# 2. Rebuild the V5 pipeline (intended to match the version used on the Mac).
print("Configuration du Pipeline V5...")

# Column groups routed to each branch of the ColumnTransformer.
# The order of numeric_features is preserved exactly: it fixes the column
# order of the matrix handed to the classifiers.
tweet_text_col = 'final_text'
desc_text_col = 'user_desc'
categorical_features = ['source_is_top']  # simplified for Colab
numeric_features = [
    'user.listed_count',
    'user.favourites_count',
    'user.statuses_count',
    'log_listed',
    'log_statuses',
    'ratio_log',
    'quote_count',
    'favorite_count',
    'retweet_count',
    'reply_count',
    'caps_ratio',
    'exclamation_count',
    'length_char',
    'lexical_diversity',
    'desc_len',
    'desc_has_email',
    'desc_has_http',
    'desc_is_pro',
    'account_age_days',
]


def _tfidf_svd(max_features, n_components, ngram_range=(1, 1)):
    """Return a TF-IDF -> TruncatedSVD pipeline for a single text column."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=max_features,
                                  stop_words='english',
                                  ngram_range=ngram_range)),
        ('svd', TruncatedSVD(n_components=n_components, random_state=42)),
    ])


# Per-branch transformers.
# Numeric columns: median imputation, then standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: missing values become 0, categories unseen at fit
# time are encoded as -1.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Tweet text: uni+bi-gram TF-IDF (5000 terms) reduced to 50 SVD components.
tweet_text_transformer = _tfidf_svd(5000, 50, ngram_range=(1, 2))

# User description: unigram TF-IDF (1000 terms) reduced to 10 SVD components.
desc_text_transformer = _tfidf_svd(1000, 10)

# The text columns are passed as plain strings (not lists) so each vectorizer
# receives a 1-D sequence of documents.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('tweet_txt', tweet_text_transformer, tweet_text_col),
    ('desc_txt', desc_text_transformer, desc_text_col),
])

# Soft-voting ensemble of three tree-based classifiers (the V5 model).
_forest_kwargs = dict(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
clf1 = HistGradientBoostingClassifier(
    max_iter=300,
    learning_rate=0.05,
    max_depth=10,
    random_state=42,
)
clf2 = RandomForestClassifier(**_forest_kwargs)
clf3 = ExtraTreesClassifier(**_forest_kwargs)

voting_model = VotingClassifier(
    [('hgb', clf1), ('rf', clf2), ('et', clf3)],
    voting='soft',
)

# Full model: preprocessing followed by the voting ensemble.
model_tree = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', voting_model),
])

# 3. Fit the V5 tree ensemble on the full training frame.
print("Entra√Ænement du mod√®le Arbres en cours (Patientez ~1-2 min)...")
y = train_df['label']
# The whole frame is passed as X; the pipeline's ColumnTransformer selects
# only the columns it was configured with.
X = train_df
model_tree.fit(X, y)
print("Mod√®le Arbres entra√Æn√© !")

# 4. Tree-model probabilities on the test set.
print("G√©n√©ration des pr√©dictions Arbres...")
tree_test_proba = model_tree.predict_proba(test_df)
# Keep the second probability column (presumably the positive class, label 1).
probs_tree = tree_test_proba[:, 1]

# 5. Recover the Deep Learning probabilities.
print("R√©cup√©ration des pr√©dictions Deep Learning...")
# Reload the DL model written by the checkpoint callback just before the crash
# and rebuild its two test-set inputs (token sequences + scaled meta features).
try:
    model_dl_loaded = load_model("best_model_dl_opt.h5")

    # Redo the tokenization in case the variables were lost, so the sequence
    # input has the expected dimensions.
    # NOTE(review): Tokenizer and pad_sequences are not imported in this cell;
    # they are presumably in scope from an earlier cell -- confirm.
    MAX_WORDS = 15000
    MAX_LEN = 80
    tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_df['final_text'])  # important: fit on TRAIN only

    X_test_text_seq = pad_sequences(
        tokenizer.texts_to_sequences(test_df['final_text']),
        maxlen=MAX_LEN,
        padding='post',
    )

    # The DL model uses its own scaler for the meta features (separate from
    # the one inside the sklearn pipeline); it is fitted on TRAIN and applied
    # to TEST.
    dl_meta_cols = numeric_features + ['source_is_top']
    scaler_dl = StandardScaler()
    scaler_dl.fit(train_df[dl_meta_cols])
    X_test_meta_norm = scaler_dl.transform(test_df[dl_meta_cols])

    # DL prediction, flattened to a 1-D vector of probabilities.
    probs_dl = model_dl_loaded.predict(
        [X_test_text_seq, X_test_meta_norm], batch_size=32
    ).reshape(-1)
    print("Pr√©dictions DL r√©cup√©r√©es.")

except Exception as e:
    # Best-effort fallback: reuse probabilities left in memory by an earlier
    # run. If none exist, or they do not line up with the test set, stop here
    # with the original cause attached rather than failing later in the blend
    # with an unrelated NameError / shape error.
    print(f"Erreur lors du rechargement DL : {e}")
    print("Utilisation des probas DL en m√©moire si disponibles...")
    if 'probs_dl' not in globals():
        raise RuntimeError(
            "Deep Learning probabilities could not be rebuilt and no "
            "'probs_dl' is available in memory; cannot blend."
        ) from e
    if len(probs_dl) != len(test_df):
        raise RuntimeError(
            f"In-memory 'probs_dl' has {len(probs_dl)} entries but the test "
            f"set has {len(test_df)} rows; cannot blend."
        ) from e

# 6. Final blend of the two models and export of the submission file.
print("=== FUSION FINALE ===")

# Weighted average of the two probability vectors: 65% trees + 35% DL.
w_tree, w_dl = 0.65, 0.35
final_scores = (w_tree * probs_tree) + (w_dl * probs_dl)

# Scores strictly above the threshold become class 1, everything else class 0.
decision_threshold = 0.5
final_predictions = np.greater(final_scores, decision_threshold).astype(int)

# One row per test example: its challenge id and the predicted class.
submission_columns = {
    'ID': test_df['challenge_id'],
    'Prediction': final_predictions,
}
submission = pd.DataFrame(submission_columns)
submission_path = 'submission_FINAL_COLAB.csv'
submission.to_csv(submission_path, index=False)

# Closing banner telling the user where to find the file.
banner = "=" * 40
print("\n" + banner)
print("SUCC√àS TOTAL ! üöÄ")
print("T√©l√©chargez le fichier 'submission_FINAL_COLAB.csv' dans le menu de gauche.")
print(banner)

=== RÉPARATION : ENTRAÎNEMENT DU MODÈLE ARBRES (V5) SUR COLAB ===
Données trouvées en mémoire.
Configuration du Pipeline V5...
Entraînement du modèle Arbres en cours (Patientez ~1-2 min)...
Modèle Arbres entraîné !
Génération des prédictions Arbres...




Récupération des prédictions Deep Learning...
3231/3231 ━━━━━━━━━━━━━━━━━━━━ 14s 4ms/step
Prédictions DL récupérées.
=== FUSION FINALE ===

SUCCÈS TOTAL ! 🚀
Téléchargez le fichier 'submission_FINAL_COLAB.csv' dans le menu de gauche.
