In [1]:
# ==========================================
# 0. PACKAGE IMPORTS
# ==========================================

In [3]:
!pip install catboost
import pandas as pd
import numpy as np
import re
from pandas import json_normalize
from textblob import TextBlob # Nouvelle feature légère

# Modèles Mac-Compatible
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
# ==========================================
# 1. FEATURE ENGINEERING (ULTIME)
# ==========================================

In [5]:
def extract_source(source_html):
    if pd.isna(source_html): return "Unknown"
    match = re.search(r'>(.*?)<', str(source_html))
    return match.group(1) if match else "Unknown"

In [6]:
def get_sentiment(text):
    # Les bots sont souvent très neutres ou très positifs (spam commercial)
    # Les humains sont plus nuancés (négatifs, sarcastiques)
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return 0

In [7]:
def feature_engineering(df):
    print("... Feature Engineering ...")

    # Nettoyage
    target_cols = ['user.listed_count', 'user.favourites_count', 'user.statuses_count',
                   'quote_count', 'favorite_count', 'retweet_count', 'reply_count',
                   'user.followers_count', 'user.friends_count']
    for col in target_cols:
        if col not in df.columns: df[col] = 0
        df[col] = df[col].fillna(0)

    # Ratios
    df['log_followers'] = np.log1p(df['user.followers_count'])
    df['log_friends'] = np.log1p(df['user.friends_count'])
    df['ratio_log'] = df['log_followers'] - df['log_friends']
    df['log_listed'] = np.log1p(df['user.listed_count'])
    df['log_statuses'] = np.log1p(df['user.statuses_count'])
    df['activity_ratio'] = (df['user.favourites_count'] + 1) / (df['user.statuses_count'] + 1)

    # Temps
    if 'created_at' in df.columns and 'user.created_at' in df.columns:
        df['tweet_date'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True).dt.tz_localize(None)
        df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)
        df['account_age_days'] = (df['tweet_date'] - df['user_date']).dt.days
        df['account_age_days'] = df['account_age_days'].fillna(0)
    else:
        df['account_age_days'] = 0

    df['growth_rate'] = df['user.followers_count'] / (df['account_age_days'] + 1)

    # Text
    def get_clean_text(row):
        txt = str(row.get('text', ''))
        if 'extended_tweet.full_text' in row and not pd.isna(row['extended_tweet.full_text']):
            txt = str(row['extended_tweet.full_text'])
        return txt
    df['final_text'] = df.apply(get_clean_text, axis=1)

    # --- NOUVEAU : Sentiment Analysis ---
    df['sentiment'] = df['final_text'].apply(get_sentiment)

    # Text Stats
    df['lexical_diversity'] = df['final_text'].apply(lambda x: len(set(x.split())) / (len(x.split()) + 1))
    df['caps_ratio'] = df['final_text'].apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x)+1))
    df['length_char'] = df['final_text'].apply(len)
    df['num_hashtags'] = df['final_text'].apply(lambda x: x.count('#'))
    df['num_mentions'] = df['final_text'].apply(lambda x: x.count('@'))

    # Bio
    if 'user.description' not in df.columns: df['user.description'] = ""
    df['user_desc'] = df['user.description'].fillna("")
    df['desc_len'] = df['user_desc'].apply(len)

    # Defaults
    if 'user.default_profile_image' in df.columns:
        df['is_default_image'] = df['user.default_profile_image'].fillna(False).astype(int)
    else: df['is_default_image'] = 0

    if 'user.default_profile' in df.columns:
        df['is_default_profile'] = df['user.default_profile'].fillna(False).astype(int)
    else: df['is_default_profile'] = 0

    # Source
    if 'source' not in df.columns: df['source'] = ""
    df['source_clean'] = df['source'].apply(extract_source)
    top_sources = ['Twitter for iPhone', 'Twitter for Android', 'Twitter Web App', 'TweetDeck', 'Hootsuite', 'Buffer']
    df['source_category'] = df['source_clean'].apply(lambda x: x if x in top_sources else 'Other')

    return df

In [8]:
# ==========================================
# 2. PRÉPARATION DU PIPELINE
# ==========================================

In [9]:
def get_pipeline():
    numeric_features = [
        'user.listed_count', 'user.favourites_count', 'user.statuses_count',
        'log_listed', 'log_statuses', 'ratio_log', 'activity_ratio', 'growth_rate',
        'quote_count', 'favorite_count', 'retweet_count', 'reply_count',
        'caps_ratio', 'length_char', 'lexical_diversity', 'num_hashtags', 'num_mentions', 'sentiment', # <-- Sentiment added
        'desc_len', 'is_default_image', 'is_default_profile',
        'account_age_days'
    ]

    categorical_features = ['source_category']
    tweet_text_col = 'final_text'
    desc_text_col = 'user_desc'

    # Transformers
    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='Other')),
                                              ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

    # TF-IDF reduit pour aller plus vite pendant le pseudo-labeling
    tweet_text_transformer = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(max_features=3000, stop_words='english', ngram_range=(1,2))),
        ('svd', TruncatedSVD(n_components=30, random_state=42))
    ])

    desc_text_transformer = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(max_features=500, stop_words='english')),
        ('svd', TruncatedSVD(n_components=5, random_state=42))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
            ('tweet_txt', tweet_text_transformer, tweet_text_col),
            ('desc_txt', desc_text_transformer, desc_text_col)
        ]
    )

    # Stack Mac-Safe
    estimators = [
        ('hgb', HistGradientBoostingClassifier(max_iter=200, learning_rate=0.05, max_depth=10, random_state=42)),
        ('cat', CatBoostClassifier(iterations=200, depth=8, learning_rate=0.05, verbose=0, random_seed=42)),
        ('rf', RandomForestClassifier(n_estimators=150, max_depth=15, random_state=42, n_jobs=-1)),
        ('et', ExtraTreesClassifier(n_estimators=150, max_depth=15, random_state=42, n_jobs=-1))
    ]

    stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5, n_jobs=-1)

    return Pipeline(steps=[('preprocessor', preprocessor), ('classifier', stacking_clf)])

In [10]:
# ==========================================
# 3. EXÉCUTION : PSEUDO-LABELING
# ==========================================

In [11]:
def run_pseudo_labeling():
    print("--- 1. Chargement & Engineering ---")
    train_df = pd.read_json('train.jsonl', lines=True)
    test_df = pd.read_json('kaggle_test.jsonl', lines=True)

    train_df = json_normalize(train_df.to_dict(orient='records'))
    test_df = json_normalize(test_df.to_dict(orient='records'))

    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    X_train = train_df
    y_train = train_df['label']
    X_test = test_df

    # --- ROUND 1 : Entraînement Initial ---
    print("\n--- 2. Entraînement Initial (Round 1) ---")
    model = get_pipeline()
    model.fit(X_train, y_train)

    # Prédiction des probabilités sur le test
    print("Génération des Pseudo-Labels...")
    probs = model.predict_proba(X_test) # [[prob_0, prob_1], ...]

    # --- PSEUDO LABELING LOGIC ---
    # On prend les prédictions où le modèle est sûr à > 98%
    # On crée un DataFrame temporaire
    pseudo_labels = []

    # Probabilité d'être un BOT (Classe 1) > 0.98
    high_conf_bots = np.where(probs[:, 1] > 0.98)[0]
    # Probabilité d'être HUMAIN (Classe 0) > 0.98 (donc prob bot < 0.02)
    high_conf_humans = np.where(probs[:, 1] < 0.02)[0]

    print(f"Trouvé {len(high_conf_bots)} Bots 'sûrs' et {len(high_conf_humans)} Humains 'sûrs' dans le Test Set.")

    # Création du dataset augmenté
    # On prend les lignes du test set correspondantes
    test_bots = X_test.iloc[high_conf_bots].copy()
    test_bots['label'] = 1 # On force le label

    test_humans = X_test.iloc[high_conf_humans].copy()
    test_humans['label'] = 0 # On force le label

    # Fusion avec le train set original
    X_train_augmented = pd.concat([X_train, test_bots, test_humans], axis=0)
    # Important: mettre à jour y_train aussi (il est dans le dataframe X_train_augmented maintenant)
    y_train_augmented = X_train_augmented['label']

    print(f"Taille Training Original : {len(X_train)}")
    print(f"Taille Training Augmenté : {len(X_train_augmented)} (+{len(X_train_augmented)-len(X_train)})")

    # --- ROUND 2 : Entraînement Final sur données augmentées ---
    print("\n--- 3. Entraînement Final (Round 2 - Pseudo Labeling) ---")
    # On recrée un modèle neuf
    final_model = get_pipeline()
    final_model.fit(X_train_augmented, y_train_augmented)

    print("--- 4. Prédiction Finale ---")
    final_predictions = final_model.predict(X_test)

    submission = pd.DataFrame({'ID': test_df['challenge_id'], 'Prediction': final_predictions})
    submission.to_csv('Prediction_pseudo_labeling.csv', index=False)

    print("Fichier 'Prediction_pseudo_labeling.csv' prêt.")

In [12]:
if __name__ == "__main__":
    run_pseudo_labeling()

--- 1. Chargement & Engineering ---
... Feature Engineering ...


  df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)


... Feature Engineering ...


  df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)



--- 2. Entraînement Initial (Round 1) ---
Génération des Pseudo-Labels...
Trouvé 0 Bots 'sûrs' et 0 Humains 'sûrs' dans le Test Set.
Taille Training Original : 154914
Taille Training Augmenté : 154914 (+0)

--- 3. Entraînement Final (Round 2 - Pseudo Labeling) ---
--- 4. Prédiction Finale ---
Fichier 'Prediction_pseudo_labeling.csv' prêt.
