In [None]:
# ==========================================
# 0. PACKAGE IMPORTS
# ==========================================

In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
import pandas as pd
import numpy as np
import re
from pandas import json_normalize
from textblob import TextBlob

# Modèles
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation

In [None]:
# ==========================================
# 1. FEATURE ENGINEERING
# ==========================================

In [4]:
def extract_source(source_html):
    """Return the text found between the first ``>`` and the next ``<`` of a source string.

    The input is converted with ``str()`` before matching. Missing values (as
    reported by ``pd.isna``) and inputs containing no ``>...<`` pair both yield
    the placeholder ``"Unknown"``.
    """
    if not pd.isna(source_html):
        found = re.search(r'>(.*?)<', str(source_html))
        if found is not None:
            return found.group(1)
    return "Unknown"

In [5]:
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``, or 0.0 when it cannot be scored.

    This is a best-effort feature: any ordinary error raised while building or
    scoring the blob (for example a non-string input) is treated as a neutral
    score. Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt``
    and ``SystemExit`` still propagate and a long ``.apply`` can be interrupted.

    Args:
        text: The text to score.

    Returns:
        The value of ``TextBlob(text).sentiment.polarity``, or 0.0 on failure.
    """
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        # Unscorable input is deliberately mapped to a neutral score.
        return 0.0

In [6]:
def feature_engineering(df):
    """Add the derived model features to a flattened tweet DataFrame.

    The frame is modified in place (columns are added, missing count columns are
    created) and the same object is returned. Relies on the sibling helpers
    ``get_sentiment`` and ``extract_source``.

    Features added:
      * ratios / logs: ``reputation_score``, ``log_followers``, ``log_friends``,
        ``log_listed``, ``log_statuses``, ``ratio_log``, ``activity_ratio``
      * time: ``tweet_date``, ``user_date`` (only when both date columns exist),
        ``account_age_days``, ``growth_rate``
      * text: ``final_text``, ``sentiment``, ``lexical_diversity``, ``caps_ratio``,
        ``length_char``, ``num_hashtags``, ``num_mentions``
      * profile: ``name_digits``, ``name_len``, ``user_desc``, ``desc_len``,
        ``is_default_image``, ``is_default_profile``
      * source: ``source_clean``, ``source_category``

    Args:
        df: DataFrame using dotted column names such as ``user.followers_count``.
            Every input column is optional; absent ones get neutral defaults.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    print("... Feature Engineering ...")

    # Cleaning: every count column must exist and contain no missing values.
    count_cols = ['user.listed_count', 'user.favourites_count', 'user.statuses_count',
                  'quote_count', 'favorite_count', 'retweet_count', 'reply_count',
                  'user.followers_count', 'user.friends_count']
    for col in count_cols:
        if col not in df.columns:
            df[col] = 0
        df[col] = df[col].fillna(0)

    # --- Ratios ---
    df['reputation_score'] = df['user.listed_count'] / (df['user.followers_count'] + 1)

    # Ratios & logs
    df['log_followers'] = np.log1p(df['user.followers_count'])
    df['log_friends'] = np.log1p(df['user.friends_count'])
    df['log_listed'] = np.log1p(df['user.listed_count'])
    df['log_statuses'] = np.log1p(df['user.statuses_count'])
    df['ratio_log'] = df['log_followers'] - df['log_friends']
    df['activity_ratio'] = (df['user.favourites_count'] + 1) / (df['user.statuses_count'] + 1)

    # Time & growth
    if 'created_at' in df.columns and 'user.created_at' in df.columns:
        df['tweet_date'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True).dt.tz_localize(None)
        df['user_date'] = pd.to_datetime(df['user.created_at'], errors='coerce', utc=True).dt.tz_localize(None)
        age_days = (df['tweet_date'] - df['user_date']).dt.days.fillna(0)
        # A tweet stamped before the account creation gives a negative age; an
        # age of -1 would make growth_rate divide by zero (inf), which the
        # downstream imputer/scaler cannot handle. Clamp to zero days.
        df['account_age_days'] = age_days.clip(lower=0)
    else:
        df['account_age_days'] = 0

    df['growth_rate'] = df['user.followers_count'] / (df['account_age_days'] + 1)

    # Text: prefer the extended tweet text when present, else the short text.
    # Missing text becomes an empty string (not the literal "nan").
    if 'text' in df.columns:
        short_text = df['text'].astype(object)
    else:
        short_text = pd.Series('', index=df.index, dtype=object)
    if 'extended_tweet.full_text' in df.columns:
        long_text = df['extended_tweet.full_text'].astype(object)
        chosen_text = long_text.where(long_text.notna(), short_text)
    else:
        chosen_text = short_text
    df['final_text'] = chosen_text.fillna('').astype(str)

    # Psycholinguistic features
    def _lexical_diversity(text):
        words = text.split()
        return len(set(words)) / (len(words) + 1)

    df['sentiment'] = df['final_text'].apply(get_sentiment)
    df['lexical_diversity'] = df['final_text'].apply(_lexical_diversity)
    df['caps_ratio'] = df['final_text'].apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x) + 1))
    df['length_char'] = df['final_text'].apply(len)
    df['num_hashtags'] = df['final_text'].apply(lambda x: x.count('#'))
    df['num_mentions'] = df['final_text'].apply(lambda x: x.count('@'))

    # Naming patterns
    if 'user.screen_name' in df.columns:
        screen_names = df['user.screen_name'].fillna("")
        df['name_digits'] = screen_names.apply(lambda x: sum(c.isdigit() for c in str(x)))
        df['name_len'] = screen_names.apply(lambda x: len(str(x)))
    else:
        df['name_digits'] = 0
        df['name_len'] = 0

    # Profile & bio
    if 'user.description' not in df.columns:
        df['user.description'] = ""
    df['user_desc'] = df['user.description'].fillna("")
    df['desc_len'] = df['user_desc'].apply(len)

    # Default profile image / default profile flags
    if 'user.default_profile_image' in df.columns:
        df['is_default_image'] = df['user.default_profile_image'].fillna(False).astype(int)
    else:
        df['is_default_image'] = 0

    if 'user.default_profile' in df.columns:
        df['is_default_profile'] = df['user.default_profile'].fillna(False).astype(int)
    else:
        df['is_default_profile'] = 0

    # Source: keep a short list of client names, bucket everything else as 'Other'.
    if 'source' not in df.columns:
        df['source'] = ""
    df['source_clean'] = df['source'].apply(extract_source)
    top_sources = ['Twitter for iPhone', 'Twitter for Android', 'Twitter Web App', 'TweetDeck', 'Hootsuite', 'Buffer']
    df['source_category'] = df['source_clean'].where(df['source_clean'].isin(top_sources), 'Other')

    return df


In [7]:
# ==========================================
# 2. PIPELINE
# ==========================================

In [8]:
def get_pipeline():
    """Build the (unfitted) preprocessing + stacking-classifier pipeline.

    The preprocessor feeds five branches into a ColumnTransformer: scaled
    numeric features, an ordinal-encoded source category, TF-IDF + SVD and
    bag-of-words + LDA representations of the tweet text, and TF-IDF + SVD of
    the user description. The classifier stacks CatBoost, histogram gradient
    boosting and a random forest under a logistic-regression meta-learner.

    Returns:
        A ``Pipeline`` with the steps ``'preprocessor'`` and ``'classifier'``.
    """
    seed = 42

    def _tfidf_svd(vocab_size, n_components, **tfidf_options):
        # TF-IDF vectorisation followed by a truncated-SVD projection.
        return Pipeline([
            ('tfidf', TfidfVectorizer(max_features=vocab_size, stop_words='english', **tfidf_options)),
            ('svd', TruncatedSVD(n_components=n_components, random_state=seed)),
        ])

    # Columns consumed by each branch of the preprocessor.
    num_cols = [
        'reputation_score', 'growth_rate', 'name_digits', 'name_len',
        'log_listed', 'log_statuses', 'ratio_log', 'activity_ratio',
        'quote_count', 'favorite_count', 'retweet_count', 'reply_count',
        'caps_ratio', 'length_char', 'sentiment', 'lexical_diversity',
        'num_hashtags', 'num_mentions',
        'desc_len', 'is_default_image', 'is_default_profile',
        'account_age_days',
    ]
    cat_cols = ['source_category']
    tweet_col = 'final_text'
    bio_col = 'user_desc'

    # Tabular branches: median-impute + robust-scale numerics; constant-impute
    # + ordinal-encode the category (unseen categories map to -1).
    num_branch = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ])
    cat_branch = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='Other')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])

    # Text branches. The LDA topic branch was added by the author with the
    # stated aim of helping to detect spam.
    tweet_structure = _tfidf_svd(4000, 40, ngram_range=(1, 2))
    tweet_topics = Pipeline([
        ('count', CountVectorizer(max_features=4000, stop_words='english')),
        ('lda', LatentDirichletAllocation(n_components=10, random_state=seed)),
    ])
    bio_structure = _tfidf_svd(1000, 10)

    branches = [
        ('num', num_branch, num_cols),
        ('cat', cat_branch, cat_cols),
        ('txt_svd', tweet_structure, tweet_col),
        ('txt_lda', tweet_topics, tweet_col),
        ('desc_svd', bio_structure, bio_col),
    ]
    features = ColumnTransformer(transformers=branches)

    # Stacked ensemble of the base learners, blended by logistic regression.
    base_learners = [
        ('cat', CatBoostClassifier(iterations=600, depth=8, learning_rate=0.03, verbose=0, random_seed=seed)),
        ('hgb', HistGradientBoostingClassifier(max_iter=300, max_depth=12, learning_rate=0.05, random_state=seed)),
        ('rf', RandomForestClassifier(n_estimators=300, max_depth=15, n_jobs=-1, random_state=seed)),
    ]
    ensemble = StackingClassifier(
        estimators=base_learners,
        final_estimator=LogisticRegression(),
        cv=5,
        n_jobs=-1,
    )

    return Pipeline([('preprocessor', features), ('classifier', ensemble)])


In [9]:
# ==========================================
# 3. EXÉCUTION
# ==========================================

In [12]:
def run(train_path='train.jsonl', test_path='kaggle_test.jsonl',
        output_path='Prediction_Sasageyo.csv'):
    """Train, pseudo-label, retrain and write the submission file.

    Workflow:
      1. Load both JSON-lines files, flatten nested fields and build features.
      2. Fit a first pipeline on the training set.
      3. Pseudo-label the test rows predicted with very high confidence and
         append them to the training set.
      4. Fit a fresh pipeline on the augmented set and predict the test set.
      5. Apply rule-based decision thresholds and write the CSV submission.

    Args:
        train_path: JSON-lines training file; must contain a ``label`` column.
        test_path: JSON-lines test file; must contain a ``challenge_id`` column.
        output_path: Destination of the CSV submission (columns ``ID``, ``Prediction``).
    """
    # Pseudo-labeling keeps only test rows whose class-1 probability is beyond these bounds.
    high_conf = 0.98
    low_conf = 0.02

    print("--- CHARGEMENT ---")
    train_df = pd.read_json(train_path, lines=True)
    test_df = pd.read_json(test_path, lines=True)

    # Flatten nested objects into dotted column names (e.g. 'user.followers_count').
    train_df = json_normalize(train_df.to_dict(orient='records'))
    test_df = json_normalize(test_df.to_dict(orient='records'))

    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # --- 1. Initial training ---
    # The whole frame is passed as X; the pipeline's ColumnTransformer selects
    # only its declared feature columns.
    print("\n--- ENTRAINEMENT INITIAL (Round 1) ---")
    model = get_pipeline()
    model.fit(train_df, train_df['label'])

    # --- 2. Pseudo-labeling ---
    print("\n--- PSEUDO-LABELING ---")
    probs = model.predict_proba(test_df)[:, 1]

    # Label 1 = "Observers", label 0 = "Influenceurs" (naming used by the log message).
    observer_mask = probs > high_conf
    influencer_mask = probs < low_conf

    print(f"Ajout de {int(observer_mask.sum())} Observers et {int(influencer_mask.sum())} Influenceurs au training set.")

    test_observers = test_df[observer_mask].assign(label=1)
    test_influencers = test_df[influencer_mask].assign(label=0)

    # Augmented dataset; ignore_index avoids duplicate index labels coming
    # from the train and test frames.
    X_train_augmented = pd.concat([train_df, test_observers, test_influencers],
                                  axis=0, ignore_index=True)
    y_train_augmented = X_train_augmented['label']

    # --- 3. Final training ---
    print("\n--- ENTRAINEMENT FINAL (Round 2) ---")
    final_model = get_pipeline()
    final_model.fit(X_train_augmented, y_train_augmented)

    # --- 4. Post-processing ---
    print("\n--- PREDICTION ---")
    probs_final = final_model.predict_proba(test_df)[:, 1]

    # Per-row decision threshold; the first matching rule wins:
    #   * near-zero reputation with many followers -> 0.2
    #   * default profile image                     -> 0.4
    #   * otherwise                                 -> 0.5
    low_reputation_big_audience = (
        (test_df['reputation_score'] < 0.0001) & (test_df['user.followers_count'] > 2000)
    ).to_numpy()
    default_image = (test_df['is_default_image'] == 1).to_numpy()
    thresholds = np.select([low_reputation_big_audience, default_image], [0.2, 0.4], default=0.5)
    final_preds = (probs_final > thresholds).astype(int)

    submission = pd.DataFrame({'ID': test_df['challenge_id'], 'Prediction': final_preds})
    submission.to_csv(output_path, index=False)
    print(f"Fichier '{output_path}' généré.")

In [None]:
# Entry point: launch the full load / train / pseudo-label / predict workflow
# when this code is executed as the main module.
if __name__ == "__main__":
    run()