In [1]:
# ==========================================
# 0. PACKAGE IMPORTS
# ==========================================

import pandas as pd
import numpy as np
import json
from pandas import json_normalize
import re

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [2]:
# ==========================================
# 1. FEATURE ENGINEERING
# ==========================================

In [3]:
def extract_source(source_html):
    """Extract the client name from the HTML anchor in a tweet's ``source`` field.

    Parameters
    ----------
    source_html : str or missing value
        Raw ``source`` value, e.g.
        ``'<a href="..." rel="nofollow">Twitter for iPhone</a>'``.

    Returns
    -------
    str
        The text found between the first ``>`` and the following ``<``, with
        surrounding whitespace removed. ``"Unknown"`` is returned when the
        value is missing, contains no such pair, or the enclosed text is empty.
    """
    if pd.isna(source_html):
        return "Unknown"

    # Non-greedy capture of whatever sits between the first '>' and the next '<'.
    match = re.search(r'>(.*?)<', str(source_html))
    if match is None:
        return "Unknown"

    # An empty or whitespace-only anchor carries no usable name, so treat it
    # like a missing source instead of returning an empty string.
    name = match.group(1).strip()
    return name or "Unknown"

In [4]:
def feature_engineering(df):
    """Add user, tweet-metadata and text features to a flattened tweet frame.

    The frame is modified in place and the same object is returned. Every
    input column is optional: a missing numeric column is created as 0, a
    missing ``source`` is treated as "Unknown", missing entity lists count as
    0 and missing text becomes the empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Flattened tweets using dotted column names such as
        ``user.followers_count`` or ``entities.hashtags``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns added or filled:
        ``log_followers``, ``log_friends``, ``ratio_log``,
        ``user_listed_ratio``, ``source_clean``, ``source_category``,
        ``num_hashtags``, ``num_urls``, ``num_mentions``, ``retweet_count``,
        ``favorite_count``, ``final_text`` and ``text_len``.
    """
    print("--- Feature Generation ---")

    # --- A. User Metadata ---
    # Replace NaN with 0 for numerical user features; create absent columns as 0.
    num_cols = ['user.followers_count', 'user.friends_count', 'user.listed_count',
                'user.favourites_count', 'user.statuses_count']
    for col in num_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)
        else:
            df[col] = 0

    # 1. Advantage Ratio (Followers vs Friends)
    # log1p tames the skew between very large and very small accounts.
    df['log_followers'] = np.log1p(df['user.followers_count'])
    df['log_friends'] = np.log1p(df['user.friends_count'])
    # Equals log((followers + 1) / (friends + 1)); the +1 comes from log1p.
    df['ratio_log'] = df['log_followers'] - df['log_friends']

    # 2. Listed Ratio
    # 'listed_count' relative to audience size; +1 avoids division by zero.
    df['user_listed_ratio'] = df['user.listed_count'] / (df['user.followers_count'] + 1)

    # --- B. Tweet Metadata (Behavior) ---

    # 3. Tweet Source (Professional Outlet vs Mobile User)
    # Guarded like the other optional columns so a frame without 'source'
    # does not raise KeyError.
    if 'source' in df.columns:
        df['source_clean'] = df['source'].apply(extract_source)
    else:
        df['source_clean'] = "Unknown"

    # Keep the most common clients as their own category, bucket the rest.
    # NOTE(review): matching is exact, so 'iPad' only matches a client literally
    # named "iPad"; the real client string is presumably "Twitter for iPad" -
    # confirm against the data before changing the list.
    top_sources = ['Twitter for iPhone', 'Twitter for Android', 'Twitter Web App',
                   'TweetDeck', 'iPad', 'Hootsuite', 'Buffer']
    is_top_source = df['source_clean'].isin(top_sources)
    df['source_category'] = df['source_clean'].where(is_top_source, 'Other')

    # 4. Content Richness (Entities)
    def count_entities(entity_list):
        # Anything that is not a list (NaN, None, ...) counts as no entities.
        if isinstance(entity_list, list):
            return len(entity_list)
        return 0

    # Count hashtags, URLs, and mentions.
    entity_columns = {
        'num_hashtags': 'entities.hashtags',
        'num_urls': 'entities.urls',
        'num_mentions': 'entities.user_mentions',
    }
    for feature_name, entity_col in entity_columns.items():
        if entity_col in df.columns:
            df[feature_name] = df[entity_col].apply(count_entities)
        else:
            df[feature_name] = 0

    # 5. Virality Indicators
    # Missing counts (or missing columns) are treated as 0.
    for col in ('retweet_count', 'favorite_count'):
        if col in df.columns:
            df[col] = df[col].fillna(0)
        else:
            df[col] = 0

    # --- C. Text Cleaning ---
    # Prefer the extended full text when present, otherwise fall back to
    # 'text'. Selection is done column-wise instead of with a row-wise apply.
    if 'text' in df.columns:
        chosen = df['text'].to_numpy(dtype=object)
    else:
        chosen = np.full(len(df), None, dtype=object)

    if 'extended_tweet.full_text' in df.columns:
        full_text = df['extended_tweet.full_text']
        chosen = np.where(full_text.notna().to_numpy(),
                          full_text.to_numpy(dtype=object),
                          chosen)

    # A missing text becomes '' rather than the literal string "nan"/"None",
    # which would otherwise show up as a token and as a non-zero length.
    text_series = pd.Series(chosen, index=df.index, dtype=object)
    df['final_text'] = text_series.map(lambda value: '' if pd.isna(value) else str(value))
    df['text_len'] = df['final_text'].str.len()  # Tweet length as a feature

    return df

In [5]:
# ==========================================
# 2. PIPELINE
# ==========================================

In [6]:
def _build_model(numeric_features, categorical_features, text_feature):
    """Assemble the preprocessing + HistGradientBoosting pipeline (unfitted)."""
    # 1. Imputation and scaling for numerical features.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # 2. Ordinal encoding for categorical features. Categories unseen during
    # fit are encoded as -1. The codes reach the classifier as ordinary
    # numeric columns (no `categorical_features` is set on the classifier).
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Other')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # 3. Text: TF-IDF + SVD (dimension reduction).
    # HistGradientBoosting has no sparse input support, so the sparse TF-IDF
    # matrix is projected down to 50 dense components.
    text_transformer = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))),
        ('svd', TruncatedSVD(n_components=50, random_state=42))
    ])

    # Only the listed columns are used; ColumnTransformer drops every other
    # column by default, so extra columns in the input frame are ignored.
    # `text_feature` is passed as a plain string so the vectorizer receives a
    # 1-D column of documents.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
            ('txt', text_transformer, text_feature)
        ]
    )

    # Model: HistGradientBoostingClassifier (similar to LightGBM).
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', HistGradientBoostingClassifier(
            max_iter=200,           # Maximum number of boosting iterations
            learning_rate=0.1,      # Learning rate
            max_depth=10,           # Max depth of each tree
            random_state=42,
            # Metric monitored for early stopping; per the scikit-learn docs
            # it is only consulted when early stopping is performed.
            scoring='accuracy'
        ))
    ])


def run(train_path='train.jsonl', test_path='kaggle_test.jsonl', output_path='Prediction.csv'):
    """Train the classifier, report 5-fold CV accuracy and write a submission.

    Parameters
    ----------
    train_path : str
        JSON-lines file with the training tweets; must contain a ``label`` field.
    test_path : str
        JSON-lines file with the tweets to predict; must contain ``challenge_id``.
    output_path : str
        Destination CSV with the columns ``ID`` and ``Prediction``.

    Returns
    -------
    None
        Progress and CV scores are printed; predictions are written to
        ``output_path``.
    """
    print("Data loading...")
    train_df = pd.read_json(train_path, lines=True)
    test_df = pd.read_json(test_path, lines=True)

    # Nested objects (user, entities, ...) become dotted columns such as
    # 'user.followers_count', which feature_engineering expects.
    print("JSON Flattening...")
    train_df = json_normalize(train_df.to_dict(orient='records'))
    test_df = json_normalize(test_df.to_dict(orient='records'))

    # Feature Engineering
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # Columns for Modeling
    numeric_features = [
        'user.followers_count', 'user.friends_count', 'user.listed_count',
        'user.favourites_count', 'user.statuses_count',
        'ratio_log', 'user_listed_ratio',
        'retweet_count', 'favorite_count',
        'num_hashtags', 'num_urls', 'num_mentions', 'text_len'
    ]

    # Categorical Features
    categorical_features = ['source_category']

    # Text Feature
    text_feature = 'final_text'

    model = _build_model(numeric_features, categorical_features, text_feature)

    # --- Training and Validation ---
    # The whole frame (including 'label') is passed as X; the preprocessor
    # selects only the feature columns listed above.
    X = train_df
    y = train_df['label']

    print("-" * 30)
    print("Cross-Validation (5-Fold)...")
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kfold, scoring='accuracy', n_jobs=-1)

    print(f"CV Scores: {scores}")
    print(f"CV Mean: {np.mean(scores)*100:.2f}% (+/- {np.std(scores)*100:.2f}%)")
    print("-" * 30)

    # Final Training on the full training set
    print("Final Training...")
    model.fit(X, y)

    # Prediction
    print("Prediction on the test set...")
    predictions = model.predict(test_df)

    # Save Submission
    submission = pd.DataFrame({
        'ID': test_df['challenge_id'],
        'Prediction': predictions
    })
    submission.to_csv(output_path, index=False)
    print(f"Finished! File '{output_path}' is completed.")


In [7]:
# Entry point: runs the full pipeline (load, feature engineering, CV, final fit,
# submission file) when this cell/script is executed with __name__ == "__main__".
if __name__ == "__main__":
    run()

Data loading...
JSON Flattening...
--- Feature Generation ---
--- Feature Generation ---
------------------------------
Cross-Validation (5-Fold)...
CV Scores: [0.82632411 0.82419391 0.8258077  0.8247426  0.81966949]
CV Mean: 82.41% (+/- 0.24%)
------------------------------
Final Training...
Prediction on the test set...
Finished! File 'Prediction.csv' is completed.
