In [13]:
# ==========================================
# 0. PACKAGE IMPORTS
# ==========================================

In [14]:
import pandas as pd
import numpy as np
import json
import re
from pandas import json_normalize

# Scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [15]:
# ==========================================
# 1. FEATURE ENGINEERING
# ==========================================

In [16]:
def count_regex(pattern, text):
    """Return how many non-overlapping matches of `pattern` occur in `text`.

    Anything that is not a `str` (None, NaN, numbers, ...) counts as zero
    matches instead of raising.
    """
    return len(re.findall(pattern, text)) if isinstance(text, str) else 0

In [17]:
def extract_source(source_html):
    """Return the text found between the first '>' and the next '<' of `source_html`.

    Missing values (as judged by `pd.isna`) and inputs without such a
    '>...<' span both yield the placeholder "Unknown".
    """
    if pd.isna(source_html):
        return "Unknown"

    found = re.search(r'>(.*?)<', str(source_html))
    if found is None:
        return "Unknown"
    return found.group(1)

In [None]:
# ==========================================
# 1.2 TEXT ANALYSIS
# ==========================================

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel

In [None]:
class TweetDataset(Dataset):
    """
    PyTorch dataset serving tokenized tweets stored in a pandas DataFrame.

    - df : DataFrame holding at least the text column, and optionally a label column
    - tokenizer : callable invoked as tokenizer(text, truncation=..., padding=...,
      max_length=..., return_tensors="pt"); its result is indexed with
      "input_ids" and "attention_mask"
    - text_col : name of the column holding the tweet text
    - label_col : name of the label column (None when there are no labels)
    - max_length : forwarded to the tokenizer as `max_length`
    """
    def __init__(self, df, tokenizer, text_col="full_text", label_col="label", max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.text_col = text_col
        self.label_col = label_col
        self.max_length = max_length
        # Labels are served only when a label column was requested AND exists in df.
        self.has_labels = (label_col is not None) and (label_col in df.columns)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional row lookup; the text is coerced to str before tokenization.
        row = self.df.iloc[idx]

        encoded = self.tokenizer(
            str(row[self.text_col]),
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) drops the leading dimension of the tokenizer output.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }

        if self.has_labels:
            sample["labels"] = torch.tensor(int(row[self.label_col]), dtype=torch.long)

        return sample



class ClassificationHead(nn.Module):
    """
    Two-layer classification head: dropout -> linear -> activation -> dropout -> linear.

    - input_dim : size of the incoming feature vector
    - num_labels : number of output logits
    - hidden_dim : width of the intermediate layer (defaults to input_dim)
    - dropout : dropout probability, applied before each linear layer
    - activation : "gelu" or "relu"; anything else raises ValueError
    """
    def __init__(
        self,
        input_dim: int,
        num_labels: int,
        hidden_dim: int = None,
        dropout: float = 0.1,
        activation: str = "gelu",
    ):
        super().__init__()

        width = input_dim if hidden_dim is None else hidden_dim

        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(input_dim, width)

        # Pick the non-linearity by name; the for/else raises when nothing matched.
        for known_name, factory in (("gelu", nn.GELU), ("relu", nn.ReLU)):
            if activation == known_name:
                self.activation = factory()
                break
        else:
            raise ValueError(f"Activation inconnue: {activation}")

        self.linear2 = nn.Linear(width, num_labels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.activation(self.linear1(self.dropout(x)))
        return self.linear2(self.dropout(hidden))


class CamembertForCustomClassification(nn.Module):
    """
    Pretrained transformer backbone (loaded with AutoModel) + ClassificationHead.

    - model_name : identifier passed to AutoModel.from_pretrained
    - num_labels : number of output logits
    - dropout, hidden_dim, activation : forwarded to ClassificationHead
    - pooling : 'cls' (hidden state at sequence position 0) or
                'mean' (attention-mask-weighted average over the sequence)
    """
    def __init__(
        self,
        model_name: str,
        num_labels: int = 2,
        dropout: float = 0.1,
        pooling: str = "cls",      # 'cls' or 'mean'
        hidden_dim: int = None,
        activation: str = "gelu",
    ):
        super().__init__()

        self.backbone = AutoModel.from_pretrained(model_name)
        # The head's input width follows the backbone's hidden size.
        hidden_size = self.backbone.config.hidden_size

        self.pooling = pooling
        self.num_labels = num_labels

        self.classifier = ClassificationHead(
            input_dim=hidden_size,
            num_labels=num_labels,
            hidden_dim=hidden_dim,
            dropout=dropout,
            activation=activation,
        )

        self.loss_fn = nn.CrossEntropyLoss()

    def _pool(self, last_hidden_state: torch.Tensor, attention_mask: torch.Tensor):
        """Reduce per-position hidden states to one vector per sequence.

        Raises:
            ValueError: if `self.pooling` is neither 'cls' nor 'mean'.
        """
        if self.pooling == "cls":
            return last_hidden_state[:, 0, :]
        elif self.pooling == "mean":
            if attention_mask is None:
                # forward() allows attention_mask=None; without a mask every
                # position contributes equally, so a plain mean is used
                # (previously this path crashed on None.unsqueeze).
                return last_hidden_state.mean(dim=1)
            mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size())
            masked = last_hidden_state * mask
            summed = masked.sum(dim=1)
            # clamp avoids a division by zero for an all-zero mask row.
            counts = mask.sum(dim=1).clamp(min=1)
            return summed / counts
        else:
            raise ValueError(f"Pooling inconnue: {self.pooling}")

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor = None,
        labels: torch.Tensor = None,
    ):
        """Run backbone, pooling and head.

        Returns:
            dict with "logits" (head output) and "loss" (cross-entropy against
            `labels`, or None when `labels` is not given).
        """
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        last_hidden_state = outputs.last_hidden_state
        pooled = self._pool(last_hidden_state, attention_mask)

        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"logits": logits, "loss": loss}

In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The same hub identifier is used for both the tokenizer and the backbone.
model_name = "Yanzhu/bertweetfr-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Rebuild the architecture, then restore the weights stored in Model1.pth.
model = CamembertForCustomClassification(model_name=model_name, num_labels=2)
# NOTE(review): depending on the torch version / `weights_only` setting,
# torch.load may unpickle arbitrary objects — only load trusted checkpoints.
model.load_state_dict(torch.load("Model1.pth", map_location=device))
# Module.to returns the module itself, so chaining keeps the cell's displayed value.
model.to(device).eval()

In [None]:
import torch
import pandas as pd
from tqdm.auto import tqdm

def predict_from_text(model, data_loader, device):
    """
    Run inference over `data_loader` and collect the class-1 probabilities.

    model : module called as model(input_ids=..., attention_mask=...) whose
            result exposes the logits under the "logits" key
    data_loader : iterable of batches providing "input_ids" and "attention_mask"
    device : device the model and each batch are moved to

    Returns a DataFrame with a single "Prediction" column holding, in iteration
    order, softmax(logits)[:, 1] for every sample. Nothing is written to disk.
    """
    model.to(device)
    model.eval()

    positive_scores = []

    # Gradients are not needed for inference.
    with torch.no_grad():
        for batch in tqdm(data_loader, leave=False):
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )["logits"]

            # Keep only the probability assigned to class index 1.
            positive_scores += torch.softmax(logits, dim=1)[:, 1].cpu().tolist()

    return pd.DataFrame({"Prediction": positive_scores})

In [None]:
def feature_engineering(df):
    """Add hand-crafted and model-based feature columns to a flattened tweet frame.

    The DataFrame is modified in place and also returned. Columns use the dotted
    names of a flattened record (e.g. 'user.followers_count').

    Relies on names defined elsewhere in this notebook: `extract_source`,
    `count_regex`, `TweetDataset`, `predict_from_text`, and the module-level
    `tokenizer`, `model` and `device`.

    Added columns: log_followers, log_friends, ratio_log, user_listed_ratio,
    user_desc, desc_len, desc_has_email, desc_has_link, desc_is_pro,
    name_digits_count, name_len, final_text, caps_ratio, avg_word_len,
    exclamation_count, source_clean, source_category, num_hashtags, num_urls,
    full_text, text_analysis_pred.

    Args:
        df: DataFrame of tweets; must contain 'source'. Other inputs are
            created or defaulted when absent.

    Returns:
        The same DataFrame with the feature columns added.
    """
    print("--- Génération des features ---")

    # --- A. Basic cleaning ---
    # Missing numeric columns are created, and missing values become 0.
    num_cols = ['user.followers_count', 'user.friends_count', 'user.listed_count',
                'user.favourites_count', 'user.statuses_count']
    for col in num_cols:
        if col not in df.columns:
            df[col] = 0
        df[col] = df[col].fillna(0)

    # --- B. Log ratio ---
    df['log_followers'] = np.log1p(df['user.followers_count'])
    df['log_friends'] = np.log1p(df['user.friends_count'])
    df['ratio_log'] = df['log_followers'] - df['log_friends']
    # +1 in the denominator avoids a division by zero for accounts with no followers.
    df['user_listed_ratio'] = df['user.listed_count'] / (df['user.followers_count'] + 1)

    # --- C. Bio analysis (user.description) ---
    # Guarded like the other optional columns: an absent bio column becomes empty text.
    if 'user.description' in df.columns:
        df['user_desc'] = df['user.description'].fillna("")
    else:
        df['user_desc'] = ""

    # 1. Bio length
    df['desc_len'] = df['user_desc'].apply(len)

    # 2. Contact signals in the bio: something shaped like an e-mail, or a link
    df['desc_has_email'] = df['user_desc'].apply(lambda x: 1 if re.search(r'[\w\.-]+@[\w\.-]+', x) else 0)
    df['desc_has_link'] = df['user_desc'].apply(lambda x: 1 if "http" in x else 0)

    # 3. Keyword flag (case-insensitive substring match)
    pro_keywords = ['journaliste', 'journalist', 'official', 'officiel', 'media', 'news', 'presse', 'auteur', 'author']
    df['desc_is_pro'] = df['user_desc'].apply(lambda x: 1 if any(word in x.lower() for word in pro_keywords) else 0)

    # --- C bis. Screen-name statistics ---
    # run() selects 'name_digits_count' and 'name_len', so they must always exist.
    screen_col = next((c for c in ('user.screen_name', 'screen_name') if c in df.columns), None)
    if screen_col is None:
        print("ATTENTION : Colonne 'screen_name' introuvable. Remplie avec des vides.")
        screen_names = pd.Series("", index=df.index)
    else:
        screen_names = df[screen_col].fillna("").astype(str)
    df['name_digits_count'] = screen_names.apply(lambda s: count_regex(r'\d', s))
    df['name_len'] = screen_names.apply(len)

    # --- D. Tweet analysis (content and form) ---
    def get_text(row):
        # Prefer the extended text when that column exists and is not missing.
        text = str(row.get('text', ''))
        if 'extended_tweet.full_text' in row and not pd.isna(row['extended_tweet.full_text']):
            text = str(row['extended_tweet.full_text'])
        return text

    df['final_text'] = df.apply(get_text, axis=1)

    # Share of upper-case characters (+1 keeps empty texts from dividing by zero)
    df['caps_ratio'] = df['final_text'].apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x)+1))
    # Mean length of whitespace-separated tokens (0 when there are none)
    df['avg_word_len'] = df['final_text'].apply(lambda x: np.mean([len(w) for w in x.split()]) if len(x.split()) > 0 else 0)
    # Punctuation
    df['exclamation_count'] = df['final_text'].apply(lambda x: x.count('!'))

    # --- E. Source ---
    df['source_clean'] = df['source'].apply(extract_source)
    # NOTE(review): membership below is an exact match on the extracted label, so
    # short entries such as 'iPad' only match a source literally named 'iPad' —
    # confirm against the labels actually present in the data.
    top_sources = ['Twitter for iPhone', 'Twitter for Android', 'Twitter Web App', 'TweetDeck', 'iPad', 'Hootsuite', 'Buffer']
    df['source_category'] = df['source_clean'].apply(lambda x: x if x in top_sources else 'Other')

    # --- F. Tweet metadata ---
    if 'entities.hashtags' in df.columns:
        df['num_hashtags'] = df['entities.hashtags'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    else:
        df['num_hashtags'] = 0

    if 'entities.urls' in df.columns:
        df['num_urls'] = df['entities.urls'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    else:
        df['num_urls'] = 0

    # --- G. Text analysis features ---

    print("--- Extraction du texte complet et analyse textuelle ---")

    # 'final_text' already holds the extended text when available (else 'text'),
    # so it is reused instead of a second row-wise pass that raised KeyError when
    # 'extended_tweet.full_text' was absent. TweetDataset applies str() to the
    # text anyway, so the model sees the same input.
    df['full_text'] = df['final_text']

    kaggle_dataset = TweetDataset(
        df=df,
        tokenizer=tokenizer,
        text_col="full_text",
        label_col=None,      # inference only: labels are never read here
        max_length=128,
    )

    # shuffle=False keeps predictions in row order.
    kaggle_loader = DataLoader(kaggle_dataset, batch_size=32, shuffle=False)
    text_analysis_df = predict_from_text(model, kaggle_loader, device)
    # Assign by position: the prediction frame has its own 0..n-1 index, and
    # label-based alignment would misplace values if df is indexed differently.
    df['text_analysis_pred'] = text_analysis_df['Prediction'].to_numpy()

    return df

In [19]:
# ==========================================
# 2. PIPELINE DE TRAITEMENT
# ==========================================

In [None]:
def run():
    """End-to-end run: load data, build features, cross-validate, fit, write a submission.

    Reads 'train.jsonl' and 'kaggle_test.jsonl' from the working directory,
    flattens the records, applies `feature_engineering`, evaluates the pipeline
    with 5-fold stratified cross-validation (accuracy), refits on all training
    rows and writes 'submission_expert.csv' with 'ID' and 'Prediction' columns.
    """
    print("Chargement des données...")
    # Both files are expected in the current working directory.
    raw_train = pd.read_json('train.jsonl', lines=True)
    raw_test = pd.read_json('kaggle_test.jsonl', lines=True)

    print("Feature Engineering...")
    # Flatten nested records into dotted column names before adding features.
    flat_train = json_normalize(raw_train.to_dict(orient='records'))
    flat_test = json_normalize(raw_test.to_dict(orient='records'))

    # NOTE(review): 'text_analysis_pred' comes from a separately trained text
    # model; if that model saw these training rows, CV scores are optimistic — confirm.
    train_df = feature_engineering(flat_train)
    test_df = feature_engineering(flat_test)

    # --- Column selection ---
    account_counts = [
        'user.followers_count', 'user.friends_count', 'user.listed_count',
        'user.favourites_count', 'user.statuses_count',
    ]
    account_ratios = ['ratio_log', 'user_listed_ratio']
    bio_stats = ['desc_len', 'desc_has_email', 'desc_has_link', 'desc_is_pro']
    name_stats = ['name_digits_count', 'name_len']
    style_stats = ['caps_ratio', 'avg_word_len', 'exclamation_count']
    tweet_meta = ['num_hashtags', 'num_urls', 'retweet_count', 'favorite_count', 'text_analysis_pred']

    numeric_features = (
        account_counts + account_ratios + bio_stats + name_stats + style_stats + tweet_meta
    )
    categorical_features = ['source_category']

    # Scalar (not list) selectors: each text vectorizer receives a single column.
    tweet_text_col = 'final_text'
    desc_text_col = 'user_desc'  # the bio text also goes through TF-IDF

    # --- Transformers ---

    # 1. Numeric: median imputation, then standardisation
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # 2. Categorical: unseen categories are encoded as -1
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='Other')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])

    # 3. Tweet text: TF-IDF on uni/bi-grams reduced to 50 SVD components
    # NOTE(review): English stop words are used, while the text backbone
    # configured in this notebook is 'bertweetfr' — confirm the tweet language.
    tweet_text_transformer = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))),
        ('svd', TruncatedSVD(n_components=50, random_state=42)),
    ])

    # 4. Bio text: smaller vocabulary reduced to 10 SVD components
    desc_text_transformer = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),
        ('svd', TruncatedSVD(n_components=10, random_state=42)),
    ])

    # Global assembly; columns not listed here are not passed to the classifier.
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('tweet_txt', tweet_text_transformer, tweet_text_col),
        ('desc_txt', desc_text_transformer, desc_text_col),
    ])

    # --- Model ---
    booster_params = dict(
        max_iter=300,
        learning_rate=0.05,
        max_depth=12,
        max_leaf_nodes=40,
        l2_regularization=1.0,
        random_state=42,
        scoring='accuracy',
    )
    clf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', HistGradientBoostingClassifier(**booster_params)),
    ])

    # --- Execution ---
    features = train_df
    target = train_df['label']
    separator = "-" * 30

    print(separator)
    print("Validation Croisée (5-Fold)...")
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(clf_pipeline, features, target, cv=folds, scoring='accuracy', n_jobs=-1)

    mean_pct = np.mean(scores)*100
    std_pct = np.std(scores)*100
    print(f"Scores CV : {scores}")
    print(f"Moyenne CV : {mean_pct:.2f}% (+/- {std_pct:.2f}%)")
    print(separator)

    print("Entraînement Final...")
    clf_pipeline.fit(features, target)

    print("Prédiction...")
    predictions = clf_pipeline.predict(test_df)

    submission = pd.DataFrame({'ID': test_df['challenge_id'], 'Prediction': predictions})
    submission.to_csv('submission_expert.csv', index=False)
    print("Fichier 'submission_expert.csv' prêt.")

In [21]:
# Entry point: launch the full pipeline when this code is executed as the main
# module (which includes running the cell in a notebook kernel).
if __name__ == "__main__":
    run()

Chargement des données...
Feature Engineering...
--- Génération des features ---
ATTENTION : Colonne 'screen_name' introuvable. Remplie avec des vides.
--- Génération des features ---
ATTENTION : Colonne 'screen_name' introuvable. Remplie avec des vides.
------------------------------
Validation Croisée (5-Fold)...
Scores CV : [0.84097731 0.82574315 0.83875028 0.8331343  0.82983668]
Moyenne CV : 83.37% (+/- 0.56%)
------------------------------
Entraînement Final...
Prédiction...
Fichier 'submission_expert.csv' prêt.
