In [1]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize
import math
from tqdm import tqdm
import re
from collections import Counter

# Natural Language Processing (NLP) libraries
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt

# Scikit-learn modeling libraries
from sklearn.dummy import DummyClassifier # For baseline model
from sklearn.feature_extraction.text import TfidfVectorizer # To convert text to numbers
from sklearn.linear_model import LogisticRegression # The classifier model
from sklearn.metrics import accuracy_score, classification_report # For evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score # For splitting and validating
from sklearn.pipeline import Pipeline # To chain processing step
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from torch.utils.data import Dataset, DataLoader
import torch

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import OneCycleLR
from torch.optim import Adam
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import copy




from transformers import BertModel, BertTokenizer
from transformers import AutoTokenizer, AutoModel
from transformers import RobertaModel

2025-12-10 01:04:43.560251: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
    
print("Using device:", device)

Using device: cuda


In [3]:
def extract_full_text(tweet):
    # Start with the standard 'text' field
    text = tweet['text']
    # Check if the 'extended_tweet.full_text' field exists (is not NaN)
    if not pd.isna(tweet['extended_tweet.full_text']):
        # If it exists, it's the full text, so use it instead
        text = tweet['extended_tweet.full_text']
    return text

In [4]:
data = pd.read_json('../data/raw/train.jsonl', lines=True)
data = json_normalize(data.to_dict(orient='records'))

kaggle_data = pd.read_json('../data/raw/kaggle_test.jsonl', lines=True)
kaggle_data = json_normalize(kaggle_data.to_dict(orient='records'))

In [5]:
# Exemple : extraire le texte visible entre les balises <a>...</a>
def extract_source(source_html):
    match = re.search(r'>(.*?)<', source_html)
    return match.group(1) if match else source_html

# Appliquer la fonction sur la colonne source
data['source_name'] = data['source'].apply(extract_source)
kaggle_data['source_name'] = kaggle_data['source'].apply(extract_source)

In [6]:
main_sources = {
    'Twitter for iPhone', 'Twitter Web App', 'Twitter for Android',
    'Twitter for iPad', 'TweetDeck', 'Hootsuite Inc.', 'dlvr.it',
    'IFTTT', 'Wordpress.com', 'Buffer', 'Echobox'
}

source_stats = data.groupby('source_name').agg(
    total=('label', 'size'),
    sum_labels=('label', 'sum')
)

list_1 = source_stats[
    (source_stats['total'] > 3) &
    (source_stats['sum_labels'] == source_stats['total']) &
    (~source_stats.index.isin(main_sources))
].index.tolist()

list_0 = source_stats[
    (source_stats['total'] > 3) &
    (source_stats['sum_labels'] == 0) &
    (~source_stats.index.isin(main_sources))
].index.tolist()

In [7]:
data['main_source'] = data['source_name'].apply(lambda x: 1 if x in main_sources else 0)
data['others_1'] = 0
data['others_0'] = 0
data['others']  = 0
main_source_dummies = pd.get_dummies(data['source_name'])
main_source_dummies = main_source_dummies[[s for s in main_sources if s in main_source_dummies.columns]]
main_source_dummies = main_source_dummies.astype(int)
data['others_1'] = data['source_name'].apply(lambda x: 1 if x in list_1 else 0)
data['others_0'] = data['source_name'].apply(lambda x: 1 if x in list_0 else 0)
data['others']   = data['source_name'].apply(lambda x: 1 if (x not in main_sources) and (x not in list_1) and (x not in list_0) else 0)
data = pd.concat([data, main_source_dummies], axis=1)

In [8]:
kaggle_data['main_source'] = kaggle_data['source_name'].apply(lambda x: 1 if x in main_sources else 0)
kaggle_data['others_1'] = 0
kaggle_data['others_0'] = 0
kaggle_data['others']  = 0
main_source_dummies = pd.get_dummies(kaggle_data['source_name'])
main_source_dummies = main_source_dummies[[s for s in main_sources if s in main_source_dummies.columns]]
main_source_dummies = main_source_dummies.astype(int)
kaggle_data['others_1'] = kaggle_data['source_name'].apply(lambda x: 1 if x in list_1 else 0)
kaggle_data['others_0'] = kaggle_data['source_name'].apply(lambda x: 1 if x in list_0 else 0)
kaggle_data['others']   = kaggle_data['source_name'].apply(lambda x: 1 if (x not in main_sources) and (x not in list_1) and (x not in list_0) else 0)
kaggle_data = pd.concat([kaggle_data, main_source_dummies], axis=1)

In [9]:
def combine_list_columns(df, cols, new_col, mode="merge"):
    """
    Combine plusieurs colonnes contenant des listes.
    
    mode = "merge" → fusionne toutes les listes dans new_col
    mode = "count" → compte le nombre total d'éléments dans les listes
    
    - df : DataFrame
    - cols : liste de noms de colonnes à traiter
    - new_col : nom de la colonne résultante
    """
    results = []
    
    for _, row in df.iterrows():
        if mode == "merge":
            merged = []
            for col in cols:
                val = row.get(col)
                if isinstance(val, list):
                    merged.extend(val)
            results.append(merged)

        elif mode == "count":
            count = 0
            for col in cols:
                val = row.get(col)
                if isinstance(val, list):
                    count += len(val)
            results.append(count)
        
        else:
            raise ValueError("mode must be 'merge' or 'count'")
    
    df[new_col] = results
    return df


In [10]:
data = combine_list_columns(data,['entities.user_mentions', 'extended_tweet.entities.user_mentions'],'all_user_mentions_count', mode ="count") #For all_users_mentions column
data['quoted_status.favorite_count'] = pd.to_numeric(data['quoted_status.favorite_count'], errors='coerce')
data['quoted_status.user.favourites_count'] = pd.to_numeric(data['quoted_status.user.favourites_count'], errors='coerce')

kaggle_data = combine_list_columns(kaggle_data,['entities.user_mentions', 'extended_tweet.entities.user_mentions'],'all_user_mentions_count', mode ="count") #For all_users_mentions column
kaggle_data['quoted_status.favorite_count'] = pd.to_numeric(kaggle_data['quoted_status.favorite_count'], errors='coerce')
kaggle_data['quoted_status.user.favourites_count'] = pd.to_numeric(kaggle_data['quoted_status.user.favourites_count'], errors='coerce')

In [11]:
data = combine_list_columns(data,['entities.media', 'extended_tweet.entities.media','extended_entities.media','extended_tweet.extended_entities.media'],'all_media', mode = "merge") 
kaggle_data = combine_list_columns( kaggle_data,['entities.media', 'extended_tweet.entities.media','extended_entities.media','extended_tweet.extended_entities.media'],'all_media', mode = "merge") 

def extract_media_types(media_list):
    if isinstance(media_list, list):
        return {m.get("type") for m in media_list if isinstance(m, dict)}
    return set()

types_series = data['all_media'].apply(extract_media_types)

data['has_photo'] = types_series.apply(lambda t: 1 if "photo" in t else 0)
data['has_video'] = types_series.apply(lambda t: 1 if "video" in t else 0)
data['has_gif']   = types_series.apply(lambda t: 1 if "animated_gif" in t else 0)

types_series_kaggle = kaggle_data['all_media'].apply(extract_media_types)

kaggle_data['has_photo'] = types_series_kaggle.apply(lambda t: 1 if "photo" in t else 0)
kaggle_data['has_video'] = types_series_kaggle.apply(lambda t: 1 if "video" in t else 0)
kaggle_data['has_gif']   = types_series_kaggle.apply(lambda t: 1 if "animated_gif" in t else 0)

In [12]:
data = combine_list_columns(data,['entities.hashtags', 'extended_tweet.entities.hashtags'],'all_hashtags', mode = "merge") 
kaggle_data = combine_list_columns( kaggle_data,['entities.hashtags', 'extended_tweet.entities.hashtags'],'all_hashtags', mode = "merge") 
data_combined = pd.concat([data, kaggle_data], ignore_index=True)

all_hashtags = []
for hashtags_list in data_combined['all_hashtags']:
    if isinstance(hashtags_list, list):
        for hashtag in hashtags_list:
            if isinstance(hashtag, dict) and 'text' in hashtag:
                all_hashtags.append(hashtag['text'].lower())
hashtag_counts = Counter(all_hashtags)  
print("Top 20 hashtags:")
for tag, count in hashtag_counts.most_common(20):
    print(f"{tag}: {count}")

Top 20 hashtags:
covid19: 36666
covid: 7092
coronavirus: 5027
confinement: 3560
couvrefeu: 2314
covid_19: 2238
macron: 2136
vaccin: 2084
covidー19: 2057
covid19france: 1830
vaccination: 1719
france: 1564
confinement2: 962
reconfinement: 944
castex: 901
santé: 885
confinement3: 817
macron20h: 762
vaccins: 759
covid19fr: 738


In [13]:
from collections import Counter

def extract_hashtags(hashtag_list):
    if not isinstance(hashtag_list, list):
        return []
    return [h['text'].lower() for h in hashtag_list if isinstance(h, dict) and 'text' in h]

data['clean_hashtags'] = data['all_hashtags'].apply(extract_hashtags)
kaggle_data['clean_hashtags'] = kaggle_data['all_hashtags'].apply(extract_hashtags)

# 2️⃣ Compter tous les hashtags dans data
all_tags = [tag for tags in data['clean_hashtags'] for tag in tags]
hashtag_counts = Counter(all_tags)

top_tags = [tag for tag, _ in hashtag_counts.most_common(20)]
print("Top 20 hashtags:", top_tags)

for tag in top_tags:
    colname = f"hashtag_{tag}"
    data[colname] = data['clean_hashtags'].apply(lambda tags: int(tag in tags))
    kaggle_data[colname] = kaggle_data['clean_hashtags'].apply(lambda tags: int(tag in tags))

print(data[[f"hashtag_{tag}" for tag in top_tags]].head())
print(kaggle_data[[f"hashtag_{tag}" for tag in top_tags]].head())

Top 20 hashtags: ['covid19', 'covid', 'coronavirus', 'confinement', 'couvrefeu', 'covid_19', 'macron', 'covidー19', 'vaccin', 'covid19france', 'vaccination', 'france', 'reconfinement', 'confinement2', 'castex', 'confinement3', 'santé', 'vaccins', 'macron20h', 'covid19fr']
   hashtag_covid19  hashtag_covid  hashtag_coronavirus  hashtag_confinement  \
0                0              0                    0                    0   
1                0              0                    0                    0   
2                0              0                    0                    0   
3                1              0                    0                    0   
4                0              0                    0                    0   

   hashtag_couvrefeu  hashtag_covid_19  hashtag_macron  hashtag_covidー19  \
0                  0                 0               0                 0   
1                  0                 0               0                 0   
2                  0     

In [14]:
data['favourites_per_status'] = data['user.favourites_count'] / (data['user.statuses_count'] + 1)
data['listed_per_status'] = data['user.listed_count'] / (data['user.statuses_count'] + 1)

kaggle_data['favourites_per_status'] = kaggle_data['user.favourites_count'] / (kaggle_data['user.statuses_count'] + 1)
kaggle_data['listed_per_status'] = kaggle_data['user.listed_count'] / (kaggle_data['user.statuses_count'] + 1)

In [18]:
X = data.drop('label', axis=1)
y = data['label']

X_kaggle = kaggle_data

# Apply this function to every row (axis=1) in the training data
X['full_text'] = X.apply(lambda tweet: extract_full_text(tweet), axis=1)
# Apply the same function to the Kaggle test data
X_kaggle['full_text'] = X_kaggle.apply(lambda tweet: extract_full_text(tweet), axis=1)

print(X.shape)

(154914, 237)


In [19]:
feature_cols = [
    "user.favourites_count",
    "user.listed_count",
    "user.statuses_count",
    "user.profile_use_background_image",
    "user.default_profile",
    "user.geo_enabled",
    "user.profile_background_tile",
    "all_user_mentions_count",
    "quoted_status.favorite_count",
    "quoted_status.user.favourites_count",
    "others_1",
    "others_0",
    "others",
    "Twitter Web App",
    "Twitter for iPhone",
    "Buffer",
    "Twitter for iPad",
    "Hootsuite Inc.",
    "TweetDeck",
    "IFTTT",
    "dlvr.it",
    "Twitter for Android",
    "Echobox",
    "has_photo",
    "has_video",
    "has_gif",
    "hashtag_covid19",
    "hashtag_coronavirus",
    "hashtag_covidー19",
    "hashtag_covid",
    "hashtag_confinement",
    "hashtag_couvrefeu",
    "hashtag_covid_19",
    "hashtag_macron",
    "hashtag_vaccin",
    "hashtag_covid19france",
    "hashtag_vaccination",
    "hashtag_france",
    "hashtag_reconfinement",
    "hashtag_confinement2",
    "hashtag_castex",
    "hashtag_confinement3",
    "hashtag_santé",
    "hashtag_vaccins",
    "hashtag_macron20h",
    "hashtag_covid19fr",
    "favourites_per_status",
    "listed_per_status"
]

to_remove = [
    "Buffer",
    "Twitter for iPad",
    "IFTTT",
    "dlvr.it",
    "has_gif",
    "hashtag_covid",
    "hashtag_coronavirus",
    "hashtag_confinement",
    "hashtag_couvrefeu",
    "hashtag_covid_19",
    "hashtag_macron",
    "hashtag_covidー19",
    "hashtag_vaccin",
    "hashtag_covid19france",
    "hashtag_vaccination",
    "hashtag_france",
    "hashtag_reconfinement",
    "hashtag_confinement2",
    "hashtag_castex",
    "hashtag_confinement3",
    "hashtag_santé",
    "hashtag_vaccins",
    "hashtag_macron20h",
    "hashtag_covid19fr"
]

# Retirer ces features de feature_cols
feature_cols = [f for f in feature_cols if f not in to_remove]

X_feats = X[feature_cols].fillna(0).astype(float)
X_kaggle_feats = X_kaggle[feature_cols].fillna(0).astype(float)

# Ajouter les features textuelles (full_text et user.description) ici
X_feats['full_text'] = X['full_text']
X_kaggle_feats['full_text'] = X_kaggle['full_text']
X_feats['user.description'] = X['user.description']
X_kaggle_feats['user.description'] = X_kaggle['user.description']

In [20]:
from sklearn.model_selection import train_test_split

# Nettoyage basique du texte
def clean_text(s):
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = re.sub(r"http\S+", "", s)
    s = re.sub(r"@\w+", "", s)
    s = re.sub(r"#\w+", "", s)
    s = re.sub(r"[^a-z\s]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

X_feats['full_text'] = X_feats['full_text'].apply(clean_text)
X_feats['user.description'] = X_feats['user.description'].apply(clean_text)

X_kaggle_feats['full_text'] = X_kaggle_feats['full_text'].apply(clean_text)
X_kaggle_feats['user.description'] = X_kaggle_feats['user.description'].apply(clean_text)

# Split
X_train, X_val, y_train, y_val = train_test_split(
    X_feats, y, test_size=0.2, stratify=y, random_state=42
)

print(X_train.shape, X_val.shape)


(123931, 26) (30983, 26)


In [21]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased").to(device)
bert_model.eval()

@torch.no_grad()
def embed_texts(texts, batch_size=16):
    all_embeddings = []
    
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size].tolist()
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(device)
        
        outputs = bert_model(**inputs)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        all_embeddings.append(cls_embeddings.cpu().numpy())
    
    return np.vstack(all_embeddings)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [22]:
print("Embedding full_text (train)...")
X_train_text_emb = embed_texts(X_train['full_text'])

print("Embedding description (train)...")
X_train_desc_emb = embed_texts(X_train['user.description'])

print("Embedding full_text (val)...")
X_val_text_emb = embed_texts(X_val['full_text'])

print("Embedding description (val)...")
X_val_desc_emb = embed_texts(X_val['user.description'])

print("Embedding full_text (kaggle)...")
X_kaggle_text_emb = embed_texts(X_kaggle_feats['full_text'])

print("Embedding description (kaggle)...")
X_kaggle_desc_emb = embed_texts(X_kaggle_feats['user.description'])


Embedding full_text (train)...


100%|██████████| 7746/7746 [03:05<00:00, 41.82it/s]


Embedding description (train)...


100%|██████████| 7746/7746 [01:47<00:00, 71.91it/s]


Embedding full_text (val)...


100%|██████████| 1937/1937 [00:46<00:00, 41.69it/s]


Embedding description (val)...


100%|██████████| 1937/1937 [00:27<00:00, 71.05it/s]


Embedding full_text (kaggle)...


100%|██████████| 6462/6462 [02:34<00:00, 41.72it/s]


Embedding description (kaggle)...


100%|██████████| 6462/6462 [01:27<00:00, 73.89it/s]


In [23]:
# Suppression des colonnes texte brutes
num_cols = [c for c in X_feats.columns if c not in ['full_text', 'user.description']]

X_train_num = X_train[num_cols].values
X_val_num   = X_val[num_cols].values
X_kaggle_num = X_kaggle_feats[num_cols].values

# Concaténation
X_train_final = np.hstack([X_train_num, X_train_text_emb, X_train_desc_emb])
X_val_final   = np.hstack([X_val_num,   X_val_text_emb,   X_val_desc_emb])
X_kaggle_final = np.hstack([X_kaggle_num, X_kaggle_text_emb, X_kaggle_desc_emb])

print(X_train_final.shape, X_val_final.shape, X_kaggle_final.shape)


(123931, 1560) (30983, 1560) (103380, 1560)


In [25]:
!pip install xgboost

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.1.2


In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    tree_method="hist"
)

xgb_model.fit(
    X_train_final, y_train,
    eval_set=[(X_val_final, y_val)],
    verbose=True
)


[0]	validation_0-logloss:0.66620
[1]	validation_0-logloss:0.64359
[2]	validation_0-logloss:0.62326
[3]	validation_0-logloss:0.60481
[4]	validation_0-logloss:0.58742
[5]	validation_0-logloss:0.57159
[6]	validation_0-logloss:0.55762
[7]	validation_0-logloss:0.54389
[8]	validation_0-logloss:0.53132
[9]	validation_0-logloss:0.51958
[10]	validation_0-logloss:0.50868
[11]	validation_0-logloss:0.49839
[12]	validation_0-logloss:0.48893
[13]	validation_0-logloss:0.47990
[14]	validation_0-logloss:0.47165
[15]	validation_0-logloss:0.46392
[16]	validation_0-logloss:0.45673
[17]	validation_0-logloss:0.44975
[18]	validation_0-logloss:0.44353
[19]	validation_0-logloss:0.43745
[20]	validation_0-logloss:0.43176
[21]	validation_0-logloss:0.42620
[22]	validation_0-logloss:0.42091
[23]	validation_0-logloss:0.41614
[24]	validation_0-logloss:0.41211
[25]	validation_0-logloss:0.40786
[26]	validation_0-logloss:0.40398
[27]	validation_0-logloss:0.39981
[28]	validation_0-logloss:0.39604
[29]	validation_0-loglos

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [27]:
from sklearn.metrics import classification_report, accuracy_score

y_val_pred = xgb_model.predict(X_val_final)

print("Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))


Accuracy: 0.933124616725301
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     16535
           1       0.95      0.90      0.93     14448

    accuracy                           0.93     30983
   macro avg       0.94      0.93      0.93     30983
weighted avg       0.93      0.93      0.93     30983



In [28]:
import joblib

joblib.dump(xgb_model, "../models/xgb_bert_model.pkl")
joblib.dump(num_cols, "../models/feature_columns.pkl")

bert_model.save_pretrained("../models/bert_encoder/")
tokenizer.save_pretrained("../models/bert_encoder/")

print("Models saved successfully.")


Models saved successfully.


In [30]:
kaggle_preds_proba = xgb_model.predict_proba(X_kaggle_final)[:, 1]
kaggle_preds = (kaggle_preds_proba > 0.5).astype(int)

submission = pd.DataFrame({
    "ID": kaggle_data["challenge_id"],
    "Prediction": kaggle_preds
})

submission.to_csv("../submissions/submission_bert_xgb.csv", index=False)

print("Submission saved.")


Submission saved.


## Optionnel : sauvegarde des embeddings pour plus tard

In [None]:
np.save("../features/X_train_final.npy", X_train_final)
np.save("../features/X_val_final.npy", X_val_final)
np.save("../features/X_kaggle_final.npy", X_kaggle_final)
np.save("../features/y_train.npy", y_train.values)
np.save("../features/y_val.npy", y_val.values)
