# Data Exploration

In [1]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize
import matplotlib.pyplot as plt
from pathlib import Path


# Natural Language Processing (NLP) libraries
from nltk.corpus import stopwords

# Scikit-learn modeling libraries
from sklearn.dummy import DummyClassifier # For baseline model
from sklearn.feature_extraction.text import TfidfVectorizer # To convert text to numbers
from sklearn.linear_model import LogisticRegression # The classifier model
from sklearn.metrics import accuracy_score, log_loss, classification_report # For evaluation
from sklearn.model_selection import train_test_split, learning_curve, StratifiedKFold, cross_val_score # For splitting and validating
from sklearn.pipeline import Pipeline # To chain processing steps
import random
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from datetime import datetime

## Ouverture des datasets avec panda

In [2]:
compute_csv = False

# Définition du dossier cible de manière propre
DATA_RAW_DIR = Path("../data/raw")
DATA_RAW_DIR.mkdir(parents=True, exist_ok=True)
DATA_PROCESSED_DIR = Path("../data/processed")
DATA_PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

X_TRAIN_PATH = DATA_RAW_DIR / "X_train.csv"
Y_TRAIN_PATH = DATA_RAW_DIR / "y_train.csv"
X_KAGGLE_PATH = DATA_RAW_DIR / "X_kaggle.csv"

if compute_csv:
    train_data = pd.read_json('../data/raw/train.jsonl', lines=True)
    train_data = json_normalize(train_data.to_dict(orient='records'))

    kaggle_data = pd.read_json('../data/raw/kaggle_test.jsonl', lines=True)
    kaggle_data = json_normalize(kaggle_data.to_dict(orient='records'))

    X_train = train_data.drop('label', axis=1)
    y_train = train_data['label']

    X_kaggle = kaggle_data

    X_train.to_csv(X_TRAIN_PATH, index=False)
    y_train.to_csv(Y_TRAIN_PATH, index=False)
    X_kaggle.to_csv(X_KAGGLE_PATH, index=False)

else:
    X_train = pd.read_csv(X_TRAIN_PATH)
    y_train = pd.read_csv(Y_TRAIN_PATH).iloc[:, 0]
    X_kaggle = pd.read_csv(X_KAGGLE_PATH)


  X_train = pd.read_csv(X_TRAIN_PATH)
  X_kaggle = pd.read_csv(X_KAGGLE_PATH)


In [3]:
# Define a function to get the full text from a tweet object.
# Tweets can be truncated, storing the full version in 'extended_tweet.full_text'.
def extract_full_text(tweet):
    # Start with the standard 'text' field
    text = tweet['text']
    # Check if the 'extended_tweet.full_text' field exists (is not NaN)
    if not pd.isna(tweet['extended_tweet.full_text']):
        # If it exists, it's the full text, so use it instead
        text = tweet['extended_tweet.full_text']
    return text

# Apply this function to every row (axis=1) in the training data
X_train['full_text'] = X_train.apply(lambda tweet: extract_full_text(tweet), axis=1)
# Apply the same function to the Kaggle test data
X_kaggle['full_text'] = X_kaggle.apply(lambda tweet: extract_full_text(tweet), axis=1)

## Filtrage des features essentielles

In [4]:
essential_features = [
    "full_text", # Texte du tweet
    "extended_tweet.entities.hashtags", # Hashtags dans le tweet
    "possibly_sensitive", # Indicateur de contenu sensible
    "is_quote_status", # Indicateur si le tweet est une citation
    "user.statuses_count", # Nombre total de tweets de l'utilisateur
    "user.favourites_count", # Nombre de tweets favs par l'utilisateur
    "user.description", # Description du profil utilisateur
    "user.created_at", # Date de création du compte utilisateur
]

X_train_essential = X_train[essential_features].copy()
X_kaggle_essential  = X_kaggle[essential_features].copy()

# Gestion des valeurs manquantes
X_train_essential['possibly_sensitive'] = X_train_essential['possibly_sensitive'].fillna(0).astype(int)
X_train_essential['user.description'] = X_train_essential['user.description'].fillna('')
X_kaggle_essential['possibly_sensitive'] = X_kaggle_essential['possibly_sensitive'].fillna(0).astype(int)
X_kaggle_essential['user.description'] = X_kaggle_essential['user.description'].fillna('')

In [5]:
# Transformations sur les colonnes restantes (ex: ancienneté du compte)

def compute_account_age(date_series):
    now = pd.Timestamp.now(tz='UTC')
    # Convertir toute la série d'un coup au lieu d'appliquer lambda
    dates = pd.to_datetime(date_series, errors="coerce", utc=True)
    return (now - dates).dt.days

# Fonction optimisée pour compter les hashtags
def count_hashtags_safe(x):
    if not isinstance(x, str) or not x.startswith("["):
        return 0
    try:
        # Méthode plus rapide : compter les occurrences de '{' qui indiquent un hashtag
        return x.count('{"text"')  # Ou simplement x.count('{') selon le format
    except:
        return 0

# Ajout de la feature dérivée "account_age_days" - OPTIMISÉ
X_train_essential["account_age_days"] = compute_account_age(X_train_essential["user.created_at"])
X_kaggle_essential["account_age_days"] = compute_account_age(X_kaggle_essential["user.created_at"])

# Encodage des hashtags : OPTIMISÉ - éviter eval()
X_train_essential["n_hashtags"] = X_train_essential["extended_tweet.entities.hashtags"].apply(count_hashtags_safe)
X_kaggle_essential["n_hashtags"] = X_kaggle_essential["extended_tweet.entities.hashtags"].apply(count_hashtags_safe)

# Gestion des valeurs manquantes - OPTIMISÉ avec fillna vectorisé
X_train_essential['full_text'] = X_train_essential['full_text'].fillna('')
X_train_essential['user.description'] = X_train_essential['user.description'].fillna('')
X_kaggle_essential['full_text'] = X_kaggle_essential['full_text'].fillna('')
X_kaggle_essential['user.description'] = X_kaggle_essential['user.description'].fillna('')

# Colonnes finales
text_columns = ["full_text", "user.description"]
num_columns = ["possibly_sensitive", "is_quote_status", "user.statuses_count",
               "user.favourites_count", "account_age_days", "n_hashtags"]

  dates = pd.to_datetime(date_series, errors="coerce", utc=True)
  dates = pd.to_datetime(date_series, errors="coerce", utc=True)


In [6]:
X_train_essential.to_csv(DATA_PROCESSED_DIR / "X_train_essential.csv", index=False)
X_kaggle_essential.to_csv(DATA_PROCESSED_DIR / "X_kaggle_essential.csv", index=False)