In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import tensorflow
import tensorflow_hub as hub
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input

In [97]:
# filepath = 'datasets/depression_dataset_reddit_cleaned.csv'
def load_data(filepath):
    """Load the dataset CSV and return row-aligned texts and labels.

    Rows whose ``clean_text`` is missing are removed from the frame *before*
    either column is extracted, so ``texts[i]`` always belongs to
    ``labels[i]``. (Dropping NaNs on the text column alone would shorten
    ``texts`` while leaving ``labels`` at full length.)

    Args:
        filepath: Path to a CSV file containing the columns ``clean_text``
            and ``is_depression``.

    Returns:
        tuple: ``(texts, labels)`` as NumPy arrays of equal length.

    Raises:
        KeyError: If one of the expected columns is absent from the CSV.
    """
    df = pd.read_csv(filepath)

    # Filter whole rows so both columns stay in lockstep.
    df_complete = df.dropna(subset=['clean_text'])
    texts = df_complete['clean_text'].values
    labels = df_complete['is_depression'].values

    # Raw frame shape vs. number of usable samples after filtering.
    print(df.shape)
    print(texts.shape)
    return texts, labels

In [98]:
def text_processing(texts, max_length=128, batch_size=32):
    """Encode texts into BERT ``pooled_output`` sentence embeddings.

    Texts are pushed through the TF Hub BERT preprocessing model and the
    uncased BERT-base encoder in mini-batches; the per-batch pooled outputs
    are stacked into one 2-D array.

    Args:
        texts: Sequence (list or NumPy array) of strings to encode.
        max_length: Token sequence length used when packing the encoder
            inputs. The default of 128 uses the one-shot preprocessing layer;
            any other value goes through the model's ``tokenize`` and
            ``bert_pack_inputs`` sub-objects, as described in the TF Hub
            documentation for this preprocessing model.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        numpy.ndarray: Array of shape ``(len(texts), 768)``. An empty input
        yields an empty ``(0, 768)`` float32 array without loading any model.

    Raises:
        ValueError: If ``max_length`` or ``batch_size`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be >= 1, got {max_length}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Width of ``pooled_output`` for the H-768 encoder referenced below.
    embedding_dim = 768

    # Nothing to encode: skip the model download and avoid np.vstack([]),
    # which raises on an empty list.
    if len(texts) == 0:
        return np.empty((0, embedding_dim), dtype=np.float32)

    preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
    encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

    if max_length == 128:
        # Default path: single-call preprocessing layer (unchanged behaviour).
        bert_processor = hub.KerasLayer(preprocess_url)
    else:
        # Custom sequence length: tokenize first, then pack with seq_length.
        preprocessor = hub.load(preprocess_url)
        tokenize = hub.KerasLayer(preprocessor.tokenize)
        pack_inputs = hub.KerasLayer(
            preprocessor.bert_pack_inputs,
            arguments=dict(seq_length=max_length),
        )

        def bert_processor(batch):
            # bert_pack_inputs expects a list of tokenized segments.
            return pack_inputs([tokenize(batch)])

    bert_encoder = hub.KerasLayer(encoder_url)

    encoded_texts = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        preprocessed = bert_processor(batch_texts)
        pooled = bert_encoder(preprocessed)['pooled_output']
        encoded_texts.append(pooled.numpy())

    return np.vstack(encoded_texts)

In [99]:
def build_model(input_shape):
    """Assemble the (uncompiled) binary classification head.

    Architecture: input -> Dense(256, relu, L2) -> Dropout(0.4)
    -> Dense(128, relu, L2) -> Dropout(0.4) -> Dense(1, sigmoid).

    Args:
        input_shape: Shape tuple of a single input sample, e.g. ``(768,)``.

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    l2 = tensorflow.keras.regularizers.l2

    stack = [Input(shape=input_shape)]
    # Two regularised hidden blocks, each followed by dropout.
    for units in (256, 128):
        stack.append(Dense(units, activation='relu', kernel_regularizer=l2(0.01)))
        stack.append(Dropout(0.4))
    # Single sigmoid unit producing the positive-class probability.
    stack.append(Dense(1, activation='sigmoid'))

    return Sequential(stack)

In [100]:
def plot_training_history(history):
    """Show side-by-side training/validation curves for loss and accuracy.

    Args:
        history: Object exposing a ``history`` dict with the keys ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy`` (one value per
            epoch), such as the object returned by ``model.fit``.
    """
    logs = history.history
    _, axes = plt.subplots(1, 2, figsize=(12, 4))

    # (train key, val key, train label, val label, title, y-axis label)
    panels = (
        ('loss', 'val_loss', 'Train Loss', 'Val Loss',
         'Loss over Epochs', 'Loss'),
        ('accuracy', 'val_accuracy', 'Train Accuracy', 'Val Accuracy',
         'Accuracy over Epochs', 'Accuracy'),
    )

    for ax, (train_key, val_key, train_label, val_label, title, y_label) in zip(axes, panels):
        ax.plot(logs[train_key], label=train_label)
        ax.plot(logs[val_key], label=val_label)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()

    plt.show()

In [101]:
def main(data_path):
    """Train and evaluate the depression classifier end to end.

    Steps: load texts and labels, encode the texts with ``text_processing``,
    split into train / validation / test sets, fit the network from
    ``build_model`` with early stopping and checkpointing monitored on the
    validation set, then report metrics on the test set and plot the curves.

    The test set is never shown to ``model.fit``: using it as
    ``validation_data`` would let early stopping and checkpoint selection
    pick weights on the very data used for the final report, making the
    reported scores optimistic.

    Args:
        data_path: Path of the CSV file handed to ``load_data``.

    Returns:
        tuple: ``(model, history)`` - the trained Keras model and the object
        returned by ``model.fit``.

    Raises:
        ValueError: If the loaded texts and labels differ in length.
    """
    texts, labels = load_data(data_path)

    # Fail fast: a mismatch would otherwise only surface in train_test_split,
    # after the expensive BERT encoding has already run.
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels are misaligned: {len(texts)} texts vs {len(labels)} labels"
        )

    X = text_processing(texts)
    y = np.array(labels)

    # Prints the per-class counts; note that balance itself is not checked.
    if len(np.unique(y)) == 2:
        print("Classes équilibrées:", np.bincount(y))

    # Hold out 20% as the final test set ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # ... and 20% of the remainder as the validation set that drives
    # early stopping and checkpoint selection.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Seed TensorFlow so weight initialisation and dropout are repeatable,
    # matching the fixed random_state used for the splits.
    tensorflow.random.set_seed(42)

    model = build_model(input_shape=(X.shape[1],))

    model.compile(
        optimizer=tensorflow.keras.optimizers.Adam(learning_rate=1e-4),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=15,
        batch_size=32,
        callbacks=[
            tensorflow.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
            tensorflow.keras.callbacks.ModelCheckpoint('depression_model.h5', save_best_only=True)
        ]
    )

    # Evaluation on the untouched test set; predict returns shape (n, 1),
    # so flatten to match the 1-D label vector.
    y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
    print("\nRapport de classification :")
    print(classification_report(y_test, y_pred, target_names=['Non-dépression', 'Dépression']))
    f1 = f1_score(y_test, y_pred)
    print(f"F1-Score : {f1:.4f}")

    plot_training_history(history)

    return model, history

In [None]:
if __name__ == "__main__":
    # Entry point: runs the whole pipeline when this code is executed
    # directly rather than imported as a module.
    data_path = "datasets/depression_dataset_reddit_cleaned.csv"
    # main returns (model, history); both are kept as top-level names so
    # they remain available after this cell has run.
    model, history = main(data_path)