In [34]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import RandomOverSampler,SMOTE
from sklearn.preprocessing import LabelEncoder

In [2]:
from logic import processing

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Flotchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Flotchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Flotchi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/Flotchi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/Flotchi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
data = processing.load_data('Combined Data.csv')

In [5]:
df = processing.preproc(data,bi = False)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,statement,status,clean
0,0,oh my gosh,Anxiety,oh gosh
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleep confuse mind restless heart tune
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...
3,3,I've shifted my focus to something else but I'...,Anxiety,ive shift focus something else im still worry
4,4,"I'm restless and restless, it's been a month n...",Anxiety,im restless restless month boy mean


In [8]:
X = df.clean
y= df.status

In [10]:
def embed_sentence(word2vec, sentence):
    """Embed a sentence by averaging the embeddings of its words."""
    embedded_words = [word2vec.wv[word] for word in sentence if word in word2vec.wv]
    if not embedded_words:
        return np.zeros(word2vec.vector_size)  # Phrase sans mots connus -> vecteur nul
    return np.mean(embedded_words, axis=0)

def embedding(word2vec, sentences):
    """Embed a list of sentences."""
    return np.array([embed_sentence(word2vec, sentence.split()) for sentence in sentences])


In [13]:
# Entraîner Word2Vec
word2vec = Word2Vec(sentences=[sentence.split() for sentence in X], 
                    vector_size=60, 
                    min_count=5, 
                    window=5)

# Embedding des données
X_embed = embedding(word2vec, X)

In [14]:
ros = RandomOverSampler(random_state=101)
X_resampled, y_resampled = ros.fit_resample(X_embed, y)

In [21]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_resampled)

In [22]:
model = LogisticRegression(max_iter=1000)

# Cross-validation
cv_results = cross_validate(
    model,
    X_resampled,
    y_encoded,
    scoring=['accuracy', 'precision_macro', 'recall_macro'],
    cv=5,  # Par exemple, 5-fold cross-validation
)

print(cv_results)

{'fit_time': array([ 8.72809196,  9.4235661 , 10.25048399, 13.70297027,  9.66559696]), 'score_time': array([0.02637792, 0.02825785, 0.02315712, 0.02392077, 0.02332616]), 'test_accuracy': array([0.61270313, 0.59658396, 0.5889651 , 0.57507317, 0.57070464]), 'test_precision_macro': array([0.61259926, 0.59685518, 0.58954285, 0.57140771, 0.56656844]), 'test_recall_macro': array([0.61269782, 0.59658165, 0.5889691 , 0.57507194, 0.57070873])}


In [23]:
cv_results['test_accuracy'].mean()

0.5888059998507081

In [29]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
y_pred = cross_val_predict(model2, X_resampled, y_encoded, cv=5)

# Rapport détaillé
report = classification_report(y_encoded, y_pred, target_names=label_encoder.classes_)
print(report)

                      precision    recall  f1-score   support

             Anxiety       0.64      0.62      0.63     16351
             Bipolar       0.54      0.56      0.55     16351
          Depression       0.35      0.21      0.26     16351
              Normal       0.72      0.53      0.61     16351
Personality disorder       0.53      0.76      0.63     16351
              Stress       0.45      0.50      0.47     16351
            Suicidal       0.54      0.60      0.57     16351

            accuracy                           0.54    114457
           macro avg       0.54      0.54      0.53    114457
        weighted avg       0.54      0.54      0.53    114457



In [26]:
model2 = DecisionTreeClassifier(max_depth=9, min_samples_split=5, random_state=101)

cv_results = cross_validate(
    model2,
    X_resampled,
    y_encoded,
    scoring=['accuracy', 'precision_macro', 'recall_macro'],
    cv=5,  # Par exemple, 5-fold cross-validation
)

print(cv_results)

{'fit_time': array([6.06056905, 5.86930108, 5.12427402, 5.25986028, 5.71875   ]), 'score_time': array([0.04073095, 0.02344012, 0.0246911 , 0.03212285, 0.02347088]), 'test_accuracy': array([0.54857592, 0.5436397 , 0.53545061, 0.54008125, 0.53261107]), 'test_precision_macro': array([0.55433967, 0.54765892, 0.54024197, 0.53107546, 0.52554514]), 'test_recall_macro': array([0.54857186, 0.54363096, 0.53546324, 0.54007878, 0.53261648])}


In [27]:
cv_results['test_accuracy'].mean()

0.5400717118900775