# ИД22-2, Кирюшкин Николай, КП

 Датасет - https://www.kaggle.com/datasets/suchintikasarkar/sentiment-analysis-for-mental-health/data

### Импорт необходимых библиотек

In [1]:
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

%matplotlib inline
from imblearn.over_sampling import RandomOverSampler
from matplotlib import pyplot as plt
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from wordcloud import WordCloud
from xgboost import XGBClassifier

In [2]:
# Fetch the NLTK resources used later in the notebook (tokenizing, POS tagging, lemmatizing).
nltk.download('popular')  # NLTK's 'popular' collection; the log below lists the packages it pulls in
# NOTE(review): presumably the model behind nltk.pos_tag — confirm the resource name
# matches the installed NLTK version.
nltk.download('averaged_perceptron_tagger')
# NOTE(review): presumably the tables behind word_tokenize / sent_tokenize — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\kkiry\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\kkiry\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\kkiry\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\kkiry\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\kkiry\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

True

### Чтение и описание данных

In [3]:
# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv('Combined Data.csv', index_col=0, encoding="utf-8")
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [4]:
# Per-column summary: non-null count, number of distinct values, most frequent value and its frequency.
df.describe()

Unnamed: 0,statement,status
count,52681,53043
unique,51073,7
top,what do you mean?,Normal
freq,22,16351


In [5]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53043 entries, 0 to 53042
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  52681 non-null  object
 1   status     53043 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [6]:
# Number of missing values per column, printed under a heading.
print("Пропущенные значения:", df.isna().sum(), sep="\n")

Пропущенные значения:
statement    362
status         0
dtype: int64


In [7]:
# Drop every row that has a missing value (only `statement` has nulls, per the counts above),
# then re-check that no nulls remain.
df.dropna(inplace=True)
df.isna().sum()

statement    0
status       0
dtype: int64

### Предобработка данных

In [8]:
def preprocess_text(text):
    """Normalize a raw statement into lowercase text without markup or noise.

    Steps, in order: lowercase; drop markdown links, @mentions, URLs and HTML
    tags; delete punctuation characters; drop every token that contains a
    digit; collapse all whitespace runs (including line breaks) into a single
    space.

    Args:
        text: Raw statement string.

    Returns:
        The cleaned string, words separated by single spaces, with no
        leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)  # markdown links: [label](target)
    text = re.sub(r'@\w+', '', text)  # @mentions
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # bare URLs
    text = re.sub(r'<.*?>+', '', text)  # HTML tags
    # Punctuation is deleted (not replaced by a space) so contractions stay one token: "i'm" -> "im".
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)  # any token containing a digit
    # Line breaks and other whitespace become ONE space. Deleting "\n" outright
    # would glue the last word of a line to the first word of the next one.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Cleaned copy of every statement, stored next to the raw text.
df['preprocessed_statement'] = df['statement'].apply(preprocess_text)

In [None]:
# Tokenize the cleaned statements; each cell of `tokens` holds the list returned by word_tokenize.
df['tokens'] = df['preprocessed_statement'].apply(word_tokenize)

# Single shared lemmatizer instance, used inside lemmatize().
lemmatizer = WordNetLemmatizer()

def pos_tagger(nltk_tag):
    """Map a POS tag string to the matching ``wordnet`` part-of-speech constant.

    Only the first letter of the tag matters: J -> ADJ, V -> VERB, N -> NOUN,
    R -> ADV.

    Args:
        nltk_tag: Tag string (lemmatize() passes the tags returned by nltk.pos_tag).

    Returns:
        The corresponding ``wordnet`` constant, or None when the tag starts
        with any other letter (or is empty).
    """
    wordnet_attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = wordnet_attr_by_initial.get(nltk_tag[:1])
    if attr_name is None:
        return None
    return getattr(wordnet, attr_name)

def lemmatize(tokens):
    """Lemmatize a list of tokens and return them as one space-joined string.

    Every token is POS-tagged with nltk.pos_tag; the tag is translated by
    pos_tagger(). Tokens whose tag has no WordNet equivalent are kept
    unchanged, the others are passed to the shared ``lemmatizer`` together
    with their WordNet part of speech.

    Args:
        tokens: List of word tokens.

    Returns:
        A single string with the (possibly lemmatized) tokens separated by spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(tokens):
        wn_pos = pos_tagger(tag)
        lemmas.append(token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

# Despite the column name, each cell holds ONE space-joined string (lemmatize() returns " ".join(...)),
# not a list of tokens.
df['lemmatized_tokens'] = df['tokens'].apply(lemmatize)

In [None]:
# Peek at the first rows with the columns added during preprocessing.
df.head(5)

### Добавление новых признаков

In [None]:
# Hand-crafted numeric features.
# Spellings counted as a first-person pronoun in the lemmatized text.
first_person_forms = ['i','ive','im','i`m','i`ve']

df['statement_len'] = df['statement'].apply(len)  # length of the raw statement in characters
df['pronoun_count'] = df['lemmatized_tokens'].apply(
    lambda text: sum(1 for word in text.split() if word in first_person_forms)
)
df['word_count'] = df['statement'].apply(lambda raw: len(nltk.word_tokenize(raw)))
df['sentence_count'] = df['statement'].apply(lambda raw: len(nltk.sent_tokenize(raw)))

In [None]:
# Summary statistics after the numeric feature columns were added.
df.describe()

### Визуализация данных

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Outlier fences for statement length: 1.5 * IQR beyond the first / third quartile.
Q1, Q3 = df['statement_len'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# between() is inclusive on both ends, i.e. lower_bound <= len <= upper_bound.
filtered_df = df[df['statement_len'].between(lower_bound, upper_bound)]

# Top panel: all statements; bottom panel: statements inside the IQR fences.
fig = make_subplots(rows=2, cols=1,
                    subplot_titles=("С выбросами", "Очищено от выбросов"))
fig.add_trace(go.Histogram(x=df['statement_len']), row=1, col=1)
fig.add_trace(go.Histogram(x=filtered_df['statement_len']), row=2, col=1)

fig.update_layout(height=600, width=800, title_text="Распределение длинны текста", showlegend=False)
fig.show()

In [None]:
# One averaged bar series per numeric metric, grouped by status.
# Each entry: (feature column, legend name).
# Both the bar heights and the bar labels are taken from the SAME column, so a
# trace can no longer show one metric while being labelled with another
# (the sentence-count and word-count labels used to be swapped).
# The precomputed word_count / sentence_count columns are reused instead of
# re-tokenizing every statement.
metric_traces = [
    ('statement_len', "Средняя длина сообщения"),
    ('sentence_count', "Среднее кол-во предложений"),
    ('word_count', "Среднее кол-во слов"),
    ('pronoun_count', "Среднее кол-во местоимений"),
]

fig = go.Figure()
for metric_column, trace_name in metric_traces:
    fig.add_trace(go.Histogram(x=df.status,
                               y=df[metric_column],
                               histfunc='avg',  # bar height = mean of y within each status
                               name=trace_name,
                               text=round(df.groupby('status')[metric_column].mean())))

fig.update_layout(
    yaxis_type='log',  # logarithmic scale: the metrics differ by orders of magnitude
    title='Сравнение дигнозов по численным метрикам',
    xaxis_title='Диагноз',
    yaxis_title='Средние значения (логарифмическая шкала)',
    legend=dict(
        orientation="h",  # horizontal legend
        yanchor="bottom",  # anchor the legend by its bottom edge
        y=1.02,  # place it just above the plot area
        xanchor="center",  # anchor horizontally by the centre
        x=0.5  # centred over the plot
    ),
    template='plotly_white',
    xaxis=dict(
        categoryorder='array',  # use the explicit category order below
        categoryarray=['Anxiety', 'Bipolar', 'Depression', 'Normal', 'Personality disorder',
       'Stress', 'Suicidal'],  # category order
    )
)
fig.show()

Из графика видно, что в среднем длина текста, количество слов и предложений кратно выше при наличии психического расстройства

In [None]:
# Number of statements per status; text_auto=True asks plotly to print the value on each bar.
fig = px.histogram(df, x='status', title='Распределение диагнозов ментальных заболеваний', text_auto=True)
fig.show()

В распределении классов наблюдается сильный дисбаланс, который необходимо учесть при проектировании модели

### Облака слов

In [None]:
def plot_wordcloud(tokens, status, target_ax=None):
    """Draw a word cloud built from ``tokens`` on a subplot titled ``status``.

    Args:
        tokens: One string of space-separated words, passed to WordCloud.generate.
        status: Text used as the subplot title.
        target_ax: Matplotlib axes to draw on. When omitted, the function
            falls back to ``ax[i, j]``, reading ``ax``, ``i`` and ``j`` from
            the notebook's global namespace (the original behaviour), so
            existing two-argument calls keep working.
    """
    if target_ax is None:
        # Legacy path: depends on notebook-level state set by the calling cell.
        target_ax = ax[i, j]
    wordcloud = WordCloud(width=1600, height=1000, background_color='white').generate(tokens)
    target_ax.imshow(wordcloud, interpolation='bilinear')
    target_ax.set_title(f'{status}')

In [None]:
statuses = df['status'].unique()

# 4x2 grid: slot (0, 0) shows the whole corpus, the following slots one status each.
fig, ax = plt.subplots(4, 2)
fig.set_figheight(25)
fig.set_figwidth(18)

# plot_wordcloud() draws on ax[i, j], taking i and j from the notebook globals.
i, j = 0, 0
all_rows_joined = df['tokens'].apply(lambda words: ' '.join(words))
plot_wordcloud(' '.join(all_rows_joined.tolist()), 'All statuses')

last_slot = ax.size - 1
for position, status in enumerate(statuses, start=1):
    # Walk the grid left-to-right, top-to-bottom; once the last slot is
    # reached the position stays there.
    i, j = divmod(min(position, last_slot), ax.shape[1])

    status_rows = df.loc[df['status'] == status, 'tokens']
    tokens_data = ' '.join(status_rows.apply(lambda words: ' '.join(words)).tolist())
    plot_wordcloud(tokens_data, status)

plt.show()

### Подготовка обучающей и тестовой выборок

In [None]:
# Model inputs: two numeric features plus the lemmatized text; target: the status label.
feature_columns = ['statement_len', 'pronoun_count', 'lemmatized_tokens']
X = df[feature_columns]
y = df['status']

In [None]:
# Encode the status labels as integers.
# The labels are read from df rather than from the previous value of `y`, so the
# cell can be re-run: after the first run `y` is an ndarray and has no `.values`.
lbl_enc = LabelEncoder()
y = lbl_enc.fit_transform(df['status'])
labels = lbl_enc.classes_  # class names, indexed by their encoded integer

In [None]:
# 80/20 split. The classes are strongly imbalanced, so stratify on the labels to keep
# the class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, stratify=y)

In [None]:
# Sanity check: shapes of the train/test feature frames and label arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Turn the lemmatized text into TF-IDF features (unigrams + bigrams, vocabulary capped at 5000).
# The vectorizer is fitted on the training split only and then applied to the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train['lemmatized_tokens'])
X_test_tfidf = vectorizer.transform(X_test['lemmatized_tokens'])

# Scale the numeric features before stacking them next to the TF-IDF weights:
# raw lengths/counts would otherwise dwarf the text features.
# MaxAbsScaler divides each column by its largest absolute training value, so these
# non-negative columns stay non-negative (required by MultinomialNB).
numeric_columns = ['statement_len', 'pronoun_count']
num_scaler = MaxAbsScaler()
X_train_num = num_scaler.fit_transform(X_train[numeric_columns].values)
X_test_num = num_scaler.transform(X_test[numeric_columns].values)

# CSR output supports the row indexing done by resampling and cross-validation.
X_train_combined = hstack([X_train_tfidf, X_train_num], format='csr')
X_test_combined = hstack([X_test_tfidf, X_test_num], format='csr')

### Оверсемплинг тренировочных данных

In [None]:
# Resample the training split with RandomOverSampler (fixed seed); the test split is not touched here.
# NOTE(review): resampling happens before GridSearchCV splits the data into folds, so copies of the
# same row may land in both the fitting and the validation part of a fold — CV scores may look
# optimistic. Confirm whether resampling inside a pipeline is wanted.
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_combined, y_train)

### Поиск лучшей модели

In [None]:
# Candidate classifiers.
models = {
    'Logistic Regression': LogisticRegression(max_iter=2000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Naive Bayes': MultinomialNB(),
    # 'mlogloss' is the multi-class log-loss; plain 'logloss' is the binary metric.
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

# Hyperparameter grids, keyed by model name.
param_grid = {
    # Only solver/penalty pairs that scikit-learn can actually fit are listed:
    # lbfgs handles l2 or no penalty, liblinear handles l1 or l2. A single
    # cartesian grid would also try lbfgs+l1 and liblinear+None, which fail on
    # every fold. C is omitted for penalty=None because it has no effect there.
    'Logistic Regression': [
        {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [0.1, 1, 10, 100]},
        {'solver': ['lbfgs'], 'penalty': [None]},
        {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10, 100]},
    ],
    'Decision Tree': {
        'max_depth': [3, 5, None],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']
    },
    'Random Forest': {
        'n_estimators': [50, 100],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5, 10],
        'bootstrap': [True, False]
    },
    'Naive Bayes': {
        'alpha': [0.01, 0.1, 1]
    },
    'XGBoost': {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 5, 10],
        'gamma': [0, 0.1, 0.2],
        'min_child_weight': [1, 3, 5]
    }
}

# Grid-search every model on the resampled training data.
# best_models maps model name -> [best fitted estimator, its predictions on the test split].
best_models = {}
for model_name, model in models.items():
    print(f"Настройка гиперпараметров для модели: {model_name}")

    grid_search = GridSearchCV(model, param_grid[model_name], cv=3, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train_resampled, y_train_resampled)

    best_models[model_name] = [grid_search.best_estimator_]
    print(f"Лучшие гиперпараметры для {model_name}: {grid_search.best_params_}")

    # Evaluate the refitted best estimator on the untouched test split.
    y_pred = grid_search.predict(X_test_combined)
    print(f"Результаты классификации для {model_name}:\n{classification_report(y_test, y_pred, target_names=labels)}")
    print("-" * 80)
    best_models[model_name].append(y_pred)

In [None]:
# Test-split metrics for each tuned model (columns = models, rows = metrics).
# Each best_models value is [fitted estimator, predictions on the test split].
# All metric calls use the (y_true, y_pred) argument order.
model_results = pd.DataFrame({
    name: {
        'Accuracy': fitted_model.score(X_test_combined, y_test),
        'Precision': precision_score(y_test, test_pred, average='macro'),
        'Recall': recall_score(y_test, test_pred, average='macro'),
        'F1 score': f1_score(y_test, test_pred, average='macro'),
    }
    for name, (fitted_model, test_pred) in best_models.items()
})
sns.heatmap(model_results, annot=True)
plt.show()

### Оценка метрик полученной модели


In [None]:
# Pick the model with the highest macro-F1 on the test split.
# best_models[name] is [fitted estimator, stored test predictions]; the stored
# predictions are reused instead of calling predict() again for every model.
best_model_name = max(best_models, key=lambda name: f1_score(y_test, best_models[name][1], average='macro'))
print(f"Наилучшая модель: {best_model_name}")

# Report for the selected model.
best_model, y_pred_best = best_models[best_model_name]
print(f"Результаты наилучшей модели ({best_model_name}):\n{classification_report(y_test, y_pred_best, target_names=labels)}")
f1_score(y_test, y_pred_best, average='macro'), best_model.score(X_test_combined, y_test)

In [None]:
# Display the selected estimator (notebook repr).
best_model

In [None]:
# Score the ALREADY FITTED best model on the held-out test split.
# (cross_validate would refit fresh copies of the estimator on test-set folds,
# which measures a different model than the one selected above.)
y_proba_best = best_model.predict_proba(X_test_combined)
test_accuracy = best_model.score(X_test_combined, y_test)
test_f1_macro = f1_score(y_test, best_model.predict(X_test_combined), average='macro')
# Multi-class ROC AUC: one-vs-rest, macro-averaged over all classes, instead of
# the curve of a single class taken from probability column 1.
roc_auc = roc_auc_score(y_test, y_proba_best, multi_class='ovr', average='macro',
                        labels=best_model.classes_)

print("Метрики полученной модели")
print('Accuracy:', test_accuracy)
print('F1-score:', test_f1_macro)
print('ROC AUC:', roc_auc)

### Визуализация результатов модели

In [None]:
# Confusion matrix of the best model on the test split, drawn as an annotated heatmap
# with the class names on both axes.
conf_matrix = confusion_matrix(y_test, y_pred_best)

heatmap_axes = sns.heatmap(conf_matrix, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
heatmap_axes.set_xlabel('Предсказанные')
heatmap_axes.set_ylabel('Действительные')
heatmap_axes.set_title('Confusion Matrix')
plt.show()

In [None]:
# One-vs-rest ROC curve for every class of the best model.
y_scores = best_model.predict_proba(X_test_combined)

fig = go.Figure()
# Diagonal reference line (chance level).
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

# Column k of predict_proba belongs to best_model.classes_[k], which is an encoded
# label; `labels` (LabelEncoder.classes_) turns that code back into the status name.
# df.status.unique() must not be used for naming: it is in order of first appearance
# in the data and does not match the encoder's order.
for column_idx, encoded_class in enumerate(best_model.classes_):
    y_true = (y_test == encoded_class)
    y_score = y_scores[:, column_idx]

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{labels[encoded_class]} (AUC={auc_score:.2f})"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

fig.update_layout(
    xaxis=dict(
        title=dict(
            text='False Positive Rate'
        ),
        constrain='domain'
    ),
    yaxis=dict(
        title=dict(
            text='True Positive Rate'
        ),
        scaleanchor='x',
        scaleratio=1
    ),
    width=900, height=800
)
fig.show()