In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the labelled (train) and unlabelled (test) splits from the working directory.
train_path, test_path = 'train.csv', 'test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Preview the first five training rows (five is also DataFrame.head's default).
train_df.head(n=5)

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,id
0,Aakash,Male,47,Agra,Working Professional,Teacher,,1.0,,,5.0,Less than 5 hours,Unhealthy,B.Arch,Yes,11,5,Yes,0,1
1,Asha,Female,25,Chennai,Student,,3.0,,6.59,1.0,,7-8 hours,Healthy,BSc,No,9,3,No,0,2
2,Samar,Male,56,Indore,Working Professional,Data Scientist,,3.0,,,2.0,7-8 hours,Moderate,B.Tech,No,2,4,Yes,0,3
3,Chhavi,Female,24,Kalyan,Student,,2.0,,5.77,2.0,,5-6 hours,Moderate,MBBS,Yes,5,3,No,1,4
4,Anand,Male,55,Kanpur,Working Professional,Researcher,,1.0,,,2.0,7-8 hours,Unhealthy,BSc,Yes,4,4,No,0,5


In [4]:
# Preview the first five test rows (five is also DataFrame.head's default).
test_df.head(n=5)

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,id
0,Raghav,Male,59,Kolkata,Working Professional,Content Writer,,1.0,,,1.0,Less than 5 hours,Healthy,ME,No,10,1,No,1
1,Shlok,Male,45,Indore,Working Professional,Plumber,,5.0,,,1.0,7-8 hours,Moderate,PhD,Yes,0,1,Yes,2
2,Saurav,Male,42,Nagpur,Working Professional,Digital Marketer,,3.0,,,4.0,5-6 hours,Healthy,M.Com,No,12,2,Yes,3
3,Dhruv,Male,34,Lucknow,Working Professional,Marketing Manager,,4.0,,,2.0,7-8 hours,Unhealthy,B.Com,No,12,5,No,4
4,Shreya,Female,36,Kolkata,Working Professional,Plumber,,4.0,,,2.0,7-8 hours,Moderate,BE,Yes,7,3,Yes,5


Проверка распределения классов

In [5]:
# Show the share of each 'Depression' class in the training data.
print("\nРаспределение классов в train.csv:")
class_shares = train_df['Depression'].value_counts(normalize=True)
print(class_shares)


Распределение классов в train.csv:
Depression
0    0.821787
1    0.178213
Name: proportion, dtype: float64


Из этого видно, что один класс имеет гораздо меньше примеров, чем другой, что создаёт дисбаланс в распределении данных. А именно, в данном случае значений "1" мало (около 18%), из-за чего модель может научиться предсказывать преимущественно "0". Чтобы решить данную проблему, используем метод SMOTE для генерации синтетических данных редкого класса.

Проверка пропусков

In [6]:
# Count missing values per column, first for the train split, then for the test split.
train_missing = train_df.isna().sum()
print("\nПропуски в train.csv:\n", train_missing)
test_missing = test_df.isna().sum()
print("\nПропуски в test.csv:\n", test_missing)


Пропуски в train.csv:
 Name                                        0
Gender                                      0
Age                                         0
City                                        0
Working Professional or Student             0
Profession                                511
Academic Pressure                        1509
Work Pressure                             382
CGPA                                     1509
Study Satisfaction                       1509
Job Satisfaction                          382
Sleep Duration                              0
Dietary Habits                              0
Degree                                      0
Have you ever had suicidal thoughts ?       0
Work/Study Hours                            0
Financial Stress                            0
Family History of Mental Illness            0
Depression                                  0
id                                          0
dtype: int64

Пропуски в test.csv:
 Name                

**Заполнение пропусков**

Заполнение константой <br> F1: 0.9725734398940263

Заполнение медианой <br> F1: 0.9672636579328682

Заполнение модой <br> F1: 0.9684617251589728

In [7]:
# Определение признаков
# Model input columns: the raw survey fields plus the engineered
# 'Stress_Interaction' column that preprocess_data() adds to every frame.
# Listing the engineered column here keeps the feature set explicit, so it does
# not depend on preprocess_data() appending to this list as a side effect.
features = [
    'Gender',
    'Age',
    'City',
    'Working Professional or Student',
    'Profession',
    'Academic Pressure',
    'Work Pressure',
    'CGPA',
    'Study Satisfaction',
    'Job Satisfaction',
    'Sleep Duration',
    'Dietary Habits',
    'Degree',
    'Have you ever had suicidal thoughts ?',
    'Work/Study Hours',
    'Financial Stress',
    'Family History of Mental Illness',
    'Stress_Interaction',
]

Предобработка данных

In [8]:
def preprocess_data(df):
    """Return a cleaned copy of ``df`` with missing values filled and features engineered.

    Steps, all applied to a copy so the caller's frame is left untouched:

    1. Numeric columns are coerced with ``pd.to_numeric`` (unparseable values
       become NaN) and missing values are filled with the constant 0.
    2. Categorical columns have missing values filled with ``'Unknown'``.
    3. ``Stress_Interaction`` is added as ``Financial Stress * Academic Pressure``.
    4. ``Work/Study Hours`` is floored at 0 and replaced by ``log1p`` of itself.

    Columns from the numeric/categorical lists that are absent from ``df`` are
    skipped. ``Financial Stress`` and ``Academic Pressure`` are required.

    Side effect: ``'Stress_Interaction'`` is appended to the module-level
    ``features`` list if it is not already present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame (train or test split).

    Returns
    -------
    pandas.DataFrame
        New frame with the transformations above applied.

    Raises
    ------
    KeyError
        If ``Financial Stress`` or ``Academic Pressure`` is missing from ``df``.
    """
    df = df.copy()

    numerical_cols = ['Age', 'Work/Study Hours', 'Financial Stress', 'Academic Pressure',
                      'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction']
    categorical_cols = ['Gender', 'Working Professional or Student', 'Sleep Duration',
                        'Dietary Habits', 'Have you ever had suicidal thoughts ?',
                        'Family History of Mental Illness', 'City', 'Profession', 'Degree']

    # Fill numeric features with the constant 0 (after coercing to numeric).
    for col in numerical_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Fill categorical features with the constant 'Unknown'.
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown')

    # Feature engineering: since missing values were filled with 0 above, the
    # product is 0 wherever either factor was missing.
    df['Stress_Interaction'] = df['Financial Stress'] * df['Academic Pressure']
    if 'Stress_Interaction' not in features:
        features.append('Stress_Interaction')

    # Log transform to damp large values. Hours are floored at 0 first:
    # log1p yields -inf at -1 and NaN below -1, which would put non-finite
    # values back into a column that has just been cleaned.
    if 'Work/Study Hours' in df.columns:
        df['Work/Study Hours'] = np.log1p(df['Work/Study Hours'].clip(lower=0))

    return df

Признаки делим на числовые, категориальные с малым количеством уникальных категорий и категориальные с большим количеством уникальных категорий. <br> Пропуски заполняем константой (0 для числовых признаков и 'Unknown' для категориальных), основываясь на прошлых наблюдениях: этот вариант дал наибольшее значение F1. <br> Из-за того, что у признака Work/Study Hours может быть слишком большой разброс значений, необходимо снизить влияние выбросов с помощью логарифмирования.

In [9]:
# Apply the same cleaning and feature-engineering step to both splits.
train_df_processed = preprocess_data(df=train_df)
test_df_processed = preprocess_data(df=test_df)

In [10]:
# Определение признаков и целевой переменной
X = train_df_processed[features]
y = train_df_processed['Depression']
X_test = test_df_processed[features]

In [11]:
# Разделение данных
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
# Создание препроцессора
numerical_cols = ['Age', 'Work/Study Hours', 'Financial Stress', 'Academic Pressure',
                  'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Stress_Interaction']
low_cardinality_cols = ['Gender', 'Working Professional or Student', 'Sleep Duration',
                        'Dietary Habits', 'Have you ever had suicidal thoughts ?',
                        'Family History of Mental Illness']
high_cardinality_cols = ['City', 'Profession', 'Degree']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('low_card', OneHotEncoder(drop='first', handle_unknown='ignore'), low_cardinality_cols),
        ('high_card', TargetEncoder(), high_cardinality_cols)
    ])

При обработке категориальных признаков методом OneHotEncoder точность на валидационной выборке равна 0.9656992084432717 <br> При использовании TargetEncoder на признаках с большим числом категорий точность на валидационной выборке выросла на 0.82%

**RandomForest**

F1: 0.9126591760299626

**LogisticRegression**

F1: 0.9725734398940263

**DecisionTrees**

F1: 0.832734879989464

In [13]:
# Preprocessing followed by a class-weighted logistic regression.
logreg = LogisticRegression(class_weight='balanced', random_state=42, max_iter=2000)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', logreg),
])

Находим оптимальные параметры модели

In [14]:
# Hyperparameter grid for the pipeline's 'classifier' step
# (5 values of C x 2 penalties x 1 solver = 10 candidates).
param_grid = dict(
    classifier__C=[0.01, 0.1, 1, 10, 100],
    classifier__penalty=['l1', 'l2'],
    classifier__solver=['liblinear'],
)

Кросс-валидация

In [15]:
# 5-fold stratified, shuffled cross-validation with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Search the grid on the training part only, scoring each candidate by F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [16]:
# Report the winning hyperparameters and their mean cross-validated F1.
cv_best_params = grid_search.best_params_
print("\nЛучшие параметры:", cv_best_params)
cv_best_f1 = grid_search.best_score_
print("Лучшая F1-оценка на кросс-валидации:", cv_best_f1)


Лучшие параметры: {'classifier__C': 10, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Лучшая F1-оценка на кросс-валидации: 0.9725734398940263


**Балансировка SMOTE**

In [17]:
# Oversample the training part with SMOTE. The features are first transformed
# with the preprocessing step of the best pipeline found by the grid search.
smote = SMOTE(random_state=42)
fitted_preprocessor = grid_search.best_estimator_.named_steps['preprocessor']
X_train_preprocessed = fitted_preprocessor.transform(X_train)
X_train_balanced, y_train_balanced = smote.fit_resample(
    X_train_preprocessed,
    y_train,
)

In [18]:
# Refit a logistic regression with the tuned hyperparameters on the
# SMOTE-balanced training data.
tuned_params = grid_search.best_params_
best_model = LogisticRegression(
    C=tuned_params['classifier__C'],
    penalty=tuned_params['classifier__penalty'],
    solver=tuned_params['classifier__solver'],
    class_weight='balanced',
    random_state=42,
    max_iter=2000,
)
best_model.fit(X_train_balanced, y_train_balanced)


In [19]:
# Evaluate on the hold-out part: transform with the fitted preprocessor,
# predict, then print accuracy and the per-class report.
X_val_preprocessed = (
    grid_search.best_estimator_
    .named_steps['preprocessor']
    .transform(X_val)
)
y_val_pred = best_model.predict(X_val_preprocessed)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("\nТочность на валидационной выборке:", val_accuracy)
val_report = classification_report(y_val, y_val_pred)
print("\nОтчет по классификации:\n", val_report)


Точность на валидационной выборке: 0.9736147757255936

Отчет по классификации:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       311
           1       0.91      0.94      0.93        68

    accuracy                           0.97       379
   macro avg       0.95      0.96      0.96       379
weighted avg       0.97      0.97      0.97       379



In [20]:
# Transform the unlabelled test features with the fitted preprocessor and predict.
X_test_preprocessed = (
    grid_search.best_estimator_
    .named_steps['preprocessor']
    .transform(X_test)
)
test_predictions = best_model.predict(X_test_preprocessed)

Формирование файла

In [21]:
# Pair each test id with its predicted label and write the submission file
# without the DataFrame index.
submission_columns = {'id': test_df['id'], 'Depression': test_predictions}
submission = pd.DataFrame(data=submission_columns)
submission.to_csv('submission.csv', index=False)