In [1]:
import pandas as pd
import numpy as np

In [2]:
# нанов в процентах немного, а данных много, поэтому позволим себе просто удалить наны на текущем этапе
reddit_df = pd.read_csv('../data/reddit_mental_health_posts_preprocessed.csv')
reddit_df = reddit_df[['title', 'body', 'subreddit']]
reddit_df = reddit_df.rename(columns={'subreddit': 'target'})
print('len of reddit_df: ', len(reddit_df))
# reddit_df = reddit_df.dropna().sample(10000)
reddit_df = reddit_df.dropna()
print('len of reddit_df after dropna: ', len(reddit_df))
reddit_df.head(5)

len of reddit_df:  151288
len of reddit_df after dropna:  148936


Unnamed: 0,title,body,target
0,get extremely anxious ’ working 247,month ago accepted full time software engineer...,ADHD
1,cant clean house feel incredibly motivated cle...,hey guy curious anyone else issue apartment fu...,ADHD
2,need help,6 exam next 2 week one monday havent studied f...,ADHD
3,anyone chat,anyone struggling addadhd ’ interesting chatti...,ADHD
4,figuring eat suck,whenever get hungry never eat dont know eat en...,ADHD


In [3]:
# видно, что датасет не супер сбалансирован
reddit_df.target.value_counts()

OCD           41812
ADHD          37058
depression    23770
ptsd          23758
aspergers     22538
Name: target, dtype: int64

In [4]:
reddit_df['text'] = 'Title: ' + reddit_df['title'] + '; Body: ' + reddit_df['body']
reddit_df = reddit_df.drop(['title', 'body'], axis=1)
reddit_df.head(5)

Unnamed: 0,target,text
0,ADHD,Title: get extremely anxious ’ working 247; Bo...
1,ADHD,Title: cant clean house feel incredibly motiva...
2,ADHD,Title: need help; Body: 6 exam next 2 week one...
3,ADHD,Title: anyone chat; Body: anyone struggling ad...
4,ADHD,Title: figuring eat suck; Body: whenever get h...


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [6]:
reddit_df.head()

Unnamed: 0,target,text
0,ADHD,Title: get extremely anxious ’ working 247; Bo...
1,ADHD,Title: cant clean house feel incredibly motiva...
2,ADHD,Title: need help; Body: 6 exam next 2 week one...
3,ADHD,Title: anyone chat; Body: anyone struggling ad...
4,ADHD,Title: figuring eat suck; Body: whenever get h...


In [8]:
tfidf = TfidfVectorizer(
    max_features=5000,
    min_df=5,
    max_df=0.9,
    stop_words='english'
)

X_tfidf = tfidf.fit_transform(reddit_df['text'])
y = reddit_df['target']

X_temp, X_test, y_temp, y_test = train_test_split(
    X_tfidf, y, test_size=0.1, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.15, random_state=42, stratify=y_temp
)

catboost_model = CatBoostClassifier(
    iterations=200,
    random_seed=42,
    verbose=5,
)

catboost_model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    use_best_model=True,
    early_stopping_rounds=10
)

print('All models trained successfully')

Learning rate set to 0.219517
0:	learn: 1.3479808	test: 1.3460822	best: 1.3460822 (0)	total: 4.04s	remaining: 13m 24s
5:	learn: 1.0538641	test: 1.0506990	best: 1.0506990 (5)	total: 20.4s	remaining: 10m 59s
10:	learn: 0.9594429	test: 0.9544174	best: 0.9544174 (10)	total: 36.3s	remaining: 10m 24s
15:	learn: 0.9059915	test: 0.9001170	best: 0.9001170 (15)	total: 52.3s	remaining: 10m 1s
20:	learn: 0.8719507	test: 0.8651832	best: 0.8651832 (20)	total: 1m 7s	remaining: 9m 38s
25:	learn: 0.8433073	test: 0.8364226	best: 0.8364226 (25)	total: 1m 23s	remaining: 9m 18s
30:	learn: 0.8219712	test: 0.8150032	best: 0.8150032 (30)	total: 1m 38s	remaining: 8m 57s
35:	learn: 0.8047900	test: 0.7978618	best: 0.7978618 (35)	total: 1m 53s	remaining: 8m 36s
40:	learn: 0.7896114	test: 0.7827873	best: 0.7827873 (40)	total: 2m 8s	remaining: 8m 18s
45:	learn: 0.7776603	test: 0.7707219	best: 0.7707219 (45)	total: 2m 23s	remaining: 8m
50:	learn: 0.7674315	test: 0.7599608	best: 0.7599608 (50)	total: 2m 38s	remaining

In [9]:
test_accuracy = catboost_model.score(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.2f}")

Test accuracy: 0.76


In [10]:
import pickle

with open('models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

catboost_model.save_model('models/catboost_model.cbm')

print('All saved successfully')

All saved successfully
