In [36]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import pandas as pd
from textblob import Word
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

In [37]:
train_df = pd.read_csv("/Users/maximbortnik/Downloads/archive/train.csv")

train_df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [38]:
train_df.groupby("label").count()

Unnamed: 0_level_0,id,tweet
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,29720,29720
1,2242,2242


In [39]:
def preprocess_text_with_lemmatization(text):
    # Приведение к нижнему регистру
    text = text.lower()
    # Удаление упоминаний @user
    text = re.sub(r'@\w+', ' ', text)
    # Удаление ссылок
    text = re.sub(r'http\S+|www.\S+', ' ', text)
    # Удаление хэштегов и других спецсимволов
    text = re.sub(r'#\w+', ' ', text)
    # Удаление пунктуации и цифр
    text = re.sub(r'[^a-z\s]', '', text)
    # Удаление лишних пробелов
    text = re.sub(r'\s+', ' ', text).strip()
    # Удаление стоп-слов
    sw = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in sw)
    # Лемматизация
    text = " ".join(Word(word).lemmatize() for word in text.split())
    return text

# Применение предобработки с лемматизацией к данным
train_df['clean_tweet'] = train_df['tweet'].apply(preprocess_text_with_lemmatization)

# Пример предобработанных данных
train_df[['tweet', 'clean_tweet']].head()


Unnamed: 0,tweet,clean_tweet
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drag kid dysfunction
1,@user @user thanks for #lyft credit i can't us...,thanks credit cant use cause dont offer wheelc...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,love u take u time ur
4,factsguide: society now #motivation,factsguide society


In [40]:
# Разделение данных на X (признаки) и y (метки)
X = train_df['clean_tweet']
y = train_df['label']

# Разделение данных на тренировочную и валидационную выборки
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [41]:
# TF-IDF векторизация
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

# CountVectorizer
count_vec = CountVectorizer(ngram_range=(1, 2), max_features=10000)
X_train_count = count_vec.fit_transform(X_train)
X_val_count = count_vec.transform(X_val)


In [42]:
# Обучение на TF-IDF
nb_model_tfidf = MultinomialNB()
nb_model_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = nb_model_tfidf.predict(X_val_tfidf)

print("Naive Bayes (TF-IDF):")
print(classification_report(y_val, y_pred_tfidf))

# Обучение на CountVectorizer
nb_model_count = MultinomialNB()
nb_model_count.fit(X_train_count, y_train)
y_pred_count = nb_model_count.predict(X_val_count)

print("Naive Bayes (CountVectorizer):")
print(classification_report(y_val, y_pred_count))


Naive Bayes (TF-IDF):
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      5945
           1       0.99      0.20      0.34       448

    accuracy                           0.94      6393
   macro avg       0.97      0.60      0.65      6393
weighted avg       0.95      0.94      0.93      6393

Naive Bayes (CountVectorizer):
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      5945
           1       0.65      0.46      0.53       448

    accuracy                           0.94      6393
   macro avg       0.80      0.72      0.75      6393
weighted avg       0.94      0.94      0.94      6393



Naive Bayes (TF-IDF) : Модель плохо справляется с классом 1, что видно по низкому Recall (0.20).

Naive Bayes (CountVectorizer) : Recall и F1-score для класса 1 выше, чем с TF-IDF.(0.46)(0.53)

In [43]:
# Logistic Regression с TF-IDF
log_model_tfidf = LogisticRegression(max_iter=1000)
log_model_tfidf.fit(X_train_tfidf, y_train)
y_pred_log_tfidf = log_model_tfidf.predict(X_val_tfidf)

print("Logistic Regression (TF-IDF):")
print(classification_report(y_val, y_pred_log_tfidf))

# Logistic Regression с CountVectorizer
log_model_count = LogisticRegression(max_iter=1000)
log_model_count.fit(X_train_count, y_train)
y_pred_log_count = log_model_count.predict(X_val_count)

print("Logistic Regression (CountVectorizer):")
print(classification_report(y_val, y_pred_log_count))


Logistic Regression (TF-IDF):
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5945
           1       0.92      0.24      0.38       448

    accuracy                           0.95      6393
   macro avg       0.93      0.62      0.68      6393
weighted avg       0.94      0.95      0.93      6393

Logistic Regression (CountVectorizer):
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      5945
           1       0.79      0.38      0.51       448

    accuracy                           0.95      6393
   macro avg       0.87      0.68      0.74      6393
weighted avg       0.94      0.95      0.94      6393



Logistic Regression (TF-IDF): Лог Регрессия улучшает Precision для класса 1 (0.92), но Recall остаётся низким(0.24).

С CountVectorizer Logistic Regression показывает лучший Recall для класса 1, чем с TF-IDF(0.38).

In [44]:
# CatBoost с TF-IDF
cat_model_tfidf = CatBoostClassifier(iterations=1000, learning_rate=0.1, verbose=200)
cat_model_tfidf.fit(X_train_tfidf, y_train)
y_pred_cat_tfidf = cat_model_tfidf.predict(X_val_tfidf)

print("CatBoost (TF-IDF):")
print(classification_report(y_val, y_pred_cat_tfidf))


0:	learn: 0.5871100	total: 40.3ms	remaining: 40.2s
200:	learn: 0.1616826	total: 4.51s	remaining: 17.9s
400:	learn: 0.1363232	total: 9.01s	remaining: 13.5s
600:	learn: 0.1237464	total: 13.2s	remaining: 8.78s
800:	learn: 0.1128207	total: 17.2s	remaining: 4.27s
999:	learn: 0.1023399	total: 21.2s	remaining: 0us
CatBoost (TF-IDF):
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      5945
           1       0.81      0.36      0.50       448

    accuracy                           0.95      6393
   macro avg       0.88      0.68      0.74      6393
weighted avg       0.94      0.95      0.94      6393



CatBoost справляется с классом 1 лучше, чем Logistic Regression (TF-IDF).

In [45]:
# CatBoost с CountVectorizer
cat_model_count = CatBoostClassifier(iterations=1000, learning_rate=0.1, verbose=200)
cat_model_count.fit(X_train_count, y_train)
y_pred_cat_count = cat_model_count.predict(X_val_count)

print("CatBoost (CountVectorizer):")
print(classification_report(y_val, y_pred_cat_count))

0:	learn: 0.5900333	total: 11.7ms	remaining: 11.7s
200:	learn: 0.1659951	total: 2.43s	remaining: 9.65s
400:	learn: 0.1456997	total: 4.74s	remaining: 7.08s
600:	learn: 0.1308068	total: 7.07s	remaining: 4.7s
800:	learn: 0.1201064	total: 9.4s	remaining: 2.33s
999:	learn: 0.1115098	total: 11.7s	remaining: 0us
CatBoost (CountVectorizer):
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5945
           1       0.85      0.35      0.50       448

    accuracy                           0.95      6393
   macro avg       0.90      0.68      0.74      6393
weighted avg       0.95      0.95      0.94      6393



Результаты с CountVectorizer почти аналогичны TF-IDF.

***Применем SMOTE***

In [46]:
# Применение SMOTE к данным
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)

# Проверка баланса
print("Баланс после SMOTE:", y_train_balanced.value_counts())


Баланс после SMOTE: label
0    23775
1    23775
Name: count, dtype: int64


In [47]:
from catboost import CatBoostClassifier

# CatBoost с балансировкой
cat_model_balanced = CatBoostClassifier(
    iterations=2000,  
    learning_rate=0.05,  
    depth=6,  
    l2_leaf_reg=3,  
    class_weights=[1, 5], 
    verbose=200
)
cat_model_balanced.fit(X_train_balanced, y_train_balanced)

y_pred_cat_balanced = cat_model_balanced.predict(X_val_tfidf)

print("CatBoost (с балансировкой):")
print(classification_report(y_val, y_pred_cat_balanced))


0:	learn: 0.6638931	total: 37.7ms	remaining: 1m 15s
200:	learn: 0.2977111	total: 6.36s	remaining: 56.9s
400:	learn: 0.2491959	total: 12.6s	remaining: 50.4s
600:	learn: 0.2211494	total: 19s	remaining: 44.2s
800:	learn: 0.2025838	total: 25.6s	remaining: 38.3s
1000:	learn: 0.1885995	total: 32.1s	remaining: 32s
1200:	learn: 0.1783017	total: 38.3s	remaining: 25.5s
1400:	learn: 0.1699239	total: 44.6s	remaining: 19.1s
1600:	learn: 0.1627221	total: 50.8s	remaining: 12.7s
1800:	learn: 0.1565859	total: 57s	remaining: 6.3s
1999:	learn: 0.1509805	total: 1m 3s	remaining: 0us
CatBoost (с балансировкой):
              precision    recall  f1-score   support

           0       0.98      0.71      0.82      5945
           1       0.17      0.79      0.28       448

    accuracy                           0.71      6393
   macro avg       0.57      0.75      0.55      6393
weighted avg       0.92      0.71      0.78      6393



Балансировка значительно увеличивает Recall для класса 1, но снижает Precision для класса 0, что приводит к снижению Accuracy.

In [48]:
# Разделение данных на обучающую и тестовую выборки
X = train_df['clean_tweet'] 
y = train_df['label']        
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF 
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

# TruncatedSVD для снижения размерности
svd = TruncatedSVD(n_components=300, random_state=42)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_val_svd = svd.transform(X_val_tfidf)

log_model = LogisticRegression(max_iter=1000, solver='saga', random_state=42)
log_model.fit(X_train_svd, y_train)

y_pred = log_model.predict(X_val_svd)

print("Logistic Regression (TF-IDF + TruncatedSVD):")
print(classification_report(y_val, y_pred))


Logistic Regression (TF-IDF + TruncatedSVD):
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      5945
           1       0.82      0.18      0.30       448

    accuracy                           0.94      6393
   macro avg       0.88      0.59      0.63      6393
weighted avg       0.93      0.94      0.92      6393



TruncatedSVD уменьшает размерность данных, но результаты для класса 1 становятся хуже.

**Выводы**:
1) *Naive Bayes*:
Хорошо справляется с классом 0, но имеет низкий Recall для класса 1.
2) *Logistic Regression*:
Обеспечивает более высокий F1-Score для класса 1, чем Naive Bayes, особенно с CountVectorizer.
3) *CatBoost*:
Показал лучшие результаты для F1-Score класса 1 (0.50) без балансировки.
Балансировка классов значительно увеличила Recall для класса 1 (до 0.79), но снизила точность предсказаний для класса 0.