In [7]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
import nltk

# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('stopwords')
# nltk.download('wordnet')

categories = ['comp.sys.mac.hardware', 'sci.med', 'comp.graphics']
data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({'text': data['data'], 'category': data['target']})
        
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Удаление чисел и символов
    text = re.sub(r'\d+', '', text)  # Удаляем цифры
    text = text.translate(str.maketrans('', '', string.punctuation))  # Удаляем знаки препинания
    text = text.lower()  # Приведение к нижнему регистру
    tokens = word_tokenize(text)  # Токенизация
    tokens = [word for word in tokens if word not in stop_words]  # Удаляем стоп-слова
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Лемматизация
    return ' '.join(tokens)

df['processed_text'] = df['text'].apply(preprocess_text)

tfidf = TfidfVectorizer(max_features=5000)  # Ограничиваем количество признаков
X_tfidf = tfidf.fit_transform(df['processed_text'])

#X_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
#print(X_tfidf)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category'])

# Добавляем признаки TF-IDF
X = X_tfidf

# Разделяем на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Обучение моделей
# Логистическая регрессия
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
logistic_preds = logistic_model.predict(X_test)

# Рандомный лес
forest_model = RandomForestClassifier(n_estimators=100)
forest_model.fit(X_train, y_train)
forest_preds = forest_model.predict(X_test)

# Нейронная сеть
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300)
mlp_model.fit(X_train, y_train)
mlp_preds = mlp_model.predict(X_test)

# 6. Оценка качества
print("Logistic Regression:")
print(classification_report(y_test, logistic_preds, target_names=data['target_names']))
print("\nRandom Forest:")
print(classification_report(y_test, forest_preds, target_names=data['target_names']))
print("\nNeural Network:")
print(classification_report(y_test, mlp_preds, target_names=data['target_names']))

# Accuracy для сравнения
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, logistic_preds):.4f}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, forest_preds):.4f}")
print(f"Neural Network Accuracy: {accuracy_score(y_test, mlp_preds):.4f}")



Logistic Regression:
                       precision    recall  f1-score   support

        comp.graphics       0.91      0.85      0.88       193
comp.sys.mac.hardware       0.91      0.86      0.88       202
              sci.med       0.86      0.96      0.91       191

             accuracy                           0.89       586
            macro avg       0.89      0.89      0.89       586
         weighted avg       0.89      0.89      0.89       586


Random Forest:
                       precision    recall  f1-score   support

        comp.graphics       0.85      0.81      0.83       193
comp.sys.mac.hardware       0.80      0.89      0.85       202
              sci.med       0.92      0.86      0.89       191

             accuracy                           0.85       586
            macro avg       0.86      0.85      0.86       586
         weighted avg       0.86      0.85      0.86       586


Neural Network:
                       precision    recall  f1-score   sup