In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.datasets import fetch_20newsgroups

# Исходные данные из 2 лаб.
categories = ['alt.atheism', 'sci.space', 'soc.religion.christian']
remove = ('headers', 'footers', 'quotes')
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=24, categories=categories, remove=remove)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=24, categories=categories, remove=remove)
print(twenty_train.data[3])
print(twenty_test.data[3])

If gamma ray bursters are extragalactic, would absorption from the
galaxy be expected?  How transparent is the galactic core to gamma
rays?

How much energy does a burster put out?  I know energy depends on
distance, which is unknown.  An answer of the form _X_ ergs per
megaparsec^2 is OK.




Yes a flotation tank, combined with floride breathing water(REF: the Abyss
breathing solution I think).. also the right position of the astronaut and
strapping you can probably get much more than 45gs in an accesloration..
More like near 100g (or somewhat less)..

Saw I book called the "Time Master" (I thjink that was the title) that had some
ideas on how fast and all you could go..


In [17]:
# Применить стемминг
def stemn(data):
    porter_stemmer = PorterStemmer()
    stem = []
    for text in data:
        nltk_tokens = word_tokenize(text)
        line = ''.join([' ' + porter_stemmer.stem(word) for word in nltk_tokens])
        stem.append(line)
    return stem

porter_stemmer = PorterStemmer()
stem_train = []
for text in twenty_train.data:
    nltk_tokens = word_tokenize(text)
    line = ''
    for word in nltk_tokens:
        line += ' ' + porter_stemmer.stem(word)
    stem_train.append(line)
print(stem_train[0])

stem_test = stemn(twenty_test.data)

X_train, X_test, y_train, y_test = stem_train, stem_test, twenty_train.target, twenty_test.target

 ann jackson ( ajackson @ cs.ubc.ca ) wrote on 5 may : i would like to submit the follow which help me enorm . if it ha alreadi been post , i apolog . it seem that dure the middl age , it wa customari for pastor to explain the triniti to their parishon by analog to water . water is water , but can exist in three form -- liquid , ice and vapor . thu it is possibl for one essenc to exist in three form . and recent , the pastor of my church drew an analog , which i also found use -- a woman is often perciev by other in three way , depend on their relationship to her -- a mother , a wife and an employe in a busi . thu , it seem clear to me that the essenc of god can subsist in the father , son , and holi spirit or , depend on one 's particular need for him .


In [18]:
# DT
dt_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', DecisionTreeClassifier()) 
])

# Параметры для DT
dt_param_grid = {'vect__ngram_range': [(1, 1), (1, 2)],
                 'tfidf__use_idf': (True, False),
                 'clf__criterion': ['gini', 'entropy'],
                 'clf__max_depth': list(range(1, 6)) + list(range(20, 101, 20))}


In [19]:
# KNN
knn_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier())
])

# Параметры для KNN
knn_param_grid = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf__n_neighbors': [3, 5, 7],
                  'clf__weights': ['uniform', 'distance'],
                  'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

In [20]:
# LR
lr_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', LogisticRegression(max_iter=1000))
])

# Параметры для LR
lr_param_grid = {'vect__ngram_range': [(1, 1), (1, 2)],
                 'tfidf__use_idf': (True, False),
                 'clf__C': [0.1, 1, 10],
                 'clf__solver': ['newton-cg', 'lbfgs', 'sag', 'liblinear']}

In [21]:
# Список параметров
pipelines = [dt_pipeline, knn_pipeline, lr_pipeline]
param_grids = [dt_param_grid, knn_param_grid, lr_param_grid]

# Цикл 
for i, pipeline in enumerate(pipelines):
    print(f"Оптимизация параметров для {pipeline.named_steps['clf'].__class__.__name__}")

    # Данные со стеммингом
    grid_search_stem = GridSearchCV(pipeline, param_grids[i], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search_stem.fit(X_train, y_train)

    print("Лучшие параметры с использованием стемминга:", grid_search_stem.best_params_)
    y_pred_stem = grid_search_stem.predict(X_test)
    print("Точность с использованием стемминга:", accuracy_score(y_test, y_pred_stem))
    print("Точность с использованием стемминга:", precision_score(y_test, y_pred_stem, average='weighted'))
    print("Полнота с использованием стемминга:", recall_score(y_test, y_pred_stem, average='weighted'))
    print("F1-мера с использованием стемминга:", f1_score(y_test, y_pred_stem, average='weighted'))

    # Данные без стемминга
    grid_search_no_stem = GridSearchCV(pipeline, param_grids[i], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search_no_stem.fit(twenty_train.data, twenty_train.target)

    print("Лучшие параметры без стемминга:", grid_search_no_stem.best_params_)
    y_pred_no_stem = grid_search_no_stem.predict(twenty_test.data)
    print("Точность без стемминга:", accuracy_score(twenty_test.target, y_pred_no_stem))
    print("Точность без стемминга:", precision_score(twenty_test.target, y_pred_no_stem, average='weighted'))
    print("Полнота без стемминга:", recall_score(twenty_test.target, y_pred_no_stem, average='weighted'))
    print("F1-мера без стемминга:", f1_score(twenty_test.target, y_pred_no_stem, average='weighted'))

    print("\n")

Оптимизация параметров для DecisionTreeClassifier
Лучшие параметры с использованием стемминга: {'clf__criterion': 'entropy', 'clf__max_depth': 20, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Точность с использованием стемминга: 0.6300630063006301
Точность с использованием стемминга: 0.6322007535660351
Полнота с использованием стемминга: 0.6300630063006301
F1-мера с использованием стемминга: 0.6281133499766839
Лучшие параметры без стемминга: {'clf__criterion': 'gini', 'clf__max_depth': 60, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Точность без стемминга: 0.6336633663366337
Точность без стемминга: 0.6302850448994715
Полнота без стемминга: 0.6336633663366337
F1-мера без стемминга: 0.6312351029550983


Оптимизация параметров для KNeighborsClassifier
Лучшие параметры с использованием стемминга: {'clf__algorithm': 'auto', 'clf__n_neighbors': 7, 'clf__weights': 'distance', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Точность с использованием стемминга: 0.541854