In [1]:
from sklearn.datasets import fetch_20newsgroups

# Загрузка данных. Статьи и разметка (20 классов).

In [2]:
# Load the 'train' and 'test' subsets of the 20 Newsgroups corpus as two
# separate objects; both are used throughout the rest of the notebook.
data_train = fetch_20newsgroups(subset='train')
data_test = fetch_20newsgroups(subset='test')

In [3]:
# Inspect which attributes the loaded training object exposes.
dir(data_train)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [4]:
# Same inspection for the test object, to confirm it has the same attributes.
dir(data_test)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [5]:
# Number of documents in the train and test splits.
len(data_train.data), len(data_test.data)

(11314, 7532)

In [6]:
# Print the class names of both splits side by side, prefixed with the label
# index, so it is visible that each index names the same newsgroup in train
# and test.
label_pairs = zip(data_train.target_names, data_test.target_names)
for idx, (train_name, test_name) in enumerate(label_pairs):
  print(idx, train_name, "-", test_name)

0 alt.atheism - alt.atheism
1 comp.graphics - comp.graphics
2 comp.os.ms-windows.misc - comp.os.ms-windows.misc
3 comp.sys.ibm.pc.hardware - comp.sys.ibm.pc.hardware
4 comp.sys.mac.hardware - comp.sys.mac.hardware
5 comp.windows.x - comp.windows.x
6 misc.forsale - misc.forsale
7 rec.autos - rec.autos
8 rec.motorcycles - rec.motorcycles
9 rec.sport.baseball - rec.sport.baseball
10 rec.sport.hockey - rec.sport.hockey
11 sci.crypt - sci.crypt
12 sci.electronics - sci.electronics
13 sci.med - sci.med
14 sci.space - sci.space
15 soc.religion.christian - soc.religion.christian
16 talk.politics.guns - talk.politics.guns
17 talk.politics.mideast - talk.politics.mideast
18 talk.politics.misc - talk.politics.misc
19 talk.religion.misc - talk.religion.misc


In [7]:
# Show the raw text of the first training document (headers included).
print(data_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







# Использование логистической регрессии

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

In [9]:
vectorizer = TfidfVectorizer() # other vectorizers can be tried here, and their parameters are worth experimenting with

In [10]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts into the same feature space.
data_train_vec = vectorizer.fit_transform(data_train.data)
data_test_vec = vectorizer.transform(data_test.data)

In [11]:
# Check the container type and the (documents, features) shape of the result.
type(data_train_vec), data_train_vec.shape

(scipy.sparse.csr.csr_matrix, (11314, 130107))

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
# Baseline classifier. The iteration cap is set very high (100,000) so that
# the solver is not cut short on this high-dimensional input.
model = LogisticRegression(max_iter=100_000)

In [14]:
# Train the baseline on the vectorized training texts and their labels.
model.fit(data_train_vec, data_train.target)

LogisticRegression(max_iter=100000)

In [15]:
# Score on the data the model was trained on (reference point for the test score).
model.score(data_train_vec, data_train.target)

0.9761357610040657

In [16]:
# Score on the held-out test split.
model.score(data_test_vec, data_test.target)

0.8274030801911842

# Использование нейронной сети

Формируем словарь всех токенов - частей текстов, разделенных пробелами.
Будет много "шума".
Можно попробовать провести очистку для улучшения качества обучения классификатора.

In [17]:
# Collect every distinct whitespace-separated token from both splits.
# The test split is included as well because the test texts are later
# encoded by direct lookup in the dictionary built from this vocabulary.
vocab = set()
for corpus in (data_train.data, data_test.data):
  for document in corpus:
    vocab.update(document.split())

In [18]:
# Number of distinct tokens collected.
len(vocab)

386410

Формируем python словарь: "слово"-"индекс слова"

In [19]:
# Map every token to a unique integer index in the range 0..len(vocab)-1.
#
# The vocabulary is sorted before numbering: iterating a set of strings
# directly yields a different order in every interpreter process (string
# hashes are randomized), which would give each token a different index after
# every kernel restart and make the encoded data and any trained embedding
# non-reproducible.
#
# NOTE(review): indices start at 0, so one real token shares the value 0 with
# whatever value is used to pad sequences later on — confirm this is
# acceptable or reserve index 0 for padding (which also requires enlarging
# the embedding's input dimension by one).
word_dict = {word: i for i, word in enumerate(sorted(vocab))}

Векторизуем тексты, подставляем вместо слов их индексы из словаря word_dict

In [20]:
# Encode the training texts: each document becomes the list of word_dict
# indices of its whitespace-separated tokens, in their original order.
vectorized_train_texts = [
  [word_dict[token] for token in document.split()]
  for document in data_train.data
]

In [21]:
# Sanity check: the encoded first document must contain exactly as many
# indices as the document has whitespace-separated tokens.
len(vectorized_train_texts[0]), len(data_train.data[0].split())

(123, 123)

Создаем нейронную сеть, готовим данные для обучения, обучаем

In [22]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [23]:
# Bring every training sequence to a fixed length of 1000 indices so that all
# documents can be stacked into a single 2-D array for the network.
# NOTE(review): pad_sequences presumably fills short sequences with 0, which
# is also a valid word index in word_dict — confirm against the Keras docs.
vectorized_train_texts_pad = pad_sequences(vectorized_train_texts, maxlen=1000)

In [24]:
# Shape of the padded training array: (documents, sequence length).
vectorized_train_texts_pad.shape

(11314, 1000)

In [25]:
# Expand the integer class labels into one column per class, the target
# format used with the categorical cross-entropy loss during training.
train_target_ohe = to_categorical(data_train.target)

In [26]:
# Compare the label shapes before and after the encoding.
data_train.target.shape, train_target_ohe.shape

((11314,), (11314, 20))

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding

In [28]:
# Network: token embedding (5 dimensions per token, sequences of 1000 tokens)
# -> simple recurrent layer with 100 units -> softmax over the 20 classes.
# Note that this rebinds `model`, which previously held the logistic
# regression baseline.
model = Sequential([
  Embedding(len(vocab), 5, input_length=1000),
  SimpleRNN(100),
  Dense(20, activation='softmax'),
])

In [29]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss (the
# targets are one column per class), and accuracy as the reported metric.
model.compile(optimizer='RMSprop', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [30]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 5)           1932050   
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               10600     
                                                                 
 dense (Dense)               (None, 20)                2020      
                                                                 
Total params: 1,944,670
Trainable params: 1,944,670
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Re-check the input shape right before training.
vectorized_train_texts_pad.shape

(11314, 1000)

In [32]:
# Train the network for a single epoch on the padded training sequences.
model.fit(vectorized_train_texts_pad, train_target_ohe, epochs=1)



<keras.callbacks.History at 0x7fdc312d8250>

Подготовим данные для тестирования модели

In [33]:
# Encode the test texts with the same word_dict that was used for the
# training texts: one list of token indices per document.
vectorized_test_texts = [
  [word_dict[token] for token in document.split()]
  for document in data_test.data
]

# Same fixed sequence length (1000) as for the training input.
vectorized_test_texts_pad = pad_sequences(vectorized_test_texts, maxlen=1000)

# Encode the test labels the same way as the training labels.
test_target_ohe = to_categorical(data_test.target)

In [35]:
# Evaluate on the test split; the result lists the loss followed by the
# accuracy metric that was configured in compile().
model.evaluate(vectorized_test_texts_pad, test_target_ohe)



[2.9967384338378906, 0.05323951318860054]

Результаты обучения нейронной сети удручающие ) Что сделать для их улучшения? Экспериментируйте, присылайте результаты на ychernyshov@ussc.ru