In [1]:
import wikipediaapi

In [2]:
def get_categorymembers(categorymembers, titles, level=0, max_level=1):
    """Collect the titles of all members of a Wikipedia category.

    Every member's title is appended to ``titles`` — sub-category pages
    included. A member whose namespace is ``Namespace.CATEGORY`` is also
    descended into, as long as the current depth is below ``max_level``.

    Arguments:
    categorymembers -- mapping whose values are the members of the category
    titles -- list that receives the titles (mutated in place)

    Keyword arguments:
    level -- integer, nesting depth of the category being processed
    max_level -- integer, depth at which recursion into sub-categories stops

    Returns the same ``titles`` list that was passed in.
    """
    for member in categorymembers.values():
        titles.append(member.title)
        is_subcategory = member.ns == wikipediaapi.Namespace.CATEGORY
        if not (is_subcategory and level < max_level):
            continue
        get_categorymembers(
            member.categorymembers, titles, level=level + 1, max_level=max_level
        )

    return titles

# English-language Wikipedia client, reused by the cells that follow.
wiki_wiki = wikipediaapi.Wikipedia('en')
# Source category for the positive class: medicine.
cat = wiki_wiki.page("Category:Medicine")
titles = get_categorymembers(cat.categorymembers, [])


In [3]:
# Source category for the other class: science. Scientific articles resemble
# medical ones, which the author expects to improve the quality of the model.
random_cat = wiki_wiki.page("Category:Science")
random_titles = get_categorymembers(random_cat.categorymembers, [])

#p_wiki = wiki_wiki.page(titles[0])
#print(p_wiki.text)

In [4]:
def getarticles(titles, limit=2500, wiki=None):
    """Download the text of Wikipedia pages by title.

    Arguments:
    titles -- list of page titles to fetch

    Keyword arguments:
    limit -- maximum number of titles to process, taken from the start of
             ``titles`` (default 2500, the value that used to be hard-coded)
    wiki -- object exposing ``page(title)`` whose result has a ``text``
            attribute; defaults to the notebook-level ``wiki_wiki`` client

    Returns a list with the text of every page that could be fetched, in the
    order of ``titles``. A page whose download raises is skipped (best effort)
    and reported through ``logging`` instead of being dropped silently.
    """
    import logging

    client = wiki_wiki if wiki is None else wiki
    log = logging.getLogger(__name__)
    articles = []
    # Slicing bounds the loop by the real list length, so a list shorter than
    # ``limit`` no longer depends on swallowed IndexErrors to finish.
    for title in titles[:limit]:
        try:
            articles.append(client.page(title).text)
        except Exception as exc:
            log.warning("Skipping page %r: %s", title, exc)
    return articles

In [5]:
# Download the texts for the medicine titles and report how many were retrieved.
class_1 = getarticles(titles)
print(len(class_1))

1490


In [6]:
# Download the texts for the science titles and report how many were retrieved.
class_2 = getarticles(random_titles)
print(len(class_2))

2232


In [9]:
import re

def preprocess_text(text):
    """Return a normalised copy of ``text``.

    Processing steps:
    1. Lower-case the text and replace "ё" with "е";
    2. Replace links (``www.…`` or ``http(s)://…``) with the keyword ``URL``;
    3. Replace every run of characters other than Latin or Cyrillic letters
       with a single space;
    4. Collapse repeated spaces and strip leading/trailing whitespace.

    The goal is to remove characters that play no role in classification.

    Arguments:
    text -- string with the text
    """
    text = text.lower().replace("ё", "е")
    # Raw strings: ``\.`` and ``\s`` are regex escapes, not Python string
    # escapes, and non-raw literals trigger invalid-escape warnings.
    # The ``URL`` placeholder is inserted after lower-casing, so it stays upper-case.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub(r'[^a-zA-Zа-яА-Я]+', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()
        

In [10]:
# Training corpus: the first 1400 medicine articles followed by the first 1500
# science articles. The label cell further down relies on this order and on
# these two counts.
raw_data = class_1[:1400] + class_2[:1500]
data = list(map(preprocess_text, raw_data))

In [17]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def delete_stopwords(text):
    """Return ``text`` with English stop words removed.

    Stop words carry no meaning of their own (articles, conjunctions,
    prepositions, ...). They are usually the most frequent words, so a model
    may wrongly treat them as significant, which hurts classification.

    Arguments:
    text -- string with the text; words are separated by single spaces

    Returns a string in which every kept word is followed by one space, so a
    non-empty result ends with a trailing space.
    """
    # Build the stop-word set once per call. Evaluating
    # ``stopwords.words('english')`` inside the comprehension re-ran it for
    # every word and tested membership against a list.
    english_stopwords = set(stopwords.words('english'))
    kept_words = [word for word in text.split(' ') if word not in english_stopwords]
    return ''.join(word + ' ' for word in kept_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [18]:
# Remove stop words from every preprocessed text.
filtered_data = [delete_stopwords(t) for t in data]

In [78]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Maximum number of words per text; `get_sequences` passes it to `pad_sequences` as `maxlen`
SENTENCE_LENGTH = 150 
# Vocabulary size; passed to `Tokenizer` as `num_words`
NUM = 100000

def get_sequences(tokenizer, x):
    """Encode texts as fixed-length arrays of token ids.

    Tokenization turns text into numbers. Each text is converted with
    ``tokenizer.texts_to_sequences`` and the result is handed to
    ``pad_sequences`` with ``maxlen=SENTENCE_LENGTH``.

    Arguments:
    tokenizer -- fitted tokenizer
    x -- collection of texts
    """
    return pad_sequences(tokenizer.texts_to_sequences(x), maxlen=SENTENCE_LENGTH)

# Create the tokenizer and fit it on the stop-word-filtered corpus
tokenizer = Tokenizer(num_words=NUM)
tokenizer.fit_on_texts(filtered_data)

# Map every text to an array of token ids
x_train = get_sequences(tokenizer, filtered_data)

In [79]:
# Inspect one encoded sample.
print(x_train[1000])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0    12 16650   231   884  3317
   914 16241   219     5    26   114  1729    45   110    34 16650  2006
  3078   914    13   219    26  2454 10449 10449  4290  1829 11806   667
   470    61   174 14122  2536   990   213  9735   164 62466    94   159
 17814   333  3857   686  4143   333  6242  1370   159 16727  2569  2371
  2267  1183 10449 10449  1829    34   284   164  1386    26  1310     6
 27433  4070 19930  1109    61  2323  1260  1494  1364  3355   177  2171
   164     6 27433   675  5115  1000   780   231  3689  1183 10449     6
     4   528 14310   477    79 10880    48     4   830 26961   732 14754
   148  1066   362  3078    12    54]


In [24]:
# Ground-truth labels for supervised training: 1 for the first 1400 samples,
# 0 for the remaining 1500 (2900 labels in total).
y_train = [1 if index < 1400 else 0 for index in range(2900)]

In [43]:
import numpy as np
# Convert the label list to a NumPy array.
y_train = np.array(y_train)

#print(x_train.shape)
#print(y_train.shape)

In [44]:
import os

# Reuse the arrays cached by the `np.savetxt` cell when both files exist. On a
# fresh run the files have not been written yet, so the in-memory arrays are
# kept instead of failing on a missing file.
# NOTE(review): a stale cache silently replaces freshly computed arrays;
# delete the files after changing the preprocessing.
if os.path.exists("x_train.txt") and os.path.exists("y_train.txt"):
    x_train = np.loadtxt("x_train.txt")
    y_train = np.loadtxt("y_train.txt")

In [40]:
# Cache the encoded texts and the labels on disk as plain-text files.
np.savetxt("x_train.txt", x_train)
np.savetxt("y_train.txt", y_train)

In [53]:
# Intentionally a no-op. This cell used to standardise `X_train` (subtract the
# mean, divide by the standard deviation), but that name is never defined in
# this notebook -- the data lives in `x_train` -- so the cell raised NameError.
# The rows of `x_train` are token ids that the Embedding layer uses as lookup
# indices; rescaling them would destroy that mapping, and the `model.fit` call
# below consumes `x_train` unscaled. No normalisation is therefore applied.

In [45]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Embedding, LSTM

In [57]:
model = Sequential()

# Embedding layer: learns a 64-dimensional vector for every token id, placing
# words with similar meaning close together during training. The vocabulary
# size is tied to NUM (the tokenizer's `num_words`) instead of repeating the
# literal 100000, so the two cannot drift apart.
model.add(Embedding(NUM, 64))
# LSTM (long short-term memory) layer; the dropout rates and the number of
# units were chosen intuitively by the author.
model.add(LSTM(100, dropout=0.25, recurrent_dropout=0.25))
# Output layer: a single sigmoid neuron for binary classification.
model.add(Dense(1, activation="sigmoid"))

In [60]:
# Binary cross-entropy matches the single sigmoid output; accuracy is reported per epoch.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [61]:
# Keras takes the `validation_split` fraction from the END of the arrays,
# before any shuffling. The samples are ordered by class (label 1 first, then
# label 0), so without a permutation the validation set would contain only
# label-0 samples. A fixed seed keeps the split reproducible.
shuffle_idx = np.random.RandomState(42).permutation(len(x_train))
model.fit(x_train[shuffle_idx], y_train[shuffle_idx], batch_size=100, epochs=10, validation_split=0.25, verbose=1)

Train on 2175 samples, validate on 725 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x218d91f7358>

In [97]:
text = "Coronary artery bypass surgery, also known as coronary artery bypass graft (CABG, pronounced \"cabbage\") surgery, and colloquially heart bypass or bypass surgery, is a surgical procedure to restore normal blood flow to an obstructed coronary artery. A normal coronary artery transports blood to and from the heart muscle itself, not through the main circulatory system. There are two main approaches. In one, the left internal thoracic artery, LITA (also called left internal mammary artery, LIMA) is diverted to the left anterior descending branch of the left coronary artery. In this method, the artery is \"pedicled\" which means it is not detached from the origin. In the other, a great saphenous vein is removed from a leg; one end is attached to the aorta or one of its major branches, and the other end is attached to the obstructed artery immediately after the obstruction to restore blood flow. CABG is performed to relieve angina unsatisfactorily controlled by maximum tolerated anti-ischemic medication, prevent or relieve left ventricular dysfunction, and/or reduce the risk of death. CABG does not prevent myocardial infarction (heart attack). This surgery is usually performed with the heart stopped, necessitating the usage of cardiopulmonary bypass. However, two alternative techniques are also available, allowing CABG to be performed on a beating heart either without using the cardiopulmonary bypass, a procedure referred to as \"off-pump\" surgery, or performing beating surgery using partial assistance of the cardiopulmonary bypass, a procedure referred to as \"on-pump beating\" surgery. The latter procedure offers the advantages of the on-pump stopped and off-pump while minimizing their respective side-effects. CABG is often indicated when coronary arteries have a 50 to 99 percent obstruction. The obstruction being bypassed is typically due to arteriosclerosis, atherosclerosis, or both. 
Arteriosclerosis is characterized by thickening, loss of elasticity, and calcification of the arterial wall, most often resulting in a generalized narrowing in the affected coronary artery. Atherosclerosis is characterized by yellowish plaques of cholesterol, lipids, and cellular debris deposited into the inner layer of the wall of a large or medium-sized coronary artery, most often resulting in a partial obstruction in the affected artery. Either condition can limit blood flow if it causes a cross-sectional narrowing of at least 50 percent. "

#text = "The Persian cat is a long-haired breed of cat characterized by its round face and short muzzle. It is also known as the \"Persian Longhair\" in the English-speaking countries. In the Middle East region they are widely known as \"Iranian cat\" and in Iran they are known as \"Shirazi cat\". The first documented ancestors of the Persian were imported into Italy from Iran (historically known as Persia) around 1620. The exact history of the Persian cat does seem to be a bit of a mystery but many of these long-haired cats were seen in hieroglyphics. The story has it that these long-haired cats were then imported into Europe as their popularity grew and breeding took place in Italy and France."
#text = "A chondroblastoma is a rare, usually benign, tumor of bone that accounts for approximately 1% of all bone tumors. In 1931, Codman classified it as a chondromatous variant of giant cell tumors, when he described these lesions in the proximal humerus. [1] A decade later, Jaffe and Lichtenstein renamed the Codman tumor a benign chondroblastoma to emphasize the chondroblastic genesis of the lesion and to distinguish it from the classic giant cell tumor of bone. [2]Although the exact etiology of chondroblastoma remains uncertain, the presentation, appropriate evaluation, and treatment of patients with the condition have been well described. (See Presentation, Workup, and Treatment.)"

#text = "The R-boats built by Lake Torpedo Boat Company (R-21 through R-27) are sometimes considered a separate class from those of the other builders. The Lake boats had a length of 175 feet (53.3 m) overall, a beam of 16 feet 8 inches (5.1 m) and a mean draft of 13 feet 11 inches (4.2 m). They displaced 497 long tons (505 t) on the surface and 652 long tons (662 t) submerged. The R-class submarines had a crew of 3 officers and 23 enlisted men. They had a diving depth of 200 feet (61.0 m).[1] For surface running, the boats were powered by two 500-brake-horsepower (373 kW) diesel engines, each driving one propeller shaft. When submerged each propeller was driven by a 400-horsepower (298 kW) electric motor. They could reach 14 knots (26 km/h; 16 mph) on the surface and 11 knots (20 km/h; 13 mph) underwater. On the surface, the Lake boats had a range of 3,523 nautical miles (6,525 km; 4,054 mi) at 11 knots (20 km/h; 13 mph)[1] and 150 nmi (280 km; 170 mi) at 5 knots (9.3 km/h; 5.8 mph) submerged.[1] The boats were armed with four 21-inch (53.3 cm) torpedo tubes in the bow. They carried four reloads, for a total of eight torpedoes. The R-class submarines were also armed with a single 3\"/50 caliber deck gun.[2]"

# Run the sample through the same pipeline as the training data: normalise,
# drop stop words, encode as a padded sequence of token ids. Separate names
# keep `text` a string instead of rebinding it to an array.
clean_text = delete_stopwords(preprocess_text(text))
sample_sequence = get_sequences(tokenizer, [clean_text])
print(sample_sequence)
# `predict` returns the sigmoid output, i.e. the probability of label 1 (the
# medicine class). `Sequential.predict_proba` returned the same values for this
# model but has been removed from newer TensorFlow releases.
y_p = model.predict(sample_sequence)
print(y_p)

[[ 1415   388  2840 14248  3223  2510  5704  6427    83   896 26012   440
   5621 10768   750  3036 14260   561  9541   931   649  5621   527  4017
   2499   779   164    73 26012   649  5006  5438   337  2845   209   178
    440   337  3706 18168  1545  7283  9821    53    32   116   248     4
    203  1495 26012   440  6951   337   297   139    69  7283  9821   630
    528  4962   209  2616  6951   209    69  1127  3109  7283  9821   630
    528  4962  6951   209  1750   630  1366  3737  4962  3706  4962 10553
   5005   597    98 26012    47  1727  4940  7204    22  3031   890  5704
   5704 39581   356   114 14781  1019 15740   715 17326 14194  3886  2415
     47   829  4326 11294   971  4940  3223 14781  1019 25814 11106  4380
  11892  1765 10528  7969  2598  3456  2415   211  1202  6949  4940  3223
     47   829  1127  5704   971  3223   297   231  1808    83   896   263
    763  8362 11294   395    22   890]]
[[0.9833844]]


In [91]:
import pickle

# Save the fitted tokenizer so the same word-to-id mapping can be restored later
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [92]:
# Save the model architecture as JSON
model_json = model.to_json()
with open("model.json", "w") as architecture_file:
    architecture_file.write(model_json)
# Save the weights in HDF5 format
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [94]:
from tensorflow.keras.models import model_from_json

# Read the architecture JSON; `with` closes the file even if reading fails
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Load the weights into the rebuilt model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

Loaded model from disk


In [96]:
# Restore the tokenizer from 'tokenizer.pickle'.
# NOTE(review): pickle.load can execute arbitrary code -- only load a pickle
# file that you created yourself or otherwise trust.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)