In [None]:
!pip install pymystem3

In [None]:
import pandas as pd
import numpy as np
import string
import nltk

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from nltk.stem.snowball import SnowballStemmer 
from tqdm.auto import tqdm, trange
from nltk.stem import *
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from nltk import word_tokenize

# Corpora/models used below: the Russian stop-word list and the tokenizer
# models behind word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.9+ loads the tokenizer models from 'punkt_tab' instead of 'punkt';
# fetching both keeps word_tokenize working on old and new releases.
nltk.download('punkt_tab')

In [None]:
from google.colab import files
# Colab-only upload step. Presumably this is where 'rls_.csv' (read in the
# next cell) is supplied — verify when running outside Colab.
uploaded = files.upload()

In [None]:
# Load the source table: semicolon-separated, Windows-1251 (cp1251) encoded.
df = pd.read_csv('rls_.csv', sep=';', encoding='cp1251')

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Rows per value of the 'Класс' column, which is used as the target label
# further down in the notebook.
df['Класс'].value_counts()

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character replaced by a space.

    Only the characters in ``string.punctuation`` are affected; typographic
    symbols such as '…' or '«' are left as they are.
    """
    # One-to-one translation table: each punctuation character maps to ' '.
    to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return text.translate(to_space)

def remove_numbers(text):
    """Return ``text`` with each digit character replaced by a single space.

    A character counts as a digit when ``str.isdigit`` says so, which also
    covers non-ASCII digits such as superscripts.
    """
    return ''.join(' ' if ch.isdigit() else ch for ch in text)

import re
def remove_multiple_spaces(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is collapsed too, not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# Single Mystem analyzer instance shared by the lemmatization code.
mystem = Mystem()

# NLTK's Russian stop words plus a few typographic symbols that
# string.punctuation does not cover.
russian_stopwords = [*stopwords.words("russian"), '…', '«', '»', '...']
def lemmatize_text(text):
    """Lower-case ``text``, lemmatize it with Mystem and drop stop words.

    Mystem returns the separators between words as tokens of their own
    (single spaces, the trailing newline, punctuation padded with spaces).
    Each token is therefore stripped, and tokens that are empty afterwards
    are discarded, so no stray whitespace ends up in the result.

    Args:
        text: the string to lemmatize.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    lemmas = (token.strip() for token in mystem.lemmatize(text.lower()))
    kept = [lemma for lemma in lemmas if lemma and lemma not in russian_stopwords]
    return " ".join(kept)

In [None]:
def preproccessing(text):
    """Replace punctuation and digits with spaces, then collapse whitespace runs."""
    return remove_multiple_spaces(remove_numbers(remove_punctuation(text)))


# Cleaned text with the original letter case preserved.
df['preproccessed'] = [preproccessing(text) for text in df['Исходники']]
# Same cleaning applied to the lower-cased text; this list feeds the
# stemming / stop-word / lemmatization steps.
prep_text = [preproccessing(text.lower()) for text in tqdm(df['Исходники'])]

In [None]:
# Number of preprocessed texts.
len(prep_text)


In [None]:
# Inspect the container type of prep_text.
type(prep_text)

In [None]:
# Store the cleaned, lower-cased texts as a new column.
df['prep_text']=prep_text

In [None]:
# Display the table with the newly added columns.
df

In [None]:
# Snowball stemmer for Russian, used by the stemming loop.
stemmer = SnowballStemmer("russian")

# Rebuild the stop-word list with extra symbols and the pieces of the
# abbreviation 'т.д.'.
# NOTE(review): punctuation has already been replaced by spaces in prep_text,
# so the literal 'т.д.' entry presumably never matches — confirm.
russian_stopwords = [
    *stopwords.words("russian"),
    '…', '«', '»', '...', 'т.д.', 'т', 'д',
]

# Tokenization preview on the first preprocessed text.
text = df['prep_text'][0]
word_tokenize(text)

In [None]:
# Hoist the stop words into a set: membership is tested once per token, and
# a list lookup would rescan the whole stop-word list each time.
stem_stop_words = set(russian_stopwords)

# For every preprocessed text: tokenize, drop stop words, stem what is left
# and join the stems back into one space-separated string.
stemmed_texts_list = [
    " ".join(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in stem_stop_words
    )
    for text in tqdm(df['prep_text'])
]

df['text_stem'] = stemmed_texts_list

In [None]:
def remove_stop_words(text):
    """Tokenize ``text`` and return it without stop words, joined by spaces."""
    kept = []
    for token in word_tokenize(text):
        # Skip stop words and bare-space tokens.
        if token == ' ' or token in russian_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)
  
# Apply remove_stop_words to every preprocessed text instead of repeating
# its body inline.
sw_texts_list = [remove_stop_words(text) for text in tqdm(df['prep_text'])]

df['text_sw'] = sw_texts_list

In [None]:
# Preview the first text after stop-word removal.
df['text_sw'][0]

In [None]:
# Save the intermediate table to CSV; with the default arguments the
# DataFrame index is written as the first column.
df.to_csv('rls_stemmed.csv')

In [None]:
# Preview the first stemmed text.
df['text_stem'][0]

In [None]:
# Lemmatize every stop-word-filtered text with Mystem.
lemm_texts_list = []
for text in tqdm(df['text_sw']):
    try:
        # Mystem emits separators (spaces, the trailing newline) as tokens of
        # their own; strip each token and drop the ones that end up empty so
        # no stray whitespace is joined into the result.
        lemmas = (token.strip() for token in mystem.lemmatize(text))
        text_lemm = " ".join(
            lemma for lemma in lemmas
            if lemma and lemma not in russian_stopwords
        )
    except Exception as e:
        # Stay best-effort, but still produce one entry per row: skipping the
        # append would leave the list shorter than the DataFrame and make the
        # column assignment below fail. Fall back to the un-lemmatized text.
        print(e)
        text_lemm = text
    lemm_texts_list.append(text_lemm)

df['text_lemm'] = lemm_texts_list

In [None]:
def lemmatize_text(text):
    """Lemmatize ``text`` with Mystem and return the lemmas joined by spaces.

    This redefines the ``lemmatize_text`` declared earlier in the notebook.
    Unlike that version it neither lower-cases the input nor removes stop
    words.

    Mystem emits separators (spaces, the trailing newline) as tokens of their
    own, so each token is stripped and empty results are dropped.
    """
    stripped = (token.strip() for token in mystem.lemmatize(text))
    return " ".join(token for token in stripped if token)

In [None]:
# Persist the table with all derived text columns; with the default
# arguments the DataFrame index is written as the first column.
df.to_csv('lemm.csv')

In [None]:
# 'lemm.csv' was written with the DataFrame index as its first, unnamed
# column; read that column back as the index instead of letting it appear as
# a spurious 'Unnamed: 0' data column.
df_lemm = pd.read_csv('lemm.csv', encoding='utf-8', index_col=0)

In [None]:
# Display the reloaded table.
df_lemm

In [None]:
# Features: the stop-word-filtered texts. A text that became empty during
# preprocessing is written to CSV as an empty field and read back as NaN,
# which the text vectorizers cannot handle — restore it to an empty string.
# NOTE(review): 'text_sw' is used here rather than 'text_lemm' — confirm
# that is intended.
X = df_lemm['text_sw'].fillna('')
# Target: the class label column.
y = df_lemm['Класс']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [None]:
# Distinct class labels found in the data.
my_tags = df_lemm['Класс'].unique()
my_tags

## BOW with Keras

In [None]:
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

train_posts = X_train
train_tags = y_train

test_posts = X_test
test_tags = y_test

# Bag-of-words features restricted to the max_words most frequent tokens.
# `text` here is the keras.preprocessing.text module imported above.
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts) # only fit on train

x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)

# Encode the string labels as integers and then as one-hot rows.
# The encoded targets get their own names: the original y_train / y_test
# label vectors are still needed by the scikit-learn pipelines and reports
# below, and overwriting them with one-hot matrices breaks those cells (and
# makes this cell fail when it is run a second time).
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train_encoded = encoder.transform(train_tags)
y_test_encoded = encoder.transform(test_tags)

num_classes = len(encoder.classes_)
y_train_onehot = utils.to_categorical(y_train_encoded, num_classes)
y_test_onehot = utils.to_categorical(y_test_encoded, num_classes)

batch_size = 32
epochs = 100

# Build the model: one hidden ReLU layer with dropout, softmax over classes.
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# 10% of the training rows are held out for validation during fitting.
history = model.fit(x_train, y_train_onehot,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

## Наивный байесовский классификатор

In [None]:
# Naive Bayes text classifier: token counts -> TF-IDF weights -> MultinomialNB.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [None]:
%%time
# Fit the Naive Bayes pipeline on the training split.
nb.fit(X_train, y_train)

In [None]:
%%time
from sklearn.metrics import classification_report
# Predict labels for the held-out texts.
y_pred = nb.predict(X_test)

In [None]:
# First predicted label.
y_pred[0]

In [None]:
# Show the first held-out text next to its true and predicted labels.
# X_test keeps the original (shuffled) row labels after train_test_split, so
# a label lookup with [0] raises KeyError unless row 0 happened to land in the
# test split; select the first element by position instead.
print(X_test.iloc[0], np.asarray(y_test)[0], y_pred[0])

In [None]:
from sklearn.metrics import accuracy_score

# scikit-learn metrics take (y_true, y_pred) in that order; accuracy is
# symmetric, but the order is kept consistent with classification_report.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Метод опорных векторов

In [None]:
# Linear SVM (hinge loss) trained with SGD on TF-IDF features.
sgd = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])

In [None]:
%%time
# Fit the SGD (linear SVM) pipeline on the training split.
sgd.fit(X_train, y_train)

In [None]:
%%time
# Predict labels for the held-out texts (overwrites the previous y_pred).
y_pred = sgd.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy is
# symmetric, but the order is kept consistent with classification_report.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Логистическая регрессия

In [None]:
# Logistic regression on TF-IDF features; C=1e5 means very weak regularization.
logreg = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])

In [None]:
%%time
# Fit the logistic-regression pipeline on the training split.
logreg.fit(X_train, y_train)

In [None]:
%%time
# Predict labels for the held-out texts (overwrites the previous y_pred).
y_pred = logreg.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# The stray third argument `targe` was an undefined name (NameError); the
# report is printed the same way as for the other two classifiers.
print(classification_report(y_test, y_pred))