In [1]:
import os
import re
import pandas as pd
import nltk
nltk.download('punkt')
import re
from nltk import word_tokenize
!pip install pymorphy2
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.2 MB/s 
[?25hCollecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 24.6 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import zipfile
with zipfile.ZipFile('/content/gdrive/MyDrive/Colab Notebooks/corpus_news.zip', 'r') as zip_ref:
    zip_ref.extractall('corpus_news')

In [4]:
def get_data(folder):
    df_list = []

    for filename in os.listdir(folder):
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as f:
            try:
              text = f.read()
              df_list.append(text)
            except UnicodeDecodeError:
              pass
    return df_list

df_culture = get_data('corpus_news/corpus/culture')
df_tech = get_data('corpus_news/corpus/hi-tech')
df_politics = get_data('corpus_news/corpus/politics')
df_science = get_data('corpus_news/corpus/science')

In [5]:
df_culture = pd.DataFrame({'text':df_culture, 'class':1})
df_tech = pd.DataFrame({'text':df_tech, 'class':2})
df_politics = pd.DataFrame({'text':df_politics, 'class':3})
df_science = pd.DataFrame({'text':df_science, 'class':4})

In [6]:
df = pd.concat([df_culture, df_tech, df_politics, df_science])

In [7]:
sw_list = []
with open('/content/gdrive/MyDrive/Colab Notebooks/swl_optimum.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.replace('\n', '')
        sw_list.append(line)

In [8]:
def clean_data(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    clean_text = []
    for word in tokens:
        if word not in sw_list:
            word = morph.parse(word)[0].normal_form
            clean_text.append(word)
    clean_text = ' '.join(clean_text) 
    
    return clean_text 

In [9]:
df['text'] = df['text'].apply(clean_data)

In [16]:
df.sample(5)

Unnamed: 0,text,class
86,римас туминас поставить большой свой пиковый д...,1
338,небензить назвать преступление удар коалиция с...,3
14,xiaomi хотеть выпустить мышь сканер отпечаток ...,2
219,сеул назвать призыв возобновить учение вашингт...,3
330,африном курдский отряд убить 40 турецкий солда...,3


DECISION TREE

In [15]:
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

text_clf_tree = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', tree.DecisionTreeClassifier()),
                     ])

text_clf_tree.fit(X_train, y_train)


predicted_tree = text_clf_tree.predict(X_test)

print(metrics.classification_report(y_test, predicted_tree))

              precision    recall  f1-score   support

           1       0.79      0.77      0.78        30
           2       1.00      1.00      1.00        38
           3       0.90      0.91      0.91        70
           4       0.81      0.81      0.81        21

    accuracy                           0.89       159
   macro avg       0.88      0.87      0.87       159
weighted avg       0.89      0.89      0.89       159



RANDOM FOREST

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class'], test_size=0.15, random_state=42)

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100)),
                     ])

text_clf.fit(X_train, y_train)


predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           1       0.96      0.77      0.85        30
           2       1.00      1.00      1.00        38
           3       0.91      0.99      0.95        70
           4       1.00      1.00      1.00        21

    accuracy                           0.95       159
   macro avg       0.97      0.94      0.95       159
weighted avg       0.95      0.95      0.95       159



In [12]:
df_analysis = pd.DataFrame({'text':X_test, 'predicted': predicted, 'Y_test':y_test})

In [13]:
df_analysis = df_analysis[df_analysis['predicted'] != df_analysis['Y_test']]

In [14]:
df_analysis

Unnamed: 0,text,predicted,Y_test
158,казахстан выставить обозрение книга человеческ...,3,1
29,память виталий чуркин неожиданный момент биогр...,1,3
113,новый председатель союз писатель россия стать ...,3,1
78,показ dolce gabbana модель заменить дрон видео...,3,1
83,фильм лёд выходить китайский киноэкран стартов...,3,1
39,десятилетний юбилей отмечать журнал русский пи...,3,1
23,ватиканский монахиня жаловаться они сделать ра...,3,1
100,анджелина джоля заняться проблема сирийский бе...,3,1


As expected, random forest did better than decision tree.