In [1]:
import pandas as pd
import json
import numpy as np
import datetime

# Чтение данных

In [2]:
# файл с разметкой
with open('data/cluster_final_cut_train.json') as f:
    train = json.load(f)
    
train_pd = pd.DataFrame.from_dict(train, orient='index').reset_index()
train_pd.columns = ['id', 'label']
train_pd['id']=train_pd['id'].astype(np.int64)

In [3]:
# файл с документами
with open('data/cosmo_content_storage_final_cut.jsonl', 'r', encoding='utf-8') as f:
    lines = f.readlines()
documents = [json.loads(l) for l in lines]

ids = [d['doc_id'] for d in documents]
urls = [d['url'] for d in documents]
texts = [d.get('description', None) for d in documents]

documents_pd = pd.DataFrame()
documents_pd['id'] = ids
documents_pd['url'] = urls
documents_pd['text'] = texts

In [4]:
data = pd.merge(documents_pd, train_pd, left_on='id', right_on='id', how='left')
data.to_csv('processed_data/data.csv')

# Преобразование данных

In [5]:
import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.snowball import SnowballStemmer

In [62]:
punctuation=punctuation+'«»'

In [65]:
#Create lemmatizer and stopwords list
stemmer = SnowballStemmer("russian") 
russian_stopwords = stopwords.words("russian")

#Preprocess function
def preprocess_text(text):
    def del_punct(token):
        return ''.join([l for l in token if l not in punctuation])
    
    if not text:
        return []
    
    #в токены
    tokens = text.lower().split(' ')
    #удалить пунктуацию
    tokens = [del_punct(t) for t in tokens]
    #удалить стоп-слова
    tokens = [t for t in tokens if t not in punctuation]
    #стэмминг
    tokens = [stemmer.stem(t) for t in tokens]
    #удалить стоп-слова
    tokens = [t for t in tokens if t not in russian_stopwords]
    
    return tokens

In [67]:
# Приведение тектс
data['text_arr'] = data['text'].apply(preprocess_text)

In [75]:
data['text_processed'] = data['text_arr'].apply(lambda x: ' '.join(x))

In [76]:
data.to_csv('processed_data/data_processed.csv')

# Модель

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [111]:
#сортируем по лейблу, пропуски - в конце
data =data.sort_values(by='label').reset_index().drop(['index'], axis=1)

In [117]:
# отюираем тренй и тест
train = data[pd.notnull(data['label'])]
test = data[pd.isna(data['label'])]

train_size = train.shape[0]

In [131]:
# tf-idf
vectorizer = TfidfVectorizer()
train_matrix = vectorizer.fit_transform(train['text_processed'], y = train['label'])
test_matrix = vectorizer.transform(test['text_processed'])

In [133]:
import gc

In [134]:
gc.collect()

12596

In [135]:
clf = RandomForestClassifier()
clf.fit(train_matrix, train['label'])

MemoryError: could not allocate 200802304 bytes

# Submission

In [None]:
sample_submission = pd.read_csv('data/sample.txt')

In [None]:
##
submission = sample_submission

In [None]:
submission.to_csv('submissions/submission_{}'.format(datetime.datetime.now().strftime('%y%m%d_%H%m%S')), index=False)