In [26]:
import pandas as pd
import json
import numpy as np
import datetime
import os

import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.snowball import SnowballStemmer
punctuation=punctuation+'«»'

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import xgboost
import lightgbm as lgbm

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

# Тёплый запуск

In [2]:
# Warm-start flags: record which cached artefacts already exist on disk so the
# cells below can load them instead of recomputing.
data_merged, data_is_ready = (
    os.path.exists(cached_path)
    for cached_path in ('processed_data/data.csv', 'processed_data/data_processed.csv')
)

# Чтение данных

In [3]:
if not data_merged:
    # Labels file: loaded as a dict whose keys become the 'id' column and whose
    # values become the 'label' column. Read explicitly as UTF-8 so the result
    # does not depend on the platform's default encoding.
    with open('data/cluster_final_cut_train.json', encoding='utf-8') as f:
        train = json.load(f)

    train_pd = pd.DataFrame.from_dict(train, orient='index').reset_index()
    train_pd.columns = ['id', 'label']
    # JSON object keys are strings; cast them to integers before merging on 'id'.
    train_pd['id'] = train_pd['id'].astype(np.int64)

    # Documents file: one JSON object per line. Stream the file instead of
    # holding every raw line in memory, and skip blank lines (e.g. a trailing
    # newline), which json.loads would reject.
    with open('data/cosmo_content_storage_final_cut.jsonl', 'r', encoding='utf-8') as f:
        documents = [json.loads(line) for line in f if line.strip()]

    # 'description' is optional in a document; a missing one becomes None.
    documents_pd = pd.DataFrame({
        'id': [d['doc_id'] for d in documents],
        'url': [d['url'] for d in documents],
        'text': [d.get('description') for d in documents],
    })

    # Left join: keep every document; documents without a label get NaN.
    data = pd.merge(documents_pd, train_pd, on='id', how='left')

    # Create the cache directory on first run, and do not write the row index:
    # otherwise the cached frame comes back with an extra 'Unnamed: 0' column
    # that the freshly merged frame does not have.
    os.makedirs('processed_data', exist_ok=True)
    data.to_csv('processed_data/data.csv', index=False)
    print('Данные объединены')
else:
    # Warm start: reuse the previously merged frame.
    data = pd.read_csv('processed_data/data.csv')
    print('Объединённые данные считаны')

Объединённые данные считаны


# Преобразование данных

In [5]:
if not data_is_ready:
    # Stemmer and stop-word list for Russian text.
    stemmer = SnowballStemmer("russian")
    russian_stopwords = stopwords.words("russian")
    # Set copy of the stop list for constant-time membership tests.
    stopword_set = set(russian_stopwords)
    # Translation table that deletes every character listed in `punctuation`.
    punct_table = str.maketrans('', '', punctuation)

    def preprocess_text(text):
        """Turn a raw document text into a list of stemmed tokens.

        Steps: lower-case, split on any whitespace, strip punctuation
        characters, drop empty tokens and stop words, stem, then drop stems
        that are themselves stop words.

        Parameters
        ----------
        text : str or missing value
            Raw text. ``None``, NaN (a missing value read back from the CSV
            cache), any other non-string and the empty string all yield ``[]``.

        Returns
        -------
        list of str
            Stemmed tokens in their original order.
        """
        # Missing texts are None on a cold run but NaN after a CSV round trip;
        # NaN is truthy, so a plain `not text` check would let it through.
        if not isinstance(text, str) or not text:
            return []

        # split() without arguments also breaks on newlines and tabs.
        tokens = [raw.translate(punct_table) for raw in text.lower().split()]
        # Drop tokens that were pure punctuation (now empty) and stop words.
        # The stop list holds unstemmed forms, so it is checked before stemming.
        tokens = [t for t in tokens if t and t not in stopword_set]
        stems = [stemmer.stem(t) for t in tokens]
        # Also drop stems that coincide with a stop word.
        return [s for s in stems if s not in stopword_set]

    # Tokenise every document, then join the tokens back into one string.
    data['text_arr'] = data['text'].apply(preprocess_text)
    data['text_processed'] = data['text_arr'].apply(' '.join)
    # Ensure the cache directory exists; omit the row index so it does not come
    # back as an extra unnamed column on reload.
    os.makedirs('processed_data', exist_ok=True)
    data.to_csv('processed_data/data_processed.csv', index=False)
    print('Данные преобразованы')
else:
    # Warm start: reuse the previously processed frame.
    data = pd.read_csv('processed_data/data_processed.csv')
    print('Преобразованные данные считаны')

Преобразованные данные считаны


In [6]:
# Replace missing processed texts with an empty string so every row is a str.
data['text_processed'] = data['text_processed'].fillna('')

In [22]:
# Sort by label (rows without a label end up last) and renumber the rows.
data = data.sort_values(by='label').reset_index(drop=True)

# Select train and test: labelled rows are train, unlabelled rows are test.
train = data[data['label'].notna()]
test = data[data['label'].isna()]

train_size = len(train)

# Модель на TF-IDF

In [10]:
# TF-IDF: fit the vocabulary and IDF weights on the training texts only, then
# apply the same fitted vectorizer to the test texts.
# (TfidfVectorizer ignores the `y` argument, so it is not passed.)
vectorizer = TfidfVectorizer()
train_matrix = vectorizer.fit_transform(train['text_processed'])
test_matrix = vectorizer.transform(test['text_processed'])

In [11]:
# Wrap the TF-IDF training matrix and its labels in a LightGBM Dataset.
# NOTE(review): the multiclass objective presumably needs labels in
# [0, num_class) — confirm the 'label' column satisfies that.
lgbm_train = lgbm.Dataset(data=train_matrix, label=train['label'])

In [29]:
def hyperopt_lgbm_score(params):
    """Objective for hyperopt's ``fmin``: train LightGBM and return a loss.

    Parameters
    ----------
    params : dict
        Hyperparameters sampled by hyperopt for this trial. They are laid over
        the module-level ``lgbm_params`` defaults, so a sampled value wins.

    Returns
    -------
    float
        Value to minimise: the first training metric as reported by LightGBM,
        negated when that metric is "higher is better".

    Raises
    ------
    ValueError
        If LightGBM reports no evaluation metric for the training data.
    """
    # Use the sampled hyperparameters; without this every trial would train
    # the same model and the search would be a no-op.
    trial_params = {**lgbm_params, **params}
    model = lgbm.train(trial_params, lgbm_train)

    # eval_train() returns a list of
    # (dataset_name, metric_name, value, is_higher_better) tuples.
    # NOTE(review): this scores on the training data itself, so it rewards
    # overfitting — consider a hold-out set or lgbm.cv.
    eval_results = model.eval_train()
    if not eval_results:
        raise ValueError('LightGBM returned no evaluation results for the training data')

    _, _, score, higher_is_better = eval_results[0]
    # fmin minimises: flip the sign only for metrics that should be maximised.
    return -score if higher_is_better else score

In [24]:
# Baseline LightGBM settings: multiclass objective with a tiny number of
# boosting rounds.
# NOTE(review): num_class is hard-coded; presumably the number of distinct
# labels in the training data — confirm.
lgbm_params = dict(
    objective='multiclass',
    num_class=3064,
    n_estimators=2,
)

In [None]:
# Run the TPE search for 10 evaluations of the objective.
# NOTE(review): `simple_space_xgb` is not defined in the visible notebook; the
# search space must be defined before this cell can run.
best = fmin(
    fn=hyperopt_lgbm_score,
    space=simple_space_xgb,
    algo=tpe.suggest,
    max_evals=10,
)
print('best:', best, sep='\n')

In [14]:
#lgbm_trained = lgbm.train(lgbm_params, lgbm_train)

In [None]:
# NOTE(review): this always raises ZeroDivisionError — presumably a deliberate
# stop so that "Run All" does not continue into the cells below; confirm.
1/0

In [20]:
from sklearn.metrics import accuracy_score

# Submission

In [15]:
# Read the sample submission file into a DataFrame.
sample_submission = pd.read_csv(filepath_or_buffer='data/sample.txt')

In [16]:
# Start from an independent copy of the sample so that editing `submission`
# never alters `sample_submission` (plain assignment would only alias it).
# NOTE(review): no predictions are written into `submission` in the visible
# cells, so it is saved below unchanged.
submission = sample_submission.copy()

In [17]:
# Sanity check: display the (rows, columns) shape of the submission frame.
submission.shape

(61858, 2)

In [None]:
# Create the output directory on first use.
os.makedirs('submissions', exist_ok=True)
# Timestamped file name, YYMMDD_HHMMSS. Minutes are %M; %m is the month, which
# would make files written within the same hour collide on the same second.
submission.to_csv(
    'submissions/submission_{}'.format(datetime.datetime.now().strftime('%y%m%d_%H%M%S')),
    index=False,
)