In [1]:
import pandas as pd
import json
import numpy as np
import datetime
import os

import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.snowball import SnowballStemmer
punctuation=punctuation+'«»'

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import xgboost
import lightgbm as lgbm

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

# Тёплый запуск

In [2]:
data_is_ready = os.path.exists('processed_data/data_processed.csv')
data_merged = os.path.exists('processed_data/data.csv')

# Чтение данных

In [3]:
if not data_merged:
    # файл с разметкой
    with open('data/cluster_final_cut_train.json') as f:
        train = json.load(f)

    train_pd = pd.DataFrame.from_dict(train, orient='index').reset_index()
    train_pd.columns = ['id', 'label']
    train_pd['id']=train_pd['id'].astype(np.int64)
    
    
    # файл с документами
    with open('data/cosmo_content_storage_final_cut.jsonl', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    documents = [json.loads(l) for l in lines]

    ids = [d['doc_id'] for d in documents]
    urls = [d['url'] for d in documents]
    texts = [d.get('description', None) for d in documents]

    documents_pd = pd.DataFrame()
    documents_pd['id'] = ids
    documents_pd['url'] = urls
    documents_pd['text'] = texts
    
    
    data = pd.merge(documents_pd, train_pd, left_on='id', right_on='id', how='left')
    data.to_csv('processed_data/data.csv')
    print('Данные объединены')
else:
    data = pd.read_csv('processed_data/data.csv')
    print('Объединённые данные считаны')

Объединённые данные считаны


# Преобразование данных

In [4]:
if not data_is_ready:
        #Create lemmatizer and stopwords list
    stemmer = SnowballStemmer("russian") 
    russian_stopwords = stopwords.words("russian")

    #Preprocess function
    def preprocess_text(text):
        def del_punct(token):
            return ''.join([l for l in token if l not in punctuation])

        if not text:
            return []

        #в токены
        tokens = text.lower().split(' ')
        #удалить пунктуацию
        tokens = [del_punct(t) for t in tokens]
        #удалить стоп-слова
        tokens = [t for t in tokens if t not in punctuation]
        #стэмминг
        tokens = [stemmer.stem(t) for t in tokens]
        #удалить стоп-слова
        tokens = [t for t in tokens if t not in russian_stopwords]

        return tokens
    
    # Приведение тектс
    data['text_arr'] = data['text'].apply(preprocess_text)
    data['text_processed'] = data['text_arr'].apply(lambda x: ' '.join(x))
    data.to_csv('processed_data/data_processed.csv')
    print('Данные преобразованы')
else:
    data = pd.read_csv('processed_data/data_processed.csv')
    print('Преобразованные данные считаны')

Преобразованные данные считаны


In [5]:
data.loc[pd.isna(data['text_processed']), 'text_processed'] = ''

In [113]:
import re

def url_parts(url):
    
    def deal_points(part):
        parts = re.findall(r"[\w']+", part)
        print(parts)
        np.concatenate(parts, axis=0)
        return small_parts+[' ']
    
    parts = re.findall(r"[\w']+", url)
    parts = [p for p in parts if p not in ['https', 'http', 'www'] and len(p)]
    return parts[:20]

url_parts('https://argumentiru.com/politics/2019/12/506934')

['argumentiru', 'com', 'politics', '2019', '12', '506934']

In [114]:
data['url_arr'] = data['url'].apply(lambda x: url_parts(x))
data['url_processed'] = data['url_arr'].apply(lambda x: ' '.join(x))
data.head(4)

Unnamed: 0.1,Unnamed: 0,id,url,text,label,text_arr,text_processed,url_arr,url_processed
0,24535,3294657291826632492,https://argumentiru.com/politics/2019/12/506934,"Ещё в октябре было известно, что министр энерг...",0.0,"['ещ', 'октябр', 'известн', 'министр', 'энерге...",ещ октябр известн министр энергетик сша рик пе...,"[argumentiru, com, politics, 2019, 12, 506934]",argumentiru com politics 2019 12 506934
1,31942,3972531096574318613,https://inc-news.ru/news/politics/2:13678,Рик Перри сообщил о сложении полномочий.,0.0,"['рик', 'перр', 'сообщ', 'сложен', 'полномоч']",рик перр сообщ сложен полномоч,"[inc, news, ru, news, politics, 2, 13678]",inc news ru news politics 2 13678
2,64817,7062567965678511105,https://dailystorm.ru/news/glava-minenergo-ssh...,Рик Перри поблагодарил своих близких и америка...,0.0,"['рик', 'перр', 'поблагодар', 'сво', 'близк', ...",рик перр поблагодар сво близк американск народ...,"[dailystorm, ru, news, glava, minenergo, ssha,...",dailystorm ru news glava minenergo ssha pokinu...
3,11760,2102644298978156520,http://www.moscow-post.su/news/in_world/minist...,"Американский министр Рик Перри объявил о том, ...",0.0,"['американск', 'министр', 'рик', 'перр', 'объя...",американск министр рик перр объяв покинул сво ...,"[moscow, post, su, news, in_world, ministr_ene...",moscow post su news in_world ministr_energetik...


In [115]:
#сортируем по лейблу, пропуски - в конце
data =data.sort_values(by='label').reset_index().drop(['index'], axis=1)

# отюираем тренй и тест
train = data[pd.notnull(data['label'])]
test = data[pd.isna(data['label'])]

train_size = train.shape[0]

In [116]:
train['url_arr'].apply(len).max()

20

# Модель на TF-IDF

In [117]:
# tf-idf
vectorizer = TfidfVectorizer()
train_matrix = vectorizer.fit_transform(train['url_processed'], y = train['label'])
test_matrix = vectorizer.transform(test['url_processed'])

In [118]:
lgbm_train  = lgbm.Dataset(train_matrix, label = train['label'])

In [119]:
len(train['label'].unique())

3064

In [120]:
lgbm_trained = lgbm.train(params={'n_estimators':5, 'objective':'multiclass', 'num_class':3064}, train_set=lgbm_train)



In [122]:
lgbm_trained.eval()

TypeError: eval() missing 2 required positional arguments: 'data' and 'name'

In [83]:
def hyperopt_lgbm_score(params):
    model = lgbm.train(lgbm_params, lgbm_train)
    eval_result = model.eval(lgbm_train)
    score = float(eval_result[eval_result.index('.')-1:])
    return -score

In [84]:
lgbm_params = {'objective':'multiclass', 'num_class':3064,
              'n_estimators': 2}

In [86]:
best = fmin(fn=hyperopt_lgbm_score, space=lgbm_params, 
            algo=tpe.suggest, max_evals=10)
print('best:')
print(best)

  0%|                                                                           | 0/10 [00:00<?, ?trial/s, best loss=?]


job exception: eval() missing 1 required positional argument: 'name'






TypeError: eval() missing 1 required positional argument: 'name'

In [14]:
#lgbm_trained = lgbm.train(lgbm_params, lgbm_train)

In [None]:
1/0

In [20]:
from sklearn.metrics import accuracy_score

# Submission

In [15]:
sample_submission = pd.read_csv('data/sample.txt')

In [16]:
##
submission = sample_submission

In [17]:
submission.shape

(61858, 2)

In [None]:
submission.to_csv('submissions/submission_{}'.format(datetime.datetime.now().strftime('%y%m%d_%H%m%S')), index=False)