In [1]:
import pandas as pd
import json
import numpy as np
import datetime
import os

import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.snowball import SnowballStemmer
punctuation=punctuation+'«»'

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import xgboost
import lightgbm as lgbm

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

# Тёплый запуск

In [2]:
data_is_ready = os.path.exists('processed_data/data_processed.csv')
data_merged = os.path.exists('processed_data/data.csv')

# Чтение данных

In [3]:
if not data_merged:
    # файл с разметкой
    with open('data/cluster_final_cut_train.json') as f:
        train = json.load(f)

    train_pd = pd.DataFrame.from_dict(train, orient='index').reset_index()
    train_pd.columns = ['id', 'label']
    train_pd['id']=train_pd['id'].astype(np.int64)
    
    
    # файл с документами
    with open('data/cosmo_content_storage_final_cut.jsonl', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    documents = [json.loads(l) for l in lines]

    ids = [d['doc_id'] for d in documents]
    urls = [d['url'] for d in documents]
    texts = [d.get('description', None) for d in documents]

    documents_pd = pd.DataFrame()
    documents_pd['id'] = ids
    documents_pd['url'] = urls
    documents_pd['text'] = texts
    
    
    data = pd.merge(documents_pd, train_pd, left_on='id', right_on='id', how='left')
    data.to_csv('processed_data/data.csv')
    print('Данные объединены')
else:
    data = pd.read_csv('processed_data/data.csv')
    print('Объединённые данные считаны')

Объединённые данные считаны


# Преобразование данных

In [4]:
if not data_is_ready:
        #Create lemmatizer and stopwords list
    stemmer = SnowballStemmer("russian") 
    russian_stopwords = stopwords.words("russian")

    #Preprocess function
    def preprocess_text(text):
        def del_punct(token):
            return ''.join([l for l in token if l not in punctuation])

        if not text:
            return []

        #в токены
        tokens = text.lower().split(' ')
        #удалить пунктуацию
        tokens = [del_punct(t) for t in tokens]
        #удалить стоп-слова
        tokens = [t for t in tokens if t not in punctuation]
        #стэмминг
        tokens = [stemmer.stem(t) for t in tokens]
        #удалить стоп-слова
        tokens = [t for t in tokens if t not in russian_stopwords]

        return tokens
    
    # Приведение тектс
    data['text_arr'] = data['text'].apply(preprocess_text)
    data['text_processed'] = data['text_arr'].apply(lambda x: ' '.join(x))
    data.to_csv('processed_data/data_processed.csv')
    print('Данные преобразованы')
else:
    data = pd.read_csv('processed_data/data_processed.csv')
    print('Преобразованные данные считаны')

Преобразованные данные считаны


In [5]:
data.loc[pd.isna(data['text_processed']), 'text_processed'] = ''

In [6]:
#сортируем по лейблу, пропуски - в конце
data =data.sort_values(by='label').reset_index().drop(['index'], axis=1)

# отюираем тренй и тест
train = data[pd.notnull(data['label'])]
test = data[pd.isna(data['label'])]

train_size = train.shape[0]

# lifeHack

In [17]:
#train['text_processed']

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [25]:
data_cv = cv.fit_transform(data['text_processed'])

In [30]:
train_cv = data_cv[:train_size]
test_cv = data_cv[train_size:]

# KNN

In [61]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

In [56]:
knnc = KNeighborsClassifier()

In [57]:
params = {'n_neighbors':[5,10,15]}

In [62]:
gscv = GridSearchCV(knnc, params, scoring=make_scorer(accuracy_score))

In [63]:
gscv.fit(train_cv, y=train['label'])



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None, param_grid={'n_neighbors': [5, 10, 15]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(accuracy_score), verbose=0)

In [66]:
gscv.best_score_

0.2000754432289702

In [67]:
gscv.best_params_

{'n_neighbors': 5}

In [69]:
test_predicted = gscv.best_estimator_.predict(test_cv)

In [70]:
test_predicted

array([1192.,  294.,  429., ...,   67.,  194.,   66.])

In [71]:
test['knn_predicted'] = test_predicted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [73]:
tmp = test[['id', 'knn_predicted']]

In [74]:
tmp.columns=['doc_id', 'cat']

In [80]:
tmp['cat'] = tmp['cat'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [81]:
tmp.head()

Unnamed: 0,doc_id,cat
26510,1000029981939875422,1192
26511,1000103932845397396,294
26512,1000115462666782749,429
26513,1000207870420986507,205
26514,1000305001855188840,196


In [82]:
tmp.to_csv('predictions/knn_predicted.csv', index=False)