In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc


In [2]:
train = pd.read_csv('../../data/train_groups.csv')
test  = pd.read_csv('../../data/test_groups.csv')

In [3]:
test['is_train'] = False
train['is_train'] = True
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
data_x = pd.concat(( train.drop('target', 1), test))

In [5]:
title = pd.read_csv('../../data/docs_titles.tsv', sep='\t', encoding='utf-8', lineterminator='\n')

In [64]:
data_x.shape

(28317, 4)

In [6]:
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
import re
russian_stopwords = stopwords.words("russian")
alph = r'[<>\%\(\)abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/#;:!{}_|\-\?.=&<>@\[\]""]'

information = []
for i in title.title:
    try:
        tokens = [token for token in i.lower().split() if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]
        
        text = " ".join(tokens)
        text = re.sub(alph, '', text)
        text = re.sub(r'[\s+\d+\n]', ' ', text)
        information.append(text)
        
    except:
        information.append(np.nan)
    

In [7]:
title['information'] = information

In [54]:
data.to_csv('information.csv', index = False)

In [10]:
tfidf= TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        ngram_range=(1,3),dtype=np.float32)

In [12]:
data = data.replace(np.nan, '')

In [13]:
data_tfidf = tfidf.fit_transform(data.information)

In [14]:
# data_tfidf.shape
data_tfidf = pd.DataFrame(data_tfidf.toarray(), columns = tfidf.get_feature_names())

In [77]:
data_tfidf.to_csv('all_tfidf.csv', index = False)

In [2]:
data = pd.read_csv('information.csv')

In [16]:
data.head()

Unnamed: 0,pair_id,group_id,doc_id,is_train,title,information
0,1,1,15731,True,ВАЗ 21213 | Замена подшипников ступицы | Нива,ваз замена подшипников ступицы нива
1,2,1,14829,True,"Ваз 2107 оптом в Сочи. Сравнить цены, купить п...","ваз оптом сочи сравнить цены, купить потр..."
2,3,1,15764,True,Купить ступица Лада калина2. Трансмиссия - пер...,купить ступица лада калина трансмиссия перехо...
3,4,1,17669,True,Классика 21010 - 21074,классика
4,5,1,14852,True,Ступица Нива — замена подшипника своими руками,ступица нива — замена подшипника своими руками


In [17]:
data_tfidf['doc_id'] = data.doc_id

In [18]:
data_all = data.merge(data_tfidf, how = 'left', on = 'doc_id')

In [23]:
data_all.head()

Unnamed: 0,pair_id,group_id,doc_id,is_train,title,information,абармин,абармин руда,абонентов,аборт,...,ярославля,ярославская,ярославская область,ярославский,ярославского,ярославской,ярославской области,ящик,ящики,ігри
0,1,1,15731,True,ВАЗ 21213 | Замена подшипников ступицы | Нива,ваз замена подшипников ступицы нива,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,14829,True,"Ваз 2107 оптом в Сочи. Сравнить цены, купить п...","ваз оптом сочи сравнить цены, купить потр...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,15764,True,Купить ступица Лада калина2. Трансмиссия - пер...,купить ступица лада калина трансмиссия перехо...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,1,17669,True,Классика 21010 - 21074,классика,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,1,14852,True,Ступица Нива — замена подшипника своими руками,ступица нива — замена подшипника своими руками,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
data_all[data_all.is_train].drop(['title', 'is_train', 'information'], 1).to_csv('train_tfidf.csv', index = False)
data_all[~data_all.is_train].drop(['title', 'is_train', 'information'], 1).to_csv('test_tfidf.csv', index = False)

In [27]:
from xgboost import XGBClassifier 

In [28]:
xgb = XGBClassifier(n_estimators=1200,  random_state = 0, nthread = 4, learning_rate = 0.01,
                        max_depth=4)

In [29]:
y_train = train.target

In [None]:
groups = train.group_id.unique()
X = data_all.drop(['title', 'is_train', 'information'], 1)[data_all['group_id'].isin(groups[:int(len(groups)*4/6)])]
X_val = data_all.drop(['title', 'is_train', 'information'], 1)[data_all['group_id'].isin(groups[int(len(groups)*4/6):])]
y = y_train[train['group_id'].isin(groups[:int(len(groups)*4/6)])]
y_val = y_train[train['group_id'].isin(groups[int(len(groups)*4/6):])]

In [33]:
xgb.fit(X, y, eval_set = eval_set, eval_metric="logloss")

ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields title, information

In [32]:
eval_set = [(X_val, y_val)]


In [3]:
data_tfidf = pd.read_csv('all_tfidf.csv', dtype = 'float32')

KeyboardInterrupt: 

In [78]:
all = 0

In [72]:
all = pd.concat((data_tfidf, data_x), axis = 1, igmoreindex = True)

ValueError: Shape of passed values is (20004, 40007), indices imply (20004, 28317)

In [11]:
data = data_x.merge(title, how = 'left', on = 'doc_id')