In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder, MinMaxScaler,  Imputer
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVR

from tqdm import tqdm
import time
# import cPickle as pickle
from scipy import sparse
from scipy.sparse import hstack

%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

In [2]:
df = pd.read_csv('data/vk_users_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34011 entries, 0 to 34010
Data columns (total 22 columns):
bdate         27938 non-null object
langs         4425 non-null object
political     5193 non-null float64
smoking       5987 non-null float64
verified      34011 non-null int64
life_main     5965 non-null float64
alcohol       5915 non-null float64
tv            3272 non-null object
country       32669 non-null object
sex           34011 non-null int64
last_name     34011 non-null object
age           34011 non-null float64
id            34011 non-null int64
posts         34011 non-null object
interests     5383 non-null object
university    14180 non-null float64
first_name    34011 non-null object
city          31156 non-null object
religion      6096 non-null object
movies        4427 non-null object
relation      14180 non-null float64
music         4709 non-null object
dtypes: float64(7), int64(3), object(12)
memory usage: 5.7+ MB


In [3]:
df['interests'] = df['interests'].fillna("empty")
df['movies'] = df['movies'].fillna("empty")
df['music'] = df['music'].fillna("empty")
df['tv'] = df['tv'].fillna("empty")
# –∑–∞–º–µ–Ω–∏–º –Ω–µ —É–∫–∞–∑–∞–Ω–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –Ω–∞ -1
df = df.fillna(-1)
# –í —Å—Ç–æ–ª–±—Ü–µ university —Å–æ–¥–µ—Ä–∂–∏—Ç—Å—è id —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–∞ –ø–æ –ë–î VK. 
# –ë—É–¥–µ–º —Å—á–∏—Ç–∞—Ç—å, —á—Ç–æ –µ—Å–ª–∏ –æ–Ω —É–∫–∞–∑–∞–Ω, —Ç–æ —É –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è –µ—Å—Ç—å –≤—ã—Å—à–µ–µ 
# –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ, –∏–Ω–∞—á–µ –Ω–µ—Ç (—Ö–æ—Ç—è —Å—Ç—Ä–æ–≥–æ —ç—Ç–æ –Ω–µ —Ç–∞–∫, –æ–Ω –º–æ–∂–µ—Ç –±—ã—Ç—å –ø—Ä–æ—Å—Ç–æ 
# –Ω–µ —É–∫–∞–∑–∞–Ω. –ù–æ –ø—Ä–∏–º–µ–º —Ç–∞–∫–æ–µ –ø—Ä–µ–¥–ø–æ–ª–æ–∂–µ–Ω–∏–µ)
df['high_education'] = df['university'].apply(lambda x: 0 if x < -0.5 else 1)
df['age'] = df['age'].astype(int)
df['political'] = df['political'].astype(int)
df['smoking'] = df['smoking'].astype(int)
df['alcohol'] = df['alcohol'].astype(int)
df['relation'] = df['relation'].astype(int)
df['life_main'] = df['life_main'].astype(int)
# df['posts'] = df['posts'].str.decode('utf-8')

In [4]:
# –ß—Ç–æ–±—ã –ø—Ä–µ–≤—Ä–∞—Ç–∏—Ç—å –Ω–∞—à—É –∑–∞–¥–∞—á—É –≤ –∑–∞–¥–∞—á—É –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏, —É–º–µ–Ω—å—à–∏–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –∫–∞—Ç–µ–≥–æ—Ä–∏–π –≤–æ–∑—Ä–∞—Å—Ç–∞
# —Ä–∞—Å—Å–º–æ—Ç—Ä–∏–º –≤–æ–∑—Ä–∞—Å—Ç–∞ –æ—Ç 0 –¥–æ 18; –æ—Ç 18 –¥–æ 30; –æ—Ç 30 –¥–æ 50; –æ—Ç 50 –¥–æ 70 –∏ –æ—Ç 70 –¥–æ 110 - 5 –∫–∞—Ç–µ–≥–æ—Ä–∏–π
def age_cat(age):
    if 0 <= age <= 18:
        return 0
    elif 18 < age <= 30:
        return 1
    elif 30 < age <= 50:
        return 2
    elif 50 < age <= 70:
        return 3
    elif 70 < age <= 110:
        return 4
    
df['age_category'] = df['age'].apply(lambda x: age_cat(x))
df.head()

Unnamed: 0,bdate,langs,political,smoking,verified,life_main,alcohol,tv,country,sex,...,interests,university,first_name,city,religion,movies,relation,music,high_education,age_category
0,28.4,-1,-1,-1,0,-1,-1,empty,–†–æ—Å—Å–∏—è,2,...,empty,-1.0,–í–∏—Ç–∞–ª—è,–ú–æ—Å–∫–≤–∞,-1,empty,-1,empty,0,4
1,-1,-1,-1,-1,0,-1,-1,empty,–†–æ—Å—Å–∏—è,2,...,"üîû–ù–µ –ª–∞–π–∫–∞—é, –≤–∏–¥–µ–æ –Ω–µ —Å–º–æ—Ç—Ä—é, –≤ –≥—Ä—É–ø–ø—ã –Ω–µ –≤—Å—Ç—É–ø...",0.0,–û–ª–µ–≥,-1,-1,empty,0,‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù‚òù,1,2
2,-1,-1,-1,-1,1,-1,-1,empty,–†–æ—Å—Å–∏—è,2,...,empty,0.0,–ê—Ä—Ç—É—Ä,–ù–∞–±–µ—Ä–µ–∂–Ω—ã–µ –ß–µ–ª–Ω—ã,4ayan.ru,empty,0,empty,1,2
3,1.1,-1,8,-1,0,-1,-1,"Zara, Pull and Bear, Massimo Dutti, Bershka, S...",–£–∫—Ä–∞–∏–Ω–∞,2,...,"Zara, Pull and Bear, Massimo Dutti, Bershka, S...",1892.0,Zara,–î–Ω–µ–ø—Ä–æ–ø–µ—Ç—Ä–æ–≤—Å–∫ (–î–Ω–µ–ø—Ä),Buisness,"Zara, Pull and Bear, Massimo Dutti, Bershka, S...",0,"Zara, Pull and Bear, Massimo Dutti, Bershka, S...",1,2
4,6.6.1989,-1,-1,-1,0,-1,-1,empty,–£–∫—Ä–∞–∏–Ω–∞,1,...,empty,-1.0,–ù–∞—Ç–∞–ª—è,–õ—å–≤–æ–≤,-1,empty,-1,empty,0,1


In [5]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
df_train.head()

Unnamed: 0,bdate,langs,political,smoking,verified,life_main,alcohol,tv,country,sex,...,interests,university,first_name,city,religion,movies,relation,music,high_education,age_category
11153,25.3.1936,-1,-1,-1,0,-1,-1,empty,–†–æ—Å—Å–∏—è,2,...,empty,-1.0,–ú–∏—Ö–∞–∏–ª,–ù–∏–∂–Ω–∏–π –ù–æ–≤–≥–æ—Ä–æ–¥,-1,empty,-1,empty,0,4
6352,12.8,"['–†—É—Å—Å–∫–∏–π', 'English']",-1,2,0,-1,1,empty,–†–æ—Å—Å–∏—è,2,...,empty,0.0,–í–∞–ª–µ—Ä–∞,–°–∞–º–∞—Ä–∞,–ü–∞—Å—Ç–∞—Ñ–∞—Ä–∏–∞–Ω—Å—Ç–≤–æ,empty,2,empty,1,0
27363,1.9.2001,-1,-1,-1,0,-1,-1,empty,–†–æ—Å—Å–∏—è,2,...,empty,0.0,–ù–∏–∫–∏—Ç–∞,–ö—Ä–∞—Å–Ω–æ–¥–∞—Ä,-1,empty,6,empty,1,0
19244,29.8.1991,"['–†—É—Å—Å–∫–∏–π', '–£–∫—Ä–∞—ó–Ω—Å—å–∫–∞']",3,4,0,2,5,–®–æ—É –û–ø—Ä—ã,–†–æ—Å—Å–∏—è,1,...,"–ö–ª—É–±—ã, –ø–∞—Ä–Ω–∏, —Ç–∞–Ω—Ü—ã",327.0,–ú–∞—Ä–∏–Ω–∞,–ú–æ—Å–∫–≤–∞,–ü—Ä–∞–≤–æ—Å–ª–∞–≤–∏–µ,"–£–Ω–∏–≤–µ—Ä, –ò–Ω—Ç–µ—Ä–Ω—ã, –û–¥–∏–Ω –¥–æ–º–∞, –ö—Ä–∏–∫, –ó–≤–æ–Ω–æ–∫",5,"–¢–∞, –ø–æ–¥ –∫–æ—Ç–æ—Ä—É—é –º–æ–∂–Ω–æ –ø–æ–¥–≤–∏–≥–∞—Ç—å—Å—è",1,1
29708,29.11.1945,-1,-1,-1,0,-1,-1,empty,–†–æ—Å—Å–∏—è,2,...,empty,-1.0,–ì—Ä–∏—à–∞,–†–∞–¥—É–∂–Ω—ã–π,-1,empty,-1,empty,0,4


In [7]:
df_test.head()

Unnamed: 0,bdate,langs,political,smoking,verified,life_main,alcohol,tv,country,sex,...,interests,university,first_name,city,religion,movies,relation,music,high_education,age_category
11313,27.8,-1,3,1,0,1,1,empty,–†–æ—Å—Å–∏—è,1,...,empty,0.0,–ê–Ω–∞—Å—Ç–∞—Å–∏—è,–ö—Ä–∞—Å–Ω–æ–¥–∞—Ä,–ü—Ä–∞–≤–æ—Å–ª–∞–≤–∏–µ,empty,4,empty,1,4
23533,19.3,"['–†—É—Å—Å–∫–∏–π', 'Gagauz dili', 'T√ºrk√ße', '–£–∫—Ä–∞—ó–Ω—Å—å...",3,2,0,1,2,–Ω–µ –Ω–∞ –≤–∏–∂—É,–£–∫—Ä–∞–∏–Ω–∞,2,...,"–∫—É—à–∞—Ç—å, —Å–ø–∞—Ç—å, –∏–Ω—Ç–µ—Ä–Ω–µ—Ç)",0.0,Dima,–û–¥–µ—Å—Å–∞,–ü—Ä–∞–≤–æ—Å–ª–∞–≤–∏–µ,—Ä–∞–∑–Ω—ã–µ,4,—Ä–∞–∑–Ω—ã–µ,1,1
3386,-1,-1,-1,4,0,3,4,empty,–†–æ—Å—Å–∏—è,2,...,empty,0.0,–°–∞—à–∞,-1,-1,empty,0,empty,1,4
12659,20.6,['English'],-1,2,0,-1,2,empty,–ë–µ–ª–∞—Ä—É—Å—å,2,...,empty,0.0,–ê–ª–µ–∫—Å–µ–π,–°–ª—É—Ü–∫,-1,empty,0,"SUICIDEBOY, BONES, LIL PEEP, IMAGINE DRAGONS, ...",1,4
8037,31.3.1905,-1,-1,-1,0,-1,-1,empty,–†–æ—Å—Å–∏—è,2,...,empty,-1.0,–î–µ–Ω–∏—Å,-1,-1,empty,-1,empty,0,4


In [8]:
# –°–æ—Ö—Ä–∞–Ω–∏–º –Ω–∞–±–æ—Ä—ã –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –¥–∞–ª—å–Ω–µ–π—à–µ–≥–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è
df_train.to_csv('learning/df_train.csv', sep='\t', encoding='utf-8')
df_test.to_csv('learning/df_test.csv', sep='\t', encoding='utf-8')

–°–º—ã—Å–ª –∑–Ω–∞—á–µ–Ω–∏–π –≤ —Å—Ç–æ–ª–±—Ü–∞—Ö sex, political, smoking, alcohol, relation, life_main —Å–ª–µ–¥—É—é—â–∏–π (–∏–∑ –æ–ø–∏—Å–∞–Ω–∏—è API VK):

1) sex - –ø–æ–ª –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è:
1 - –∂–µ–Ω—Å–∫–∏–π;
2 - –º—É–∂—Å–∫–æ–π;
0 - –ø–æ–ª –Ω–µ —É–∫–∞–∑–∞–Ω

2) political - –ø–æ–ª–∏—Ç–∏—á–µ—Å–∫–∏–µ –ø—Ä–µ–¥–ø–æ—á—Ç–µ–Ω–∏—è:

1 - –∫–æ–º–º—É–Ω–∏—Å—Ç–∏—á–µ—Å–∫–∏–µ;
2 - —Å–æ—Ü–∏–∞–ª–∏—Å—Ç–∏—á–µ—Å–∫–∏–µ;
3 - —É–º–µ—Ä–µ–Ω–Ω—ã–µ;
4 - –ª–∏–±–µ—Ä–∞–ª—å–Ω—ã–µ;
5 - –∫–æ–Ω—Å–µ—Ä–≤–∞—Ç–∏–≤–Ω—ã–µ;
6 - –º–æ–Ω–∞—Ä—Ö–∏—á–µ—Å–∫–∏–µ;
7 - —É–ª—å—Ç—Ä–∞–∫–æ–Ω—Å–µ—Ä–≤–∞—Ç–∏–≤–Ω—ã–µ;
8 - –∏–Ω–¥–∏—Ñ—Ñ–∏—Ä–µ–Ω—Ç–Ω—ã–µ;
9 - –ª–∏–±–µ—Ä—Ç–∞—Ä–∏–∞–Ω—Å–∫–∏–µ;

3) smoking, alcohol - –æ—Ç–Ω–æ—à–µ–Ω–∏–µ –∫ –∫—É—Ä–µ–Ω–∏—é, –∞–ª–∫–æ–≥–æ–ª—é:

1 - —Ä–µ–∑–∫–æ –Ω–µ–≥–∞—Ç–∏–≤–Ω–æ–µ;
2 - –Ω–µ–≥–∞—Ç–∏–≤–Ω–æ–µ;
3 - –∫–æ–º–ø—Ä–æ–º–∏—Å—Å–Ω–æ–µ;
4 - –Ω–µ–π—Ç—Ä–∞–ª—å–Ω–æ–µ;
5 - –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–µ;

4) relation - —Å–µ–º–µ–π–Ω–æ–µ –ø–æ–ª–æ–∂–µ–Ω–∏–µ:

1 - –Ω–µ –∂–µ–Ω–∞—Ç/–Ω–µ –∑–∞–º—É–∂–µ–º;
2 - –µ—Å—Ç—å –¥—Ä—É–≥/–µ—Å—Ç—å –ø–æ–¥—Ä—É–≥–∞;
3 - –ø–æ–º–æ–ª–≤–ª–µ–Ω/–ø–æ–º–æ–ª–≤–ª–µ–Ω–∞;
4 - –∂–µ–Ω–∞—Ç/–∑–∞–º—É–∂–µ–º;
5 - –≤—Å—ë —Å–ª–æ–∂–Ω–æ;
6 - –≤ –∞–∫—Ç–∏–≤–Ω–æ–º –ø–æ–∏—Å–∫–µ;
7 - –≤–ª—é–±–ª—ë–Ω/–≤–ª—é–±–ª–µ–Ω–∞;
8 - –≤ –≥—Ä–∞–∂–¥–∞–Ω—Å–∫–æ–º –±—Ä–∞–∫–µ;
0 - –Ω–µ —É–∫–∞–∑–∞–Ω–æ;

5) life_main - –≥–ª–∞–≤–Ω–æ–µ –≤ –∂–∏–∑–Ω–∏:

1 - —Å–µ–º—å—è –∏ –¥–µ—Ç–∏;
2 - –∫–∞—Ä—å–µ—Ä–∞ –∏ –¥–µ–Ω—å–≥–∏;
3 - —Ä–∞–∑–≤–ª–µ—á–µ–Ω–∏—è –∏ –æ—Ç–¥—ã—Ö;
4 - –Ω–∞—É–∫–∞ –∏ –∏—Å—Å–ª–µ–¥–æ–≤–∞–Ω–∏—è;
5 - —Å–æ–≤–µ—Ä—à–µ–Ω—Å—Ç–≤–æ–≤–∞–Ω–∏–µ –º–∏—Ä–∞;
6 - —Å–∞–º–æ—Ä–∞–∑–≤–∏—Ç–∏–µ;
7 - –∫—Ä–∞—Å–æ—Ç–∞ –∏ –∏—Å–∫—É—Å—Å—Ç–≤–æ;
8 - —Å–ª–∞–≤–∞ –∏ –≤–ª–∏—è–Ω–∏–µ

–ü–æ–ø—ã—Ç–∞–µ–º—Å—è –ø—Ä–µ–¥—Å–∫–∞–∑–∞—Ç—å –≤–æ–∑—Ä–∞—Å—Ç (–ø–æ –¥–∞–Ω–Ω—ã–º VK (—Å–∫–æ—Ä–µ–µ –≤—Å–µ–≥–æ, –º–æ–∂–µ—Ç –±—ã—Ç—å —Å –æ—à–∏–±–∫–∞–º–∏: –Ω–µ–∏–∑–≤–µ—Å—Ç–Ω–æ, –∫–∞–∫ –æ–Ω —Ç–∞–º –æ–ø—Ä–µ–¥–µ–ª—è–µ—Ç—Å—è)) –ø–æ –ø–æ—Å—Ç–∞–º –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è, –µ–≥–æ –ø–æ–ª—É (–ø–æ–ª —É–∫–∞–∑–∞–Ω –≤—Å–µ–≥–¥–∞) –∏ —É–∫–∞–∑–∞–Ω–Ω—ã–º –ø—Ä–∏–∑–Ω–∞–∫–∞–º–∏ –∏–∑ —Ä–∞–∑–¥–µ–ª–∞ personal, –∞ —Ç–∞–∫–∂–µ –ø–æ –Ω–∞–ª–∏—á–∏—é –≤—ã—Å—à–µ–≥–æ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è

#### –§—É–Ω–∫—Ü–∏–∏ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –ø–æ—Å—Ç–æ–≤ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è

In [9]:
from nltk.corpus import stopwords

stopwords_en = stopwords.words('english')
stopwords_ru = stopwords.words('russian')
stopwords_ge = stopwords.words('german')

stopwords_all = stopwords_en + stopwords_ru + stopwords_ge

# –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω—ã–µ –±—É–∫–≤–æ—Å–æ—á–µ—Ç–∞–Ω–∏—è, –∫–æ—Ç–æ—Ä—ã–µ –±—É–¥–µ–º —É–¥–∞–ª—è—Ç—å –∏–∑ —Ç–µ–∫—Å—Ç–æ–≤
additional_stopwords = \
[u'https', u'vk', u'com', u'id', u'ph', u'–¥—Ä', u'—Å–≤', u'ff', u'la', u'—ç—Ç–æ', \
 u'de', u'pa', u'bb', u'p', u'—É–ª', u'–∏–Ω', u'http', u'ru', u'md', u'x', \
 u'ft', u'—Å–±', u'b', u'–∫', u'www', u'youtube', u'–∫–∞', u'v', u'g', u'goo', u'gl', \
 u'eu', u'u', u'te', u'un', u'–≤–∫', u'w', u'ly', u'su', u'bu', u'vl', u'—ç—Ç', u'r', u'e', \
 u'—Å–≤–æ–π', u'–µ—â—ë', u'–º–æ–π', u'–≤–µ—Å—å', u'–¥–Ω—ë–º', u'youtu', u'—Ç–≤–æ–π', u'–Ω–∞—à', u'–≤–∞—à', u'—Ç–æ—Ç', u'—ç—Ç–æ—Ç']

stopwords_all = stopwords_all + additional_stopwords

# –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω—ã–µ —É–¥–∞–ª—è–µ–º—ã–µ –ø–æ—Å–ª–µ –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏–∏ —Å–ª–æ–≤–∞
delete_words = [u'—Å–≤–æ–π', u'–µ—â—ë', u'–º–æ–π', u'–≤–µ—Å—å', u'–¥–Ω—ë–º', u'youtu', u'—Ç–≤–æ–π', u'–Ω–∞—à', u'–≤–∞—à', u'—Ç–æ—Ç', u'—ç—Ç–æ—Ç']

In [10]:
# –∏—Å–ø–æ–ª—å–∑—É–µ–º –¥–ª—è –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏–∏ –±–∏–±–ª–∏–æ—Ç–µ–∫—É pymorphy2, –∫–æ—Ç–æ—Ä–∞—è —Ä–∞–±–æ—Ç–∞–µ—Ç —Å —Ä—É—Å—Å–∫–∏–º —è–∑—ã–∫–æ–º
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def lemmatization(text):
    return morph.parse(text)[0].normal_form

def my_tokenizer(text):
    text = text.lower()
    # –æ—á–∏—Å—Ç–∫–∞ –æ—Ç html-—Ä–∞–∑–º–µ—Ç–∫–∏
    text = re.sub('<[^>]*>', '', text)
    # –≤—ã–¥–µ–ª–µ–Ω–∏–µ —Å–º–∞–π–ª–æ–≤
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # —É–¥–∞–ª–µ–Ω–∏–µ –Ω–µ—Å–ª–æ–≤–∞—Ä–Ω—ã—Ö —Å–∏–º–≤–æ–ª–æ–≤
    text = re.sub(r'[\W]+', ' ', text, flags=re.U) + ' '.join(emoticons).replace('-', '')
    # —É–¥–∞–ª–∏–º —Ç–∞–∫–∂–µ –≤—Å–µ —Ü–∏—Ñ—Ä—ã, –∫–æ—Ç–æ—Ä—ã–µ –≤—Ä—è–¥ –ª–∏ –ø–æ–ª–µ–∑–Ω—ã –ø—Ä–∏ –∞–Ω–∞–ª–∏–∑–µ —Ç–µ–∫—Å—Ç–∞
    text = re.sub(r'\d+', '', text, flags=re.U)
    # —É–¥–∞–ª–∏–º –¥–≤–∞ –∏–ª–∏ –±–æ–ª–µ–µ –ø—Ä–æ–±–µ–ª–æ–≤
    text = re.sub(r'[ ]{2,}', '', text, flags=re.U)
    # —É–¥–∞–ª–∏–º –ø–æ–¥—á—ë—Ä–∫–∏–≤–∞–Ω–∏—è
    text = re.sub(r'[_]+', '', text, flags=re.U)
    # —Ä–∞–∑–±–∏–µ–Ω–∏–µ –ø–æ –ø—Ä–æ–±–µ–ª–∞–º –∏ —É–¥–∞–ª–µ–Ω–∏–µ —Å—Ç–æ–ø-—Å–ª–æ–≤
    tokenized = [w for w in text.split() if w not in stopwords_all]
    tokenized = [re.sub(r"\W", "", w, flags=re.U) for w in tokenized]
    # –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è
    tokenized = [lemmatization(w) for w in tokenized]
    # —É–¥–∞–ª–∏–º –≤—Å–µ —Å–ª–æ–≤–∞ –∏–∑ –æ–¥–Ω–æ–π –∏ –¥–≤—É—Ö –±—É–∫–≤. –í—Ä—è–¥ –ª–∏ –æ–Ω–∏ –Ω–µ—Å—É—Ç –±–æ–ª—å—à–æ–π —Å–º—ã—Å–ª. 
    # –¢–∞–∫–∂–µ —É–¥–∞–ª–∏–º "—Å–ª–æ–≤–∞-–ø–∞—Ä–∞–∑–∏—Ç—ã"
    tokenized = [w for w in tokenized if len(w) > 2 and w not in delete_words]
    return tokenized

In [11]:
def get_posts_col(df):
    return df['posts']

def get_sex_col(df):
    return df[['sex']]

def get_interests_col(df):
    return df[df['interests'] != ""]['interests']

def get_movies_col(df):
    return df[df['movies'] != ""]['movies']

def get_music_col(df):
    return df[df['music'] != ""]['music']

def get_tv_col(df):
    return df[df['tv'] != ""]['tv']

def get_categorial_cols(df):
    return df[['high_education', 'political', 'smoking', 'alcohol', 'relation', 'life_main']]

def get_text_sample(df):   
    return df[['posts','interests', 'movies', 'music', 'tv']]

vec = make_union(*[
    make_pipeline(FunctionTransformer(get_categorial_cols, validate=False), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_posts_col, validate=False)), CountVectorizer(tokenizer=my_tokenizer)
])

–° pipeline'–æ–º —á—Ç–æ-—Ç–æ –Ω–µ –ø–æ–ª—É—á–∞–µ—Ç—Å—è, –ø–æ—ç—Ç–æ–º—É –∏—Å–ø–æ–ª—å–∑—É–µ–º —Ç–æ–ª—å–∫–æ —Å—Ç–æ–ª–±–µ—Ü —Ç–µ–∫—Å—Ç–∞

In [12]:
y_train = df_train['age_category']
y_test = df_test['age_category']
print(y_train.shape)
print(y_test.shape)

(27208,)
(6803,)


In [13]:
y_train.to_csv('learning/y_train', sep='\t')
y_test.to_csv('learning/y_test', sep='\t')

In [14]:
# —Ñ—É–Ω–∫—Ü–∏–∏ –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è –∏ –ø–æ—Å–ª–µ–¥—É—é—â–µ–≥–æ –≤–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∏—è —Ä–∞–∑—Ä–µ–∂–µ–Ω–Ω–æ–π –º–∞—Ç—Ä–∏—Ü—ã
def save_sparse_csr(filename, array):
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

In [15]:
# vectorizer = CountVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 2))
vectorizer_posts = TfidfVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 1), analyzer='word', max_features=1000)
vectorizer_interests = TfidfVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 1), analyzer='word', max_features=1000)
vectorizer_movies = TfidfVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 1), analyzer='word', max_features=1000)
vectorizer_music = TfidfVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 1), analyzer='word', max_features=1000)
vectorizer_tv = TfidfVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 1), analyzer='word', max_features=1000)

In [16]:
# –ò—Å–ø–æ–ª—å–∑—É–µ–º —Å—Ä–∞–∑—É –≤—Å–µ —Ç–µ–∫—Å—Ç–æ–≤—ã–µ –∫–æ–ª–æ–Ω–∫–∏
X_train_posts = vectorizer_posts.fit_transform(tqdm(get_posts_col(df_train)))
print(X_train_posts.shape)

X_train_interests = vectorizer_interests.fit_transform(tqdm(get_interests_col(df_train)))
print(X_train_interests.shape)

X_train_movies = vectorizer_movies.fit_transform(tqdm(get_movies_col(df_train)))
print(X_train_movies.shape)

X_train_music = vectorizer_music.fit_transform(tqdm(get_music_col(df_train)))
print(X_train_music.shape)

X_train_tv = vectorizer_tv.fit_transform(tqdm(get_tv_col(df_train)))
print(X_train_tv.shape)

X_train_text = hstack([X_train_posts, X_train_interests, X_train_movies, X_train_music, X_train_tv])
X_train_text.shape

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27208/27208 [1:09:40<00:00,  6.51it/s]
  1%|          | 145/27208 [00:00<00:34, 778.36it/s]

(27208, 1000)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27208/27208 [00:33<00:00, 822.14it/s]
  1%|          | 303/27208 [00:00<00:09, 2945.75it/s]

(27208, 1000)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27208/27208 [00:17<00:00, 1598.96it/s]
  2%|‚ñè         | 414/27208 [00:00<00:06, 4092.73it/s]

(27208, 1000)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27208/27208 [00:11<00:00, 2366.66it/s]
  2%|‚ñè         | 430/27208 [00:00<00:06, 4184.71it/s]

(27208, 1000)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27208/27208 [00:07<00:00, 3414.18it/s]

(27208, 1000)





(27208, 5000)

### –†–∞—Å—Å–º–æ—Ç—Ä–∏–º –ø–æ–ª—É—á–∏–≤—à—É—é—Å—è –º–∞—Ç—Ä–∏—Ü—É tf-idf –¥–ª—è –ø–æ—Å—Ç–æ–≤

In [17]:
features_posts = vectorizer_posts.get_feature_names()
features_interests = vectorizer_interests.get_feature_names()
features_movies = vectorizer_movies.get_feature_names()
features_music = vectorizer_music.get_feature_names()
features_tv = vectorizer_tv.get_feature_names()

In [21]:
print(features_posts[:10])
print(features_interests[:10])
print(features_movies[:10])
print(features_music[:10])
print(features_tv[:10])

[u'adid', u'android', u'app', u'bitcoin', u'club', u'facebook', u'fotomagicsu', u'fotomimi', u'happy', u'instagram']
[u'abc', u'agel', u'aimania', u'akuna', u'alfa', u'alivemax', u'ambre', u'amway', u'apple', u'aquel']
[u'alliance', u'american', u'bad', u'big', u'black', u'capital', u'cash', u'club', u'corp', u'dead']
[u'abba', u'accept', u'acid', u'adele', u'adriano', u'aerosmith', u'age', u'air', u'aka', u'akon']
[u'air', u'alliance', u'allin', u'american', u'androiid', u'apple', u'aston', u'bad', u'bbc', u'big']


In [22]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [23]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [25]:
top_feats_in_doc(X_train_posts, features_posts, 1000, top_n=6)

Unnamed: 0,feature,tfidf
0,–º–æ—Ä,0.49
1,music,0.22
2,–≤—Å—Ç—Ä–µ—á–∞,0.22
3,–Ω–∞—Å—Ç—Ä–æ–µ–Ω–∏–µ,0.2
4,—ë–ª–∫–∞,0.2
5,–¥–æ–º,0.18


In [26]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [27]:
top_mean_feats(X_train_posts, features_posts, top_n=10)

Unnamed: 0,feature,tfidf
0,—á–µ–ª–æ–≤–µ–∫,0.04
1,app,0.04
2,—Ä–æ–∂–¥–µ–Ω–∏–µ,0.03
3,–∫–æ—Ç–æ—Ä—ã–π,0.03
4,–æ—Ç–∫—Ä—ã—Ç–∫–∞,0.03
5,–¥—Ä—É–≥,0.03
6,–≥–æ–¥,0.03
7,–∂–∏–∑–Ω—å,0.03
8,–ª—é–±–∏—Ç—å,0.03
9,–Ω–æ–≤—ã–π,0.02


In [28]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y==label)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

In [29]:
top_feats_by_class(X_train_posts, y_train, features_posts, top_n=5)

[    feature  tfidf
 0       app   0.04
 1      –¥—Ä—É–≥   0.03
 2  —Ä–æ–∂–¥–µ–Ω–∏–µ   0.03
 3    —É–∑–Ω–∞—Ç—å   0.03
 4    –ª—é–±–∏—Ç—å   0.03,      feature  tfidf
 0    —á–µ–ª–æ–≤–µ–∫   0.04
 1    –∫–æ—Ç–æ—Ä—ã–π   0.04
 2  instagram   0.04
 3    —Å–ø–∞—Å–∏–±–æ   0.03
 4       –¥—Ä—É–≥   0.03,     feature  tfidf
 0       app   0.05
 1  –æ—Ç–∫—Ä—ã—Ç–∫–∞   0.05
 2   —á–µ–ª–æ–≤–µ–∫   0.04
 3   –∫–æ—Ç–æ—Ä—ã–π   0.03
 4  —Ä–æ–∂–¥–µ–Ω–∏–µ   0.03,     feature  tfidf
 0       app   0.06
 1  –æ—Ç–∫—Ä—ã—Ç–∫–∞   0.06
 2   —á–µ–ª–æ–≤–µ–∫   0.04
 3  —Ä–æ–∂–¥–µ–Ω–∏–µ   0.03
 4   –∫–æ—Ç–æ—Ä—ã–π   0.03,     feature  tfidf
 0   —á–µ–ª–æ–≤–µ–∫   0.04
 1  —Ä–æ–∂–¥–µ–Ω–∏–µ   0.04
 2    –ª—é–±–∏—Ç—å   0.03
 3     –∂–∏–∑–Ω—å   0.03
 4   –∫–æ—Ç–æ—Ä—ã–π   0.03]

In [32]:
save_sparse_csr('learning/X_train_bag_of_words', X_train_text)

AttributeError: indices not found

In [21]:
# —Å–æ—Ö—Ä–∞–Ω–∏–º –≤–µ–∫—Ç–æ—Ä–∏–∑–∞—Ç–æ—Ä, —á—Ç–æ–±—ã –Ω–µ —Ç—Ä–∞—Ç–∏—Ç—å –≤—Ä–µ–º—è –Ω–∞ –ø–æ–≤—Ç–æ—Ä–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ
with open('learning/vectorizer.pk', 'wb') as fin:
    pickle.dump(vectorizer, fin)

In [13]:
# vectorizer = pickle.load(open('vectorizer.pk', 'rb'))
# vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function my_tokenizer at 0x7fdf1aef2e60>, use_idf=True,
        vocabulary=None)

In [36]:
# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –ø–æ–ª—É—á–µ–Ω–∏—è –≤–µ–∫—Ç–æ—Ä–∞ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è –æ–±—É—á–µ–Ω–∏—è –≤ –≤–∏–¥–µ —Ä–∞–∑—Ä–µ–∂–µ–Ω–Ω–æ–π –º–∞—Ç—Ä–∏—Ü—ã
def get_X(X_bag_of_words, df):
    scaler = MinMaxScaler()

    # –ø—Ä–∏–±–∞–≤–∏–º –æ—Å—Ç–∞–ª—å–Ω—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏, –∫–æ—Ç–æ—Ä—ã–µ –±—É–¥–µ–º —É—á–∏—Ç—ã–≤–∞—Ç—å –ø—Ä–∏ –æ–±—É—á–µ–Ω–∏–∏ –º–æ–¥–µ–ª–∏
    X_sex = sparse.csr_matrix(get_sex_col(df))

    categorial_cols_scale = scaler.fit_transform(get_categorial_cols(df))
    X_categorial = sparse.csr_matrix(categorial_cols_scale)

    X = hstack([X_sex, X_categorial, X_bag_of_words])

    print(X.shape)
    
    return X

In [37]:
X_train = get_X(X_train_text, df_train)

(27208, 5007)


In [38]:
X_test_posts = vectorizer_posts.transform(tqdm(get_posts_col(df_test)))
X_test_interests = vectorizer_interests.transform(tqdm(get_interests_col(df_test)))
X_test_movies = vectorizer_movies.transform(tqdm(get_movies_col(df_test)))
X_test_music = vectorizer_music.transform(tqdm(get_music_col(df_test)))
X_test_tv = vectorizer_tv.transform(tqdm(get_tv_col(df_test)))

X_test_text = hstack([X_test_posts, X_test_interests, X_test_movies, X_test_music, X_test_tv])

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6803/6803 [16:25<00:00,  6.91it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6803/6803 [00:07<00:00, 920.73it/s] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6803/6803 [00:04<00:00, 1698.38it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6803/6803 [00:02<00:00, 2550.11it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6803/6803 [00:01<00:00, 3680.27it/s]


In [39]:
X_test = get_X(X_test_text, df_test)

(6803, 5007)


### –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏

In [40]:
def randomized_cv(model, param_grid, x_train, y_train):
    grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy', n_iter=10)
    t_start = time.time()
    grid_search.fit(x_train, y_train)
    t_end = time.time()
    print('model {} best accuracy score is {}'.format(model.__class__.__name__, grid_search.best_score_))
    print('time for training is {} seconds'.format(t_end - t_start))
    return grid_search.best_estimator_

In [41]:
param_grid = {'alpha':[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.5, 2, 5]}
model = MultinomialNB()
best_model = randomized_cv(model, param_grid, X_train, y_train)

model MultinomialNB best accuracy score is 0.460783593061
time for training is 3.3917350769 seconds


In [31]:
with open('learning/model_bayes.pk', 'wb') as fin:
    pickle.dump(best_model, fin)

NameError: name 'pickle' is not defined

In [42]:
y_pred = best_model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.467146846979


–í–∏–¥–∏–º, —á—Ç–æ —Ç–æ—á–Ω–æ—Å—Ç—å –Ω–µ —Å–ª–∏—à–∫–æ–º –≤—ã—Å–æ–∫–∞—è, –Ω–æ –≤—Å—ë –∂–µ –ª—É—á—à–µ —Å–ª—É—á–∞–π–Ω–æ–≥–æ –≥–∞–¥–∞–Ω–∏—è (—É –Ω–∞—Å 5 –∫–ª–∞—Å—Å–æ–≤, –ø–æ—ç—Ç–æ–º—É –ø—Ä–∏ —Å–ª—É—á–∞–π–Ω–æ–º –≤—ã–±–æ—Ä–µ –±—ã–ª–æ –±—ã 0.2). –ï—Å—Ç—å –æ—â—É—â–µ–Ω–∏–µ, —á—Ç–æ —Ç–µ–∫—Å—Ç–≤ –ø–æ—Å—Ç–∞—Ö –í–ö –Ω–µ —Å–ª–∏—à–∫–æ–º –∫–æ—Ä—Ä–µ–ª–∏—Ä—É–µ—Ç —Å –≤–æ–∑—Ä–∞—Å—Ç–æ–º –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è, –ø–æ—ç—Ç–æ–º—É —Ö–æ—Ä–æ—à–µ–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ —Å–¥–µ–ª–∞—Ç—å —Ç—è–∂–µ–ª–æ

In [43]:
# –ø–æ–ø—Ä–æ–±—É–µ–º –±–∏–≥—Ä–∞–º–º—ã
vectorizer_2gram = TfidfVectorizer(tokenizer=my_tokenizer, ngram_range=(2, 2), analyzer='word', max_features=5000)
X_train_bag_of_words_2gram = vectorizer_2gram.fit_transform(tqdm(get_posts_col(df_train)))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27208/27208 [1:08:17<00:00,  6.64it/s]


In [45]:
X_train_2gram = get_X(X_train_bag_of_words_2gram, df_train)

(27208, 5007)


In [46]:
X_test_bag_of_words_2gram = vectorizer_2gram.transform(tqdm(get_posts_col(df_test)))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6803/6803 [17:17<00:00,  6.56it/s]


In [47]:
X_test_2gram = get_X(X_test_bag_of_words_2gram, df_test)

(6803, 5007)


In [48]:
model_2gram = MultinomialNB()
best_model_2gram = randomized_cv(model_2gram, param_grid, X_train_2gram, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)


model MultinomialNB best accuracy score is 0.475926198177
time for training is 1.65328288078 seconds


In [49]:
y_pred_2gram = best_model_2gram.predict(X_test_2gram)
print(accuracy_score(y_test, y_pred_2gram))

0.474496545642


In [60]:
# —Ä–∞—Å—Å–º–æ—Ç—Ä–∏–º –≤–∞—Ä–∏–∞–Ω—Ç –±–µ–∑ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è –æ–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω–æ–≥–æ –Ω–∞–º–∏ tokenizer'–∞
vectorizer_standard = TfidfVectorizer(max_features=10000)
X_train_bag_of_words_standard = vectorizer_standard.fit_transform(tqdm(get_posts_col(df_train)))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27208/27208 [00:22<00:00, 1183.00it/s]


In [61]:
X_train_standard = get_X(X_train_bag_of_words_standard, df_train)

(27208, 10007)


In [62]:
X_test_bag_of_words_standard = vectorizer_standard.transform(tqdm(get_posts_col(df_test)))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6803/6803 [00:06<00:00, 1046.56it/s]


In [63]:
model_standard = MultinomialNB()
best_model_standard = randomized_cv(model_standard, param_grid, X_train_standard, y_train)

model MultinomialNB best accuracy score is 0.488496030579
time for training is 9.50514888763 seconds


In [64]:
X_test_standard = get_X(X_test_bag_of_words_standard, df_test)
y_pred_standard = best_model_standard.predict(X_test_standard)
print(accuracy_score(y_test, y_pred_standard))

(6803, 10007)
0.492870792298


### –ù–µ–∫–æ—Ç–æ—Ä—ã–µ –≤—ã–≤–æ–¥—ã

1) –¢–µ–∫—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ VK –∏ –≤–æ–∑—Ä–∞—Å—Ç –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è –∫–æ—Ä—Ä–µ–ª–∏—Ä—É—é, –Ω–æ –Ω–µ–∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–æ. –ò—Å–ø–æ–ª—å–∑—É—è —Ç–æ–ª—å–∫–æ —Ç–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ, –ø—Ä–µ–¥—Å–∫–∞–∑–∞—Ç—å –≤–æ–∑—Ä–∞—Å—Ç —Å –≤—ã—Å–æ–∫–æ–π —Ç–æ—á–Ω–æ—Å—Ç—å—é –Ω–µ –ø–æ–ª—É—á–∏—Ç—Å—è. –î–æ—Å—Ç–∏–≥–Ω—É—Ç–∞—è —Ç–æ—á–Ω–æ—Å—Ç—å —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç –æ–∫–æ–ª–æ 0,5 –Ω–∞ 5 –∫–ª–∞—Å—Å–∞—Ö, —á—Ç–æ –ø—Ä–∏–º–µ—Ä–Ω–æ –≤ 2,5 —Ä–∞–∑–∞ –ª—É—á—à–µ —Å–ª—É—á–∞–π–Ω–æ–≥–æ –≥–∞–¥–∞–Ω–∏—è. 

2) –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ –±–∏–≥—Ä–∞–º–º (–ø–∞—Ä —Å–ª–æ–≤) –Ω–µ—Å–∫–æ–ª—å–∫–æ —É–≤–µ–ª–∏—á–∏–≤–∞–µ—Ç —Ç–æ—á–Ω–æ—Å—Ç—å, –Ω–æ –Ω–µ–∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–æ.

3) –í—ã–±–æ—Ä –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ "–æ—Å—Ç–∞–≤–ª—è–µ–º—ã—Ö" –≤ –Ω–∞–±–æ—Ä–µ —Å–ª–æ–≤ –≤–ª–∏—è–µ—Ç –Ω–∞ —Ç–æ—á–Ω–æ—Å—Ç—å, –Ω–æ –∫–∞–∫ –≤—ã–±—Ä–∞—Ç—å –æ–ø—Ç–∏–º–∞–ª–¥—å–Ω–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ, –Ω–µ —Å–ª–∏—à–∫–æ–º –ø–æ–Ω—è—Ç–Ω–æ. 