In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder, MinMaxScaler,  Imputer
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVR

from tqdm import tqdm
import time
# import cPickle as pickle
from scipy import sparse
from scipy.sparse import hstack

%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

In [2]:
# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –≤—ã—á–∏—Å–ª–µ–Ω–∏—è –∫–≤–∞–¥—Ä–∞—Ç–Ω–æ–≥–æ –∫–æ—Ä–Ω—è —Å—Ä–µ–¥–Ω–µ–∫–≤–∞–¥—Ä–∞—Ç–∏—á–µ—Å–∫–æ–π –æ—à–∏–±–∫–∏ –ª–æ–≥–∞—Ä–∏—Ñ–º–∞ 
# (Root Mean Squared Logarithmic Error (RMSLE))
def rmsle(y, y_pred):
    y_pred[y_pred < 0.0] = 0.0
    log_sqr = np.square(np.log(np.add(y_pred, 1.0)) - np.log(np.add(y, 1.0)))
    return math.sqrt(np.sum(log_sqr) / y.shape[0])

In [3]:
df = pd.read_csv('data/vk_users_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35517 entries, 0 to 35516
Data columns (total 18 columns):
political     5400 non-null float64
country       34114 non-null object
smoking       6264 non-null float64
sex           35517 non-null int64
id            35517 non-null int64
last_name     35517 non-null object
alcohol       6189 non-null float64
religion      6357 non-null object
langs         4624 non-null object
city          32524 non-null object
relation      14826 non-null float64
age           35517 non-null float64
verified      35517 non-null int64
bdate         29173 non-null object
first_name    35517 non-null object
university    14826 non-null float64
life_main     6239 non-null float64
posts         35517 non-null object
dtypes: float64(7), int64(3), object(8)
memory usage: 4.9+ MB


In [4]:
# –∑–∞–º–µ–Ω–∏–º –Ω–µ —É–∫–∞–∑–∞–Ω–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –Ω–∞ -1
df = df.fillna(-1)
# –í —Å—Ç–æ–ª–±—Ü–µ university —Å–æ–¥–µ—Ä–∂–∏—Ç—Å—è id —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–∞ –ø–æ –ë–î VK. 
# –ë—É–¥–µ–º —Å—á–∏—Ç–∞—Ç—å, —á—Ç–æ –µ—Å–ª–∏ –æ–Ω —É–∫–∞–∑–∞–Ω, —Ç–æ —É –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è –µ—Å—Ç—å –≤—ã—Å—à–µ–µ 
# –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ, –∏–Ω–∞—á–µ –Ω–µ—Ç (—Ö–æ—Ç—è —Å—Ç—Ä–æ–≥–æ —ç—Ç–æ –Ω–µ —Ç–∞–∫, –æ–Ω –º–æ–∂–µ—Ç –±—ã—Ç—å –ø—Ä–æ—Å—Ç–æ 
# –Ω–µ —É–∫–∞–∑–∞–Ω. –ù–æ –ø—Ä–∏–º–µ–º —Ç–∞–∫–æ–µ –ø—Ä–µ–¥–ø–æ–ª–æ–∂–µ–Ω–∏–µ)
df['high_education'] = df['university'].apply(lambda x: 0 if x < -0.5 else 1)
df['age'] = df['age'].astype(int)
df['political'] = df['political'].astype(int)
df['smoking'] = df['smoking'].astype(int)
df['alcohol'] = df['alcohol'].astype(int)
df['relation'] = df['relation'].astype(int)
df['life_main'] = df['life_main'].astype(int)
# df['posts'] = df['posts'].str.decode('utf-8')

In [5]:
# –ß—Ç–æ–±—ã –ø—Ä–µ–≤—Ä–∞—Ç–∏—Ç—å –Ω–∞—à—É –∑–∞–¥–∞—á—É –≤ –∑–∞–¥–∞—á—É –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏, —É–º–µ–Ω—å—à–∏–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –∫–∞—Ç–µ–≥–æ—Ä–∏–π –≤–æ–∑—Ä–∞—Å—Ç–∞
# —Ä–∞—Å—Å–º–æ—Ç—Ä–∏–º –≤–æ–∑—Ä–∞—Å—Ç–∞ –æ—Ç 0 –¥–æ 18; –æ—Ç 18 –¥–æ 30; –æ—Ç 30 –¥–æ 50; –æ—Ç 50 –¥–æ 70 –∏ –æ—Ç 70 –¥–æ 110 - 5 –∫–∞—Ç–µ–≥–æ—Ä–∏–π
def age_cat(age):
    if 0 <= age <= 18:
        return 0
    elif 18 < age <= 30:
        return 1
    elif 30 < age <= 50:
        return 2
    elif 50 < age <= 70:
        return 3
    elif 70 < age <= 110:
        return 4
    
df['age_category'] = df['age'].apply(lambda x: age_cat(x))
df.head()

Unnamed: 0,political,country,smoking,sex,id,last_name,alcohol,religion,langs,city,relation,age,verified,bdate,first_name,university,life_main,posts,high_education,age_category
0,-1,–†–æ—Å—Å–∏—è,-1,1,2615791,–¢—Ä–µ—Ç—å—è–∫–æ–≤–∞,-1,-1,-1,–ú–æ—Å–∫–≤–∞,-1,26,0,5.3.1992,–ê–Ω–∞—Å—Ç–∞—Å–∏—è,-1.0,-1,üå∏üå∏üå∏ #id2615791 (fashionlioness) #–º–æ–¥–µ–ª—å #—Ñ–æ—Ç–æ—Å...,0,1
1,-1,–†–æ—Å—Å–∏—è,-1,2,148071868,–î–º–∏—Ç—Ä–∏–µ–≤,-1,-1,-1,–ö–æ—Å—Ç—Ä–æ–º–∞,-1,18,0,-1,–°–µ—Ä–≥–µ–π,-1.0,-1,–ö–∞—Ä–∞–Ω—Ç–∏–Ω)!!!!!!!!!,0,0
2,3,–†–æ—Å—Å–∏—è,4,1,54774632,–í–ª–∞—Å–æ–≤–∞,4,–ü—Ä–∞–≤–æ—Å–ª–∞–≤–∏–µ,-1,–ü–µ—Ä–º—å,4,110,0,6.7,–ê–Ω—é—Ç–∞,0.0,1,"–ù–µ –≤–∞–∂–Ω–æ, —Å–∫–æ–ª—å–∫–æ –¥–≤–µ—Ä–µ–π –∑–∞–∫—Ä–æ–µ—Ç—Å—è –ø–µ—Ä–µ–¥ —Ç–≤–æ–∏–º...",1,4
3,-1,–†–æ—Å—Å–∏—è,-1,1,76303980,–®–∞–±–∞–ª–∫–æ–≤–∞,-1,-1,-1,–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥,-1,90,0,3.9,–ê–Ω–∞—Å—Ç–∞—Å–∏—è,-1.0,-1,–î—Ä—É–∑—å—è! –Ø —Å–æ–±–∏—Ä–∞—é –±–æ–ª—å—à—É—é –ø–æ—Å—ã–ª–∫—É —Å –ø–æ–º–æ—â—å—é –¥...,0,4
4,-1,–†–æ—Å—Å–∏—è,1,2,104199626,–ë–ª–µ–π—Ö,1,–ü—Ä–∞–≤–æ—Å–ª–∞–≤–∏–µ,"['–†—É—Å—Å–∫–∏–π', 'English']",–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥,1,26,0,6.11,–≠–¥–≥–∞—Ä,1.0,1,–ù–µ –º–æ–π —Å—Ä–µ–¥–∏ —Ç–≤–æ–∏—Ö\n–ò—Å—Ç–∏–Ω–Ω—ã–π –∞—Ä–∏–µ—Ü. –•–∞—Ä–∞–∫—Ç–µ—Ä ‚Äî...,1,1


In [6]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
df_train.head()

Unnamed: 0,political,country,smoking,sex,id,last_name,alcohol,religion,langs,city,relation,age,verified,bdate,first_name,university,life_main,posts,high_education,age_category
15714,-1,–£–∫—Ä–∞–∏–Ω–∞,-1,1,128537508,–õ–µ—Ö–Ω–æ–≤—Å—å–∫–∞,-1,-1,-1,–õ—å–≤–æ–≤,-1,30,0,17.9.1989,–ú–∞—Ä'—è–Ω–∞,-1.0,-1,"–ú–æ—è –∫–≤—ñ—Ç–æ—á–∫–∞üòòüåπüåºüå∏\n–ö–≤—ñ—Ç–æ—á–∫–∞ –Ω–∞—à–∞, –≤—ñ—Ç–∞—î–º–æ —Ç–µ–±–µ ...",0,1
11666,6,–ò—Ç–∞–ª–∏—è,-1,1,132398387,–ü–µ—Ç—Ä–æ—Å—è–Ω,-1,–ü—Ä–∞–≤–æ—Å–ª–∞–≤–∏–µ,-1,Fano,1,54,0,7.11,–ú–∞—Ä–∏–Ω–∞,16542.0,-1,–∑ –¥–Ω–µ–º —Ä–æ–∂–¥–µ–Ω–∏—è!!!!!\n üòÉ –ö–∞—Ä—Ç–∏–Ω–∫–∏ –∏ –æ—Ç–∫—Ä—ã—Ç–∫–∏ ‚ô•...,1,3
17047,-1,–ë–µ–ª–∞—Ä—É—Å—å,-1,1,156080718,–°–æ–∫–æ–ª–æ–≤—Å–∫–∞—è,-1,-1,-1,–ú–∏–Ω—Å–∫,-1,50,0,5.7,–ï–ª–µ–Ω–∞,-1.0,-1,"–í—Å–µ—Ö —Å –ù–æ–≤—ã–º –≥–æ–¥–æ–º –°–æ–±–∞–∫–∏!!! –°—á–∞—Å—Ç—å—è, –õ—é–±–≤–∏,...",0,2
28715,-1,–†–æ—Å—Å–∏—è,-1,1,3687543,–ò–≥–æ–Ω–∏–Ω–∞,-1,–≤–µ—Ä—é –≤ —Å–µ–±—è,-1,–£–ª—å—è–Ω–æ–≤—Å–∫,7,42,0,13.3,–ê–Ω–Ω–∞,871.0,6,–ê–∫—Ç—É–∞–ª—å–Ω–æ–µ –≤—Ä–µ–º—è –Ω–∞ –ù–æ–≤–æ–≥–æ–¥–Ω–∏–µ —Ñ–æ—Ç–æ—Å–µ—Å—Å–∏–∏ –∏ —Ñ–æ...,1,2
27703,-1,–†–æ—Å—Å–∏—è,-1,1,819833,–ó–∞–π—Ü–µ–≤–∞,-1,–≤–µ—Ä—é,-1,–¢—é–º–µ–Ω—å,4,34,0,27.11,–ö—Å–µ–Ω–∏—è,862.0,-1,–ù–∞ —É–ª–∏—Ü–µ –±–æ–∂–µ—Å—Ç–≤–µ–Ω–Ω–∞—è –∫—Ä–∞—Å–æ—Ç–∞üòçüòç –∫–∞–∫ —É –≤–∞—Å –ø–æ—Å–ª...,1,2


In [8]:
df_test.head()

Unnamed: 0,political,country,smoking,sex,id,last_name,alcohol,religion,langs,city,relation,age,verified,bdate,first_name,university,life_main,posts,high_education,age_category
7339,-1,–†–æ—Å—Å–∏—è,-1,2,30163255,–ö—É–∑–∏–Ω—Å–∫–∏–π,-1,-1,-1,–°–æ—á–∏,-1,34,0,15.8.1984,–í–∏—Ç–∞–ª–∏–∫,-1.0,-1,–ó–∞–∫—Ä—ã–ª —Å–æ—Ä–µ–≤–Ω–æ–≤–∞—Ç–µ–ª—å–Ω—ã–π —Å–µ–∑–æ–Ω 17–≥–æ –≥–æ–¥–∞. \n–°—Ç–∞...,0,2
9058,-1,–†–æ—Å—Å–∏—è,-1,2,35057592,–ü—å–µ—Ö–∞,-1,-1,-1,–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥,-1,34,1,-1,–°—Ç–∞—Å,-1.0,-1,–Ø –¥–æ–ª–≥–æ –∂–¥–∞–ª- –ö–æ–≥–¥–∞ –ø—Ä–∏–¥–µ—Ç –µ—ë —á–µ—Ä–µ–¥? –≤–µ–¥—å —ç—Ç–æ ...,0,2
29722,-1,–†–æ—Å—Å–∏—è,-1,1,116910037,–ê–ª–¥–æ—à–∏–Ω–∞,-1,-1,-1,–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥,-1,54,0,20.2.1962,–ï–ª–µ–Ω–∞,-1.0,-1,–ï–ª–µ–Ω–∞ –ø—Ä–æ–≥–Ω–∞–ª–∞ –∏–∑ —Ä–æ—â–∏—Ü—ã –ª—é–±–∏—Ç–µ–ª–µ–π —è–≥–æ–¥ - –º–µ–¥–≤...,0,3
33234,-1,–†–æ—Å—Å–∏—è,-1,2,35303,–í–∞—Ä–ª–∞–º–æ–≤,-1,-1,-1,–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥,-1,86,0,13.11.1930,–õ—ë—à–∞,-1.0,-1,"–° –¥–Ω—ë–º —Ä–æ–∂–¥–µ–Ω–∏—è!!)\n–§–æ—Ä–¥, –≥–æ–≤–æ—Ä–∏—à—å? –ê –ø—Ä–∏–≤–æ–¥ –∫...",0,4
15361,-1,–†–æ—Å—Å–∏—è,-1,2,3159625,–°—Ç–∏–≤,-1,-1,-1,–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥,-1,6,0,666.5,–ù–∞–≤–æ–ª–æ—á–∫–∞,-1.0,-1,–°–ö–ê–ß–ò–í–ê–ï–ú –ò –°–õ–£–®–ê–ï–ú –ù–ê–® MIXTAPE –ó–û–ù–ê 812!\n\n–°...,0,0


In [9]:
# –°–æ—Ö—Ä–∞–Ω–∏–º –Ω–∞–±–æ—Ä—ã –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –¥–∞–ª—å–Ω–µ–π—à–µ–≥–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è
df_train.to_csv('learning/df_train.csv', sep='\t', encoding='utf-8')
df_test.to_csv('learning/df_test.csv', sep='\t', encoding='utf-8')

–°–º—ã—Å–ª –∑–Ω–∞—á–µ–Ω–∏–π –≤ —Å—Ç–æ–ª–±—Ü–∞—Ö sex, political, smoking, alcohol, relation, life_main —Å–ª–µ–¥—É—é—â–∏–π (–∏–∑ –æ–ø–∏—Å–∞–Ω–∏—è API VK):

1) sex - –ø–æ–ª –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è:
1 - –∂–µ–Ω—Å–∫–∏–π;
2 - –º—É–∂—Å–∫–æ–π;
0 - –ø–æ–ª –Ω–µ —É–∫–∞–∑–∞–Ω

2) political - –ø–æ–ª–∏—Ç–∏—á–µ—Å–∫–∏–µ –ø—Ä–µ–¥–ø–æ—á—Ç–µ–Ω–∏—è:

1 - –∫–æ–º–º—É–Ω–∏—Å—Ç–∏—á–µ—Å–∫–∏–µ;
2 - —Å–æ—Ü–∏–∞–ª–∏—Å—Ç–∏—á–µ—Å–∫–∏–µ;
3 - —É–º–µ—Ä–µ–Ω–Ω—ã–µ;
4 - –ª–∏–±–µ—Ä–∞–ª—å–Ω—ã–µ;
5 - –∫–æ–Ω—Å–µ—Ä–≤–∞—Ç–∏–≤–Ω—ã–µ;
6 - –º–æ–Ω–∞—Ä—Ö–∏—á–µ—Å–∫–∏–µ;
7 - —É–ª—å—Ç—Ä–∞–∫–æ–Ω—Å–µ—Ä–≤–∞—Ç–∏–≤–Ω—ã–µ;
8 - –∏–Ω–¥–∏—Ñ—Ñ–∏—Ä–µ–Ω—Ç–Ω—ã–µ;
9 - –ª–∏–±–µ—Ä—Ç–∞—Ä–∏–∞–Ω—Å–∫–∏–µ;

3) smoking, alcohol - –æ—Ç–Ω–æ—à–µ–Ω–∏–µ –∫ –∫—É—Ä–µ–Ω–∏—é, –∞–ª–∫–æ–≥–æ–ª—é:

1 - —Ä–µ–∑–∫–æ –Ω–µ–≥–∞—Ç–∏–≤–Ω–æ–µ;
2 - –Ω–µ–≥–∞—Ç–∏–≤–Ω–æ–µ;
3 - –∫–æ–º–ø—Ä–æ–º–∏—Å—Å–Ω–æ–µ;
4 - –Ω–µ–π—Ç—Ä–∞–ª—å–Ω–æ–µ;
5 - –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–µ;

4) relation - —Å–µ–º–µ–π–Ω–æ–µ –ø–æ–ª–æ–∂–µ–Ω–∏–µ:

1 - –Ω–µ –∂–µ–Ω–∞—Ç/–Ω–µ –∑–∞–º—É–∂–µ–º;
2 - –µ—Å—Ç—å –¥—Ä—É–≥/–µ—Å—Ç—å –ø–æ–¥—Ä—É–≥–∞;
3 - –ø–æ–º–æ–ª–≤–ª–µ–Ω/–ø–æ–º–æ–ª–≤–ª–µ–Ω–∞;
4 - –∂–µ–Ω–∞—Ç/–∑–∞–º—É–∂–µ–º;
5 - –≤—Å—ë —Å–ª–æ–∂–Ω–æ;
6 - –≤ –∞–∫—Ç–∏–≤–Ω–æ–º –ø–æ–∏—Å–∫–µ;
7 - –≤–ª—é–±–ª—ë–Ω/–≤–ª—é–±–ª–µ–Ω–∞;
8 - –≤ –≥—Ä–∞–∂–¥–∞–Ω—Å–∫–æ–º –±—Ä–∞–∫–µ;
0 - –Ω–µ —É–∫–∞–∑–∞–Ω–æ;

5) life_main - –≥–ª–∞–≤–Ω–æ–µ –≤ –∂–∏–∑–Ω–∏:

1 - —Å–µ–º—å—è –∏ –¥–µ—Ç–∏;
2 - –∫–∞—Ä—å–µ—Ä–∞ –∏ –¥–µ–Ω—å–≥–∏;
3 - —Ä–∞–∑–≤–ª–µ—á–µ–Ω–∏—è –∏ –æ—Ç–¥—ã—Ö;
4 - –Ω–∞—É–∫–∞ –∏ –∏—Å—Å–ª–µ–¥–æ–≤–∞–Ω–∏—è;
5 - —Å–æ–≤–µ—Ä—à–µ–Ω—Å—Ç–≤–æ–≤–∞–Ω–∏–µ –º–∏—Ä–∞;
6 - —Å–∞–º–æ—Ä–∞–∑–≤–∏—Ç–∏–µ;
7 - –∫—Ä–∞—Å–æ—Ç–∞ –∏ –∏—Å–∫—É—Å—Å—Ç–≤–æ;
8 - —Å–ª–∞–≤–∞ –∏ –≤–ª–∏—è–Ω–∏–µ

–ü–æ–ø—ã—Ç–∞–µ–º—Å—è –ø—Ä–µ–¥—Å–∫–∞–∑–∞—Ç—å –≤–æ–∑—Ä–∞—Å—Ç (–ø–æ –¥–∞–Ω–Ω—ã–º VK (—Å–∫–æ—Ä–µ–µ –≤—Å–µ–≥–æ, –º–æ–∂–µ—Ç –±—ã—Ç—å —Å –æ—à–∏–±–∫–∞–º–∏: –Ω–µ–∏–∑–≤–µ—Å—Ç–Ω–æ, –∫–∞–∫ –æ–Ω —Ç–∞–º –æ–ø—Ä–µ–¥–µ–ª—è–µ—Ç—Å—è)) –ø–æ –ø–æ—Å—Ç–∞–º –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è, –µ–≥–æ –ø–æ–ª—É (–ø–æ–ª —É–∫–∞–∑–∞–Ω –≤—Å–µ–≥–¥–∞) –∏ —É–∫–∞–∑–∞–Ω–Ω—ã–º –ø—Ä–∏–∑–Ω–∞–∫–∞–º–∏ –∏–∑ —Ä–∞–∑–¥–µ–ª–∞ personal, –∞ —Ç–∞–∫–∂–µ –ø–æ –Ω–∞–ª–∏—á–∏—é –≤—ã—Å—à–µ–≥–æ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è

#### –§—É–Ω–∫—Ü–∏–∏ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –ø–æ—Å—Ç–æ–≤ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è

In [9]:
from nltk.corpus import stopwords

stopwords_en = stopwords.words('english')
stopwords_ru = stopwords.words('russian')
stopwords_ge = stopwords.words('german')

stopwords_all = stopwords_en + stopwords_ru + stopwords_ge

# –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω—ã–µ –±—É–∫–≤–æ—Å–æ—á–µ—Ç–∞–Ω–∏—è, –∫–æ—Ç–æ—Ä—ã–µ –±—É–¥–µ–º —É–¥–∞–ª—è—Ç—å –∏–∑ —Ç–µ–∫—Å—Ç–æ–≤
additional_stopwords = \
[u'https', u'vk', u'com', u'id', u'ph', u'–¥—Ä', u'—Å–≤', u'ff', u'la', u'—ç—Ç–æ', \
 u'de', u'pa', u'bb', u'p', u'—É–ª', u'–∏–Ω', u'http', u'ru', u'md', u'x', \
 u'ft', u'—Å–±', u'b', u'–∫', u'www', u'youtube', u'–∫–∞', u'v', u'g', u'goo', u'gl', \
 u'eu', u'u', u'te', u'un', u'–≤–∫', u'w', u'ly', u'su', u'bu', u'vl', u'—ç—Ç', u'r', u'e', \
 u'—Å–≤–æ–π', u'–µ—â—ë', u'–º–æ–π', u'–≤–µ—Å—å', u'–¥–Ω—ë–º', u'youtu', u'—Ç–≤–æ–π', u'–Ω–∞—à', u'–≤–∞—à', u'—Ç–æ—Ç', u'—ç—Ç–æ—Ç']

stopwords_all = stopwords_all + additional_stopwords

# –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω—ã–µ —É–¥–∞–ª—è–µ–º—ã–µ –ø–æ—Å–ª–µ –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏–∏ —Å–ª–æ–≤–∞
delete_words = [u'—Å–≤–æ–π', u'–µ—â—ë', u'–º–æ–π', u'–≤–µ—Å—å', u'–¥–Ω—ë–º', u'youtu', u'—Ç–≤–æ–π', u'–Ω–∞—à', u'–≤–∞—à', u'—Ç–æ—Ç', u'—ç—Ç–æ—Ç']

In [10]:
# –∏—Å–ø–æ–ª—å–∑—É–µ–º –¥–ª—è –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏–∏ –±–∏–±–ª–∏–æ—Ç–µ–∫—É pymorphy2, –∫–æ—Ç–æ—Ä–∞—è —Ä–∞–±–æ—Ç–∞–µ—Ç —Å —Ä—É—Å—Å–∫–∏–º —è–∑—ã–∫–æ–º
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def lemmatization(text):
    return morph.parse(text)[0].normal_form

def my_tokenizer(text):
    text = text.lower()
    # –æ—á–∏—Å—Ç–∫–∞ –æ—Ç html-—Ä–∞–∑–º–µ—Ç–∫–∏
    text = re.sub('<[^>]*>', '', text)
    # –≤—ã–¥–µ–ª–µ–Ω–∏–µ —Å–º–∞–π–ª–æ–≤
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # —É–¥–∞–ª–µ–Ω–∏–µ –Ω–µ—Å–ª–æ–≤–∞—Ä–Ω—ã—Ö —Å–∏–º–≤–æ–ª–æ–≤
    text = re.sub(r'[\W]+', ' ', text, flags=re.U) + ' '.join(emoticons).replace('-', '')
    # —É–¥–∞–ª–∏–º —Ç–∞–∫–∂–µ –≤—Å–µ —Ü–∏—Ñ—Ä—ã, –∫–æ—Ç–æ—Ä—ã–µ –≤—Ä—è–¥ –ª–∏ –ø–æ–ª–µ–∑–Ω—ã –ø—Ä–∏ –∞–Ω–∞–ª–∏–∑–µ —Ç–µ–∫—Å—Ç–∞
    text = re.sub(r'\d+', '', text, flags=re.U)
    # —É–¥–∞–ª–∏–º –¥–≤–∞ –∏–ª–∏ –±–æ–ª–µ–µ –ø—Ä–æ–±–µ–ª–æ–≤
    text = re.sub(r'[ ]{2,}', '', text, flags=re.U)
    # —É–¥–∞–ª–∏–º –ø–æ–¥—á—ë—Ä–∫–∏–≤–∞–Ω–∏—è
    text = re.sub(r'[_]+', '', text, flags=re.U)
    # —Ä–∞–∑–±–∏–µ–Ω–∏–µ –ø–æ –ø—Ä–æ–±–µ–ª–∞–º –∏ —É–¥–∞–ª–µ–Ω–∏–µ —Å—Ç–æ–ø-—Å–ª–æ–≤
    tokenized = [w for w in text.split() if w not in stopwords_all]
    tokenized = [re.sub(r"\W", "", w, flags=re.U) for w in tokenized]
    # –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è
    tokenized = [lemmatization(w) for w in tokenized]
    # —É–¥–∞–ª–∏–º –≤—Å–µ —Å–ª–æ–≤–∞ –∏–∑ –æ–¥–Ω–æ–π –∏ –¥–≤—É—Ö –±—É–∫–≤. –í—Ä—è–¥ –ª–∏ –æ–Ω–∏ –Ω–µ—Å—É—Ç –±–æ–ª—å—à–æ–π —Å–º—ã—Å–ª. 
    # –¢–∞–∫–∂–µ —É–¥–∞–ª–∏–º "—Å–ª–æ–≤–∞-–ø–∞—Ä–∞–∑–∏—Ç—ã"
    tokenized = [w for w in tokenized if len(w) > 2 and w not in delete_words]
    return tokenized

In [11]:
def get_posts_col(df):
    return df[['posts']]

def get_sex_col(df):
    return df[['sex']]

def get_categorial_cols(df):
    return df[['high_education', 'political', 'smoking', 'alcohol', 'relation', 'life_main']]

vec = make_union(*[
    make_pipeline(FunctionTransformer(get_categorial_cols, validate=False), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_posts_col, validate=False)), CountVectorizer(tokenizer=my_tokenizer)
])

–° pipeline'–æ–º —á—Ç–æ-—Ç–æ –Ω–µ –ø–æ–ª—É—á–∞–µ—Ç—Å—è, –ø–æ—ç—Ç–æ–º—É –∏—Å–ø–æ–ª—å–∑—É–µ–º —Ç–æ–ª—å–∫–æ —Å—Ç–æ–ª–±–µ—Ü —Ç–µ–∫—Å—Ç–∞

In [12]:
y_train = df_train['age_category']
y_test = df_test['age_category']
print(y_train.shape)
print(y_test.shape)

(28413,)
(7104,)


In [15]:
y_train.to_csv('learning/y_train', sep='\t')
y_test.to_csv('learning/y_test', sep='\t')

In [13]:
# —Ñ—É–Ω–∫—Ü–∏–∏ –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è –∏ –ø–æ—Å–ª–µ–¥—É—é—â–µ–≥–æ –≤–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∏—è —Ä–∞–∑—Ä–µ–∂–µ–Ω–Ω–æ–π –º–∞—Ç—Ä–∏—Ü—ã
def save_sparse_csr(filename, array):
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

In [39]:
# vectorizer = CountVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 2))
vectorizer = TfidfVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 1), analyzer='word', max_features=5000)

In [40]:
X_train_bag_of_words = vectorizer.fit_transform(tqdm(df_train['posts']))
X_train_bag_of_words.shape

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28413/28413 [38:22<00:00, 12.34it/s]


(28413, 4694)

### –ò—Å—Å–ª–µ–¥—É–µ–º –ø–æ–ª—É—á–∏–≤—à—É—é—Å—è –º–∞—Ç—Ä–∏—Ü—É tf-idf

In [16]:
features = vectorizer.get_feature_names()

In [17]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [18]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [19]:
top_feats_in_doc(X_train_bag_of_words, features, 1000, top_n=6)

Unnamed: 0,feature,tfidf
0,love,0.61
1,happy,0.36
2,–∫–ª—É–±,0.19
3,–¥–µ–≤–æ—á–∫–∞,0.14
4,–º—É–∂,0.14
5,–¥–µ–Ω—å–≥–∞,0.13


In [20]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [21]:
top_mean_feats(X_train_bag_of_words, features, top_n=10)

Unnamed: 0,feature,tfidf
0,app,0.04
1,—á–µ–ª–æ–≤–µ–∫,0.04
2,—Ä–æ–∂–¥–µ–Ω–∏–µ,0.03
3,–∫–æ—Ç–æ—Ä—ã–π,0.03
4,–æ—Ç–∫—Ä—ã—Ç–∫–∞,0.03
5,–¥—Ä—É–≥,0.03
6,–≥–æ–¥,0.03
7,–∂–∏–∑–Ω—å,0.03
8,–ª—é–±–∏—Ç—å,0.03
9,–Ω–æ–≤—ã–π,0.03


In [22]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y==label)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

In [24]:
top_feats_by_class(X_train_bag_of_words, y_train, features, top_n=5)

[    feature  tfidf
 0       app   0.04
 1    —É–∑–Ω–∞—Ç—å   0.03
 2  —Ä–æ–∂–¥–µ–Ω–∏–µ   0.03
 3      –¥—Ä—É–≥   0.03
 4    –ª—é–±–∏—Ç—å   0.03,      feature  tfidf
 0    —á–µ–ª–æ–≤–µ–∫   0.04
 1  instagram   0.04
 2    –∫–æ—Ç–æ—Ä—ã–π   0.04
 3    —Å–ø–∞—Å–∏–±–æ   0.03
 4       –¥—Ä—É–≥   0.03,     feature  tfidf
 0       app   0.06
 1  –æ—Ç–∫—Ä—ã—Ç–∫–∞   0.05
 2   —á–µ–ª–æ–≤–µ–∫   0.04
 3   –∫–æ—Ç–æ—Ä—ã–π   0.04
 4       –≥–æ–¥   0.03,     feature  tfidf
 0       app   0.07
 1  –æ—Ç–∫—Ä—ã—Ç–∫–∞   0.06
 2   —á–µ–ª–æ–≤–µ–∫   0.04
 3   –∫–æ—Ç–æ—Ä—ã–π   0.03
 4  —Ä–æ–∂–¥–µ–Ω–∏–µ   0.03,     feature  tfidf
 0   —á–µ–ª–æ–≤–µ–∫   0.04
 1  —Ä–æ–∂–¥–µ–Ω–∏–µ   0.04
 2    –ª—é–±–∏—Ç—å   0.04
 3   –∫–æ—Ç–æ—Ä—ã–π   0.03
 4     –∂–∏–∑–Ω—å   0.03]

In [None]:
save_sparse_csr('learning/X_train_bag_of_words', X_train_bag_of_words)

In [21]:
# —Å–æ—Ö—Ä–∞–Ω–∏–º –≤–µ–∫—Ç–æ—Ä–∏–∑–∞—Ç–æ—Ä, —á—Ç–æ–±—ã –Ω–µ —Ç—Ä–∞—Ç–∏—Ç—å –≤—Ä–µ–º—è –Ω–∞ –ø–æ–≤—Ç–æ—Ä–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ
with open('learning/vectorizer.pk', 'wb') as fin:
    pickle.dump(vectorizer, fin)

In [13]:
# vectorizer = pickle.load(open('vectorizer.pk', 'rb'))
# vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function my_tokenizer at 0x7fdf1aef2e60>, use_idf=True,
        vocabulary=None)

In [25]:
# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –ø–æ–ª—É—á–µ–Ω–∏—è –≤–µ–∫—Ç–æ—Ä–∞ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è –æ–±—É—á–µ–Ω–∏—è –≤ –≤–∏–¥–µ —Ä–∞–∑—Ä–µ–∂–µ–Ω–Ω–æ–π –º–∞—Ç—Ä–∏—Ü—ã
def get_X(X_bag_of_words, df):
    scaler = MinMaxScaler()

    # –ø—Ä–∏–±–∞–≤–∏–º –æ—Å—Ç–∞–ª—å–Ω—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏, –∫–æ—Ç–æ—Ä—ã–µ –±—É–¥–µ–º —É—á–∏—Ç—ã–≤–∞—Ç—å –ø—Ä–∏ –æ–±—É—á–µ–Ω–∏–∏ –º–æ–¥–µ–ª–∏
    X_sex = sparse.csr_matrix(get_sex_col(df))

    categorial_cols_scale = scaler.fit_transform(get_categorial_cols(df))
    X_categorial = sparse.csr_matrix(categorial_cols_scale)

    X = hstack([X_sex, X_categorial, X_bag_of_words])

    print(X.shape)
    
    return X

In [41]:
X_train = get_X(X_train_bag_of_words, df_train)

(28413, 4701)


In [42]:
X_test_bag_of_words = vectorizer.transform(tqdm(df_test['posts']))
save_sparse_csr('learning/X_test_bag_of_words', X_test_bag_of_words)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7104/7104 [10:06<00:00, 11.72it/s]


In [43]:
X_test = get_X(X_test_bag_of_words, df_test)

(7104, 4701)


### –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏

In [44]:
def randomized_cv(model, param_grid, x_train, y_train):
    grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy', n_iter=10)
    t_start = time.time()
    grid_search.fit(x_train, y_train)
    t_end = time.time()
    print('model {} best accuracy score is {}'.format(model.__class__.__name__, grid_search.best_score_))
    print('time for training is {} seconds'.format(t_end - t_start))
    return grid_search.best_estimator_

In [45]:
param_grid = {'alpha':[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.5, 2, 5]}
model = MultinomialNB()
best_model = randomized_cv(model, param_grid, X_train, y_train)

model MultinomialNB best accuracy score is 0.4875233167916095
time for training is 6.673130989074707 seconds


In [31]:
with open('learning/model_bayes.pk', 'wb') as fin:
    pickle.dump(best_model, fin)

NameError: name 'pickle' is not defined

In [46]:
y_pred = best_model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.485923423423


–í–∏–¥–∏–º, —á—Ç–æ —Ç–æ—á–Ω–æ—Å—Ç—å –Ω–µ —Å–ª–∏—à–∫–æ–º –≤—ã—Å–æ–∫–∞—è, –Ω–æ –≤—Å—ë –∂–µ –ª—É—á—à–µ —Å–ª—É—á–∞–π–Ω–æ–≥–æ –≥–∞–¥–∞–Ω–∏—è (—É –Ω–∞—Å 5 –∫–ª–∞—Å—Å–æ–≤, –ø–æ—ç—Ç–æ–º—É –ø—Ä–∏ —Å–ª—É—á–∞–π–Ω–æ–º –≥–∞–¥–∞–Ω–∏–∏ –±—ã–ª–æ –±—ã 0.2). –ï—Å—Ç—å –æ—â—É—â–µ–Ω–∏–µ, —á—Ç–æ —Ç–µ–∫—Å—Ç–≤ –ø–æ—Å—Ç–∞—Ö –í–ö –Ω–µ —Å–ª–∏—à–∫–æ–º –∫–æ—Ä—Ä–µ–ª–∏—Ä—É–µ—Ç —Å –≤–æ–∑—Ä–∞—Å—Ç–æ–º –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è, –ø–æ—ç—Ç–æ–º—É —Ö–æ—Ä–æ—à–µ–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ —Å–¥–µ–ª–∞—Ç—å —Ç—è–∂–µ–ª–æ

In [33]:
# –ø–æ–ø—Ä–æ–±—É–µ–º –±–∏–≥—Ä–∞–º–º—ã
vectorizer_2gram = TfidfVectorizer(tokenizer=my_tokenizer, ngram_range=(2, 2), analyzer='word', max_features=5000)
X_train_bag_of_words_2gram = vectorizer_2gram.fit_transform(tqdm(df_train['posts']))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28413/28413 [39:27<00:00, 12.00it/s]


In [34]:
X_train_2gram = get_X(X_train_bag_of_words_2gram, df_train)

(28413, 31)


In [35]:
X_test_bag_of_words_2gram = vectorizer_2gram.transform(tqdm(df_test['posts']))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7104/7104 [09:41<00:00, 12.22it/s]


In [36]:
X_test_2gram = get_X(X_test_bag_of_words_2gram, df_test)

(7104, 31)


In [37]:
model_2gram = MultinomialNB()
best_model_2gram = randomized_cv(model_2gram, param_grid, X_train_2gram, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


model MultinomialNB best accuracy score is 0.4162883187273431
time for training is 0.4641273021697998 seconds


In [38]:
y_pred_2gram = best_model_2gram.predict(X_test_2gram)
print(accuracy_score(y_test, y_pred_2gram))

0.420467342342
