In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import linear_model as lm
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
from stop_words import get_stop_words
import Stemmer
import pymorphy2
from segtok import segmenter
import re
from functools import partial
import pickle
from gensim import corpora, models
from gensim.models import word2vec
import xgboost as xgb 
%matplotlib inline

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.layers.advanced_activations import PReLU
from keras.optimizers import SGD

Using Theano backend.


# Loading data

In [3]:
data = pd.read_csv('data/comments_vrn.csv.gz')

In [4]:
data.shape

(185612, 6)

In [5]:
data.is_gum.value_counts()

0.0    98373
1.0    87239
Name: is_gum, dtype: int64

In [6]:
data.head(3)

Unnamed: 0,from_id,text,is_gum,hour,likes,sex
0,9048238,Вжух даже здесь,0.0,20,1,2
1,9048238,И пишите аккуратнее 😞,0.0,12,3,2
2,9048238,Это #имбрина 😏,0.0,21,0,2


# Preprocessing

## Words count

In [9]:
lenghts_word = np.array([len(m.split()) for m in data.text.values])

In [10]:
lenghts_word.mean(), np.median(lenghts_word)

(11.451069973924099, 7.0)

In [13]:
comments = data[(lenghts_word < 50) & (lenghts_word > 4)]
comments.shape

(114275, 6)

## Links

In [21]:
links = [m for m in data.text.values if 'http' in m or 'www' in m or '.ru' in m or '.com' in m] 
print('{:.2f}% of comments contain links'.format(len(links) / len(data) * 100))

0.54% of comments contain links


In [22]:
without_link = [False if 'http' in c or 'www' in c or '.ru' in c or '.com' in c else True
                for c in comments.text.values] 

In [23]:
comments = comments[without_link]

In [24]:
comments.shape

(113709, 6)

# Dropping outliers

In [120]:
from sklearn.ensemble import IsolationForest

In [210]:
vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 2), analyzer='word', max_df=0.6)
X = vectorizer.fit_transform([' '.join(t.split()[:5]) for t in comments.text.values])
y = comments.is_gum.values
# y = comments.sex.values

In [229]:
vectorizer = CountVectorizer(max_features=1000, ngram_range=(3, 3), analyzer='char')
X = vectorizer.fit_transform([' '.join(t.split()[:4]) for t in comments.text.values])
y = comments.is_gum.values
# y = comments.sex.values

In [231]:
# Flag roughly 1% of the comments as outliers, using the n-gram matrix X built above.
# n_estimators is passed by keyword because recent scikit-learn releases no longer
# accept it positionally, and random_state is pinned so that re-running the cell
# flags the same rows.
forest = IsolationForest(n_estimators=200, contamination=0.01, n_jobs=-1, random_state=0)
forest.fit(X)
X_pred = forest.predict(X)  # -1 marks an outlier, 1 an inlier
# `comments` is a filtered slice of `data`; take an explicit copy before adding
# columns so pandas does not emit SettingWithCopyWarning.
comments = comments.copy()
comments['len'] = [len(t) for t in comments.text.values]
comments['outlier'] = X_pred

In [236]:
comments[comments.outlier == -1].head(10).text.values

array([ 'интересно, а как предполагается пешеходам проходить, по мнению автора? Особенно интересует процесс непересечения парковки? Тротуара там не было и нет.',
       'имя это жизнь...всякое бывает...главное самому не быть таким тогда все будет чики чики👍👍👍👍😉',
       '😂😂😂😂😂😂человечество гибнет без воды... А нам горячую подавай... Купайтесь в холодной',
       'Гибнет общество мораль растоптана... Что ж вы творите молодежь))))??? Отвечаю вам наши старенькие))) -Мы живем мы выживаем как можем 😃😃😃😃😃и осуждать не надо))))',
       'Конечно, насосала на права и свой матиз, а на ездить не успела насосать, теперь ей дороги больше надо, хотя там на камазе проедешь.',
       'Почти все требования уже по разу выполнены))',
       'Не забываем сегодня канал Пятница 12:30 Ревизорро. Воронеж. Full',
       'Такое же проделывал один раз на терминале киви. Только в тот раз рабочий стол не был активным пришлось попотеть пока натыкал кнопку пуск вслепую и дальше все закончилось тем же что в данном п

In [237]:
comments.groupby('outlier').mean()

Unnamed: 0_level_0,from_id,is_gum,hour,likes,sex,em_proportion_rep,em_proportion_no_rep,abc_proportion,len
outlier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1,87726330.0,0.467498,13.575245,2.008905,1.688335,0.001838,0.001357,0.911466,94.389136
1,91253010.0,0.461195,12.719312,1.065852,1.703227,0.002628,0.002032,0.837144,79.340107


## Emoji

In [52]:
comments_list = comments.text.values

In [53]:
# Build the emoji catalogue: only the first character of every line is used.
# NOTE(review): assumes emoji.txt is UTF-8 encoded - confirm; the encoding is given
# explicitly so the result does not depend on the platform default.
with open('emoji.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
emojis = []
_seen_emojis = set()
for line in lines:
    em = line[0]
    # A blank line would contribute '\n' as an "emoji", and a repeated first
    # character would make get_emoji() report the same symbol more than once.
    if em.isspace() or em in _seen_emojis:
        continue
    _seen_emojis.add(em)
    emojis.append(em)

In [54]:
def is_with_emoji(comment):
    """Return 1 when `comment` contains at least one entry of the global `emojis` list, else 0."""
    return 1 if any(symbol in comment for symbol in emojis) else 0

In [55]:
def with_emoji(comments):
    """Return a list with the 0/1 flag from `is_with_emoji` for every comment, in order."""
    return list(map(is_with_emoji, comments))

In [56]:
def get_emoji(comment, repetition=True):
    """Collect the entries of the global `emojis` list that occur in `comment`.

    Parameters
    ----------
    comment : str
        Text to scan.
    repetition : bool, default True
        If True, an emoji is returned once per occurrence; if False, once per
        distinct emoji found.

    Returns
    -------
    list of str
        Found emojis, grouped in the order of the `emojis` list (not in the
        order in which they appear in the text).
    """
    ems = []
    for em in emojis:
        # Count literal occurrences instead of calling re.findall(em, ...):
        # catalogue entries such as '*', '#', '+' or '?' are regex
        # metacharacters and would raise re.error or match the wrong text.
        occurrences = comment.count(em)
        if occurrences:
            ems.extend([em] * (occurrences if repetition else 1))
    return ems

#### If the emoji cache has not been built yet

In [58]:
len(comments_list)

113709

In [None]:
# %time emoji_from_comments_rep = list(map(get_emoji, comments_list))

In [None]:
with open('emoji_from_comments_rep_vrn.pkl', 'wb') as f:
    pickle.dump(emoji_from_comments_rep, f)

#### Otherwise, load the cached result

In [62]:
# Load the cached per-comment emoji lists (with repetition) from the pickle file
# that the "If not done before" cell writes.
# NOTE(review): pickle.load can execute arbitrary code; only load a cache file you produced yourself.
with open('emoji_from_comments_rep_vrn.pkl', 'rb') as f:
    emoji_from_comments_rep = pickle.load(f)
# The variant without repetition is recomputed on every run rather than cached.
emoji_from_comments_no_rep = list(map(lambda com: get_emoji(com, False), comments_list))
# Sanity check: the cache is positional, so it is only usable while it has exactly
# one entry per element of the current comments_list.
len(comments_list), len(emoji_from_comments_rep)

(113709, 113709)

In [61]:
def get_emoji_proportion(comments, emoji_from_coms=None, repetition=True):
    """Share of emoji among the non-space characters of every comment.

    Parameters
    ----------
    comments : sequence of str
        Comment texts.
    emoji_from_coms : sequence of list, optional
        Pre-computed emoji list for every comment, positionally aligned with
        `comments`. When omitted (or empty) it is computed with `get_emoji`.
    repetition : bool, default True
        Forwarded to `get_emoji` when the emoji lists have to be computed.

    Returns
    -------
    numpy.ndarray
        One value per comment: number of emojis found divided by the number of
        non-space characters; 0.0 for a comment without non-space characters.

    Raises
    ------
    ValueError
        If `emoji_from_coms` does not have one entry per comment.
    """
    # Explicit None/length test: `not emoji_from_coms` is ambiguous for numpy arrays.
    if emoji_from_coms is None or len(emoji_from_coms) == 0:
        emoji_from_coms = [get_emoji(com, repetition) for com in comments]
    if len(emoji_from_coms) != len(comments):
        # A longer cache used to be accepted silently and paired comments with
        # the emoji of unrelated rows.
        raise ValueError('emoji_from_coms has {} entries for {} comments'
                         .format(len(emoji_from_coms), len(comments)))

    emoji_proportion = []
    for com, found in zip(comments, emoji_from_coms):
        n_chars = len(com.replace(' ', ''))
        # Guard against empty / space-only text instead of dividing by zero.
        emoji_proportion.append(len(found) / n_chars if n_chars else 0.0)
    return np.array(emoji_proportion)

In [63]:
em_proportion_rep = get_emoji_proportion(comments_list, emoji_from_comments_rep)
em_proportion_no_rep = get_emoji_proportion(comments_list, emoji_from_comments_no_rep)

### With repetition

In [97]:
print((em_proportion_rep > 0.2).sum())
comments_list[em_proportion_rep > 0.2][:10]

158


array(['👍👍👍👍👍👍клааас❗❗❗❗ эта елка будееет самой красивой😂😂😂😂',
       '😂😂😂😂👍👍👍н ну это ж люди....\n😂😂😂блин а когда новый год???😂',
       '😂😂😂😂😂😂вот бы меня на купюру не плохооо было б😂😂😂',
       'И здесь хорошо и там хорошо....👍👍👍👍👍👍👍👍👍',
       'Да что вы... Кому что нравится 😍😍😍😍👍👍👍👍👍👍👍👍',
       'А мне нравится можно вообще не работать 😂😂😂😂😂👍👍👍👍👍👍',
       '😍😍😍😍😍😍и тишина и спокойствие 👍👍👍👍👍',
       'Кому как но Воронеж это воронеж любим и будем любить 😍😍😍😍😍😍😍😍😍👍👍👍👍👍👍👍',
       'А я отправлю :одумайся не глупи замуж не выходи 😂😂😂😂😂😂👍👍👍👍',
       'Оооо подсолнухи 👍👍👍👍😍😍😍😍😍😍👍🌻 🌻 🌻🌻🌻🌻🌻🌻🌻🌻🌻🌻🌻🌻'], dtype=object)

### Without repetition

In [81]:
print((em_proportion_no_rep > 0.07).sum())
comments_list[em_proportion_no_rep > 0.07][:10]

285


array(['Главное чтоб не поздно 😂😂😂👍', '😂😂😂это я с Перми возвращаюсь 😍😍',
       'Оооо подсолнухи 👍👍👍👍😍😍😍😍😍😍👍🌻 🌻 🌻🌻🌻🌻🌻🌻🌻🌻🌻🌻🌻🌻',
       '😂😂😂😂😂это еще что такое 😂😂😂😂😂👍👍👍',
       '👍👍👍👍👍👍👍😍😍😍😍😍😍самый  лучший красивый город 😛😛😛',
       '😂😂😂😂😂👍👍👍👍👍вот те на царица прям', '😂😂😂😂😂👍👍👍👍а я так часто еду 😂😂😂',
       '😂😂😂😂а я ничего себе не скажу 👍👍👍',
       'С горем пополам... ❄  ❄  ❄  ❄  ❄  👍',
       '[id138651229|Никита], И прям перед НГ! 🎄 🎄 🍻 🍻 🍷 🍷 ❄ ❄ ❄ ❄ ❄ ❄'], dtype=object)

### Proportion of alphabetical symbols

In [90]:
def get_abc_proportion(comments):
    """Share of Cyrillic/Latin letters among the non-space characters of every comment.

    Parameters
    ----------
    comments : iterable of str
        Comment texts.

    Returns
    -------
    numpy.ndarray
        One value per comment; 0.0 for a comment without non-space characters.
    """
    abc_proportion = []
    # Iterate positionally instead of using comments[i], which would be
    # label-based for a pandas Series with a non-default index.
    for comment in comments:
        com = comment.replace(' ', '')
        abc = re.findall('[а-яёa-z]', com, flags=re.IGNORECASE)
        # Guard against empty / space-only text instead of dividing by zero.
        abc_proportion.append(len(abc) / len(com) if com else 0.0)
    return np.array(abc_proportion)

In [91]:
abc_proportion = get_abc_proportion(comments_list)

In [92]:
sum(abc_proportion < 0.5)

549

In [93]:
comments_list[abc_proportion < 0.5][:10]

array(['+7 952 104 73 50', '[id33372525|Денис], а кто в 14)?',
       'Оооо подсолнухи 👍👍👍👍😍😍😍😍😍😍👍🌻 🌻 🌻🌻🌻🌻🌻🌻🌻🌻🌻🌻🌻🌻',
       '[id366133865|Андрей], у меня кот))) подойдет????? 😂😂😂😂',
       '[id330930820|Ляська], а у меня тоже.... 😂😂👍',
       '[id322876931|Андрей], 😂😂😂😂 вот блин 😂😂😂',
       '[id16867860|Дмитрий], но уж нет))) 😂😂😂😂',
       '[id42740602|Юлия], будут две 1, 22, 29, 38, 55 и 120А все платные.',
       '[id290551639|Vika], а у вас э',
       '[id138651229|Никита], И прям перед НГ! 🎄 🎄 🍻 🍻 🍷 🍷 ❄ ❄ ❄ ❄ ❄ ❄'], dtype=object)

### Filling DataFrame

In [99]:
comments['emojis'] = [' '.join(e) for e in emoji_from_comments_rep]
comments['em_proportion_rep'] = em_proportion_rep
comments['em_proportion_no_rep'] = em_proportion_no_rep
comments['abc_proportion'] = abc_proportion

In [103]:
comments.head(3)

Unnamed: 0,from_id,text,is_gum,hour,likes,sex,outlier,emojis,em_proportion_rep,em_proportion_no_rep,abc_proportion
6,9048238,"ну ппц, уже просто так телефон не зарядишь... 😆",0.0,22,4,2,1,😆,0.025641,0.025641,0.871795
7,10679122,"[id332962766|Игорь], они кагбэ намекают, что н...",0.0,20,0,2,1,,0.0,0.0,0.818182
8,10679122,"[id386347082|Jeg-Hater], просто вою! Бли-и-н.....",0.0,21,0,2,1,,0.0,0.0,0.739837


#### If a message occurs more than once, drop it (spam)

In [104]:
print('{} different spam comments'.format((comments.text.value_counts() > 1).sum()))

261 different spam comments


In [105]:
print('{} total count of spam comments'
      .format(comments.text.value_counts()[comments.text.value_counts() > 1].values.sum()))

792 total count of spam comments


### Dropping outliers

In [106]:
# Any text that occurs more than once is treated as spam; every copy is removed.
text_counts = comments.text.value_counts()
spam_comments = text_counts[text_counts > 1].keys()
keep_mask = comments.text.apply(lambda text: text not in spam_comments)
comments = comments[keep_mask]

In [111]:
# Rows to drop: emoji-heavy comments, comments that consist mostly of non-letters,
# and any text that still occurs more than once.
# `duplicated(keep=False)` yields one flag per row of `comments`. The previous
# `value_counts() != 1` mask has one entry per *distinct* text, ordered by frequency,
# so it only lined up with the rows while every text was already unique.
index_to_del = comments[(comments.em_proportion_rep > 0.15)
                        | (comments.abc_proportion < 0.5)
                        | comments.text.duplicated(keep=False)].index

In [112]:
index_to_del.shape

(635,)

In [113]:
comments.drop(index_to_del, inplace=True)

In [114]:
comments.shape

(112282, 11)

In [115]:
comments_list = comments.text.values

### Replacing id

In [116]:
# Replace a leading mention token of the form "[id123|Name]," with the placeholder
# word 'имя'. Splitting and re-joining also collapses every run of whitespace
# (including newlines) into a single space.
# The earlier version additionally built a second word list (`c_`) that was never
# used; it is removed here, so the result is unchanged: only the first token of a
# comment is ever replaced.
comments_list = []
for comment in comments.text.values:
    words = comment.split()
    # `words` is empty for blank text; words[0] would raise IndexError there.
    if words and words[0].startswith('[id'):
        words[0] = 'имя'
    comments_list.append(' '.join(words))
comments_list = np.array(comments_list)

In [118]:
comments_list[:3]

array(['ну ппц, уже просто так телефон не зарядишь... 😆',
       'имя они кагбэ намекают, что накручивают левых платежей не очень много. Суки они все.',
       'имя просто вою! Бли-и-н... А знаешь, что такое буква "У" на некоторых автомобилях :)))) (сколько же дебилов на свете,а?)'], 
      dtype='<U421')

In [241]:
comments.text = comments_list

# Cleaning comments

In [238]:
y = comments.is_gum.values
adj_proportion = []
errors = []

In [239]:
def _load_names(path):
    """Read one name per line from `path` and return them as a set of stripped strings."""
    with open(path, 'r') as f:
        return set(name.strip() for name in f)


def clear_comments(comments, with_stemmer=False, with_lemmer=True, to_lower=True, without_names=False,
                without_stop_words=False, min_word_len=None, with_emoji=False):
    """Normalise comment texts into space-joined word strings.

    Every comment is optionally lower-cased, stripped of everything except
    Cyrillic/Latin letters and hyphens, split into words and then stemmed or
    lemmatised.

    Side effects: resets and fills the module-level lists `adj_proportion`
    (share of words whose pymorphy2 tag contains 'ADJ'; only filled on the
    lemmatiser path) and `errors` (original texts that produced no words on
    the lemmatiser path), and prints the number of removed name tokens.

    Parameters
    ----------
    comments : iterable of str
        Raw comment texts.
    with_stemmer : bool
        Stem words with the Russian stemmer; takes precedence over `with_lemmer`.
    with_lemmer : bool
        Replace words by their pymorphy2 normal form (used when `with_stemmer` is False).
    to_lower : bool
        Lower-case the text before tokenising.
    without_names : bool
        Drop words listed in 'names_from_sent.txt' (stemmer path) or
        'names.txt' (lemmatiser path).
    without_stop_words : bool
        Drop Russian stop words before stemming/lemmatising.
    min_word_len : int or None
        If given, drop words shorter than this many characters.
    with_emoji : bool
        Append the emojis found in the original text (via `get_emoji`) as extra tokens.

    Returns
    -------
    list of str
        One cleaned string per input comment.
    """
    global adj_proportion
    global errors
    adj_proportion = []
    errors = []
    cleaned = []
    stop_words = set(get_stop_words('ru'))
    stemmer = Stemmer.Stemmer('russian')
    lemmer = pymorphy2.MorphAnalyzer()

    # Load the name list once up front; it used to be re-read from disk for
    # every single comment.
    names = None
    if without_names:
        if with_stemmer:
            names = _load_names('names_from_sent.txt')
        elif with_lemmer:
            names = _load_names('names.txt')

    names_del = 0
    for raw_comment in comments:
        comment = raw_comment.lower() if to_lower else raw_comment
        comment = re.sub(r'[^а-яА-ЯёЁa-zA-Z\-]', ' ', comment).split()
        if without_stop_words:
            comment = [c for c in comment if c not in stop_words]
        if with_stemmer:
            comment = stemmer.stemWords(comment)
        elif with_lemmer:
            parsed = [lemmer.parse(c)[0] for c in comment]
            comment = [p.normal_form for p in parsed]
            adj = sum(1 for p in parsed if 'ADJ' in str(p.tag))
            if len(comment) == 0:
                errors.append(raw_comment)
                adj_proportion.append(0)
            else:
                adj_proportion.append(adj / len(comment))
        if names is not None:
            before = len(comment)
            comment = [c for c in comment if c not in names]
            names_del += before - len(comment)
        if min_word_len is not None:
            comment = [c for c in comment if len(c) >= min_word_len]
        if with_emoji:
            # Extract the emojis from this very comment. Indexing the global
            # `emoji_from_comments_rep` by position is only valid while that
            # cache is aligned with `comments`, which stops being true as soon
            # as rows are dropped after the cache was built.
            comment.extend(get_emoji(raw_comment))
        cleaned.append(' '.join(comment))
    print('names del: {}'.format(names_del))
    return cleaned

### Word

In [242]:
%%time
clear_coms = clear_comments(comments_list, min_word_len=3, with_emoji=True, with_stemmer=False,
                            with_lemmer=True, without_names=True, without_stop_words=False)

names del: 7797
CPU times: user 9min 47s, sys: 5.8 s, total: 9min 53s
Wall time: 10min 8s


In [243]:
errors

[]

In [244]:
clear_coms[10:15]

['имя вот это номер давать свалить куда-нибудь наш дом или быть давать идиотский совет это мой дом ничуть маленький чем ваш неприятно немой видеть интерьер век',
 'имя про это тот подобный',
 'имя что весь молоко ваш группа назвать собираться прочитать ещё раз если понять мой мысль',
 'имя ваш воинство ком солдат это серьёзно',
 'имя потом попов фонарь вешать причём чекист отнюдь быть мериться пиписька']

In [245]:
adj_proportion[10:15]

[0.14705882352941177,
 0.2857142857142857,
 0.14285714285714285,
 0.09090909090909091,
 0.0]

In [258]:
%%time
vectorizer = CountVectorizer(max_features=10000, min_df=100, ngram_range=(1, 2), analyzer='word')
word_features = vectorizer.fit_transform(clear_coms)

CPU times: user 6.77 s, sys: 44 ms, total: 6.81 s
Wall time: 6.83 s


In [259]:
np.array(vectorizer.get_feature_names())[np.random.randint(0, 1000, 20)]

array(['за', 'вчера', 'баба', 'звонить', 'возить', 'падать', 'мальчик',
       'вызвать', 'искать', 'крыша', 'менять', 'имя быть', 'красиво',
       'новый', 'весна', 'достойный', 'жалко', 'имя ага', 'ниже', 'выпить'], 
      dtype='<U18')

In [266]:
lr = lm.LogisticRegression()

In [261]:
np.mean(cross_val_score(lr, word_features, comments.is_gum, cv=5))

0.52304956426916749

# Features

### Does every sentence start with an uppercase letter?

In [336]:
def sentence_stat(comments):
    """Per-comment sentence statistics.

    Runs of ')' and of '.' are turned into line breaks before the text is
    handed to `segmenter.split_single`.

    Returns
    -------
    (big_letter, sents_count) : tuple of two lists
        big_letter  - integer quotient of "capitalised" sentences by sentences
                      that contain a letter (1 when no sentence has letters).
                      A sentence counts as capitalised when it starts with an
                      uppercase character, contains 'имя', or starts with a
                      double quote followed by an uppercase character.
        sents_count - number of sentences that contain a letter, clamped to 1..4.
    """
    def _counts_as_capitalised(sent):
        if not sent:
            return False
        if sent[0].isupper() or 'имя' in sent:
            return True
        return len(sent) > 1 and sent[0] == '"' and sent[1].isupper()

    def _has_letters(sent):
        stripped = sent.strip()
        if stripped == '':
            return False
        return bool(re.match('.*[a-zа-яё].*', stripped, flags=re.IGNORECASE))

    big_letter = []
    sents_count = []
    for comment in comments:
        broken_up = re.sub('(\)+|\.+)', '\n', comment)
        sents = list(segmenter.split_single(broken_up))
        count = len([s for s in sents if _counts_as_capitalised(s)])
        total = len([s for s in sents if _has_letters(s)])
        big_letter.append(count // total if total else 1)
        sents_count.append(min(max(total, 1), 4))
    return big_letter, sents_count

In [337]:
%%time
big_letter, sents_count = sentence_stat(comments_list)

CPU times: user 4.32 s, sys: 0 ns, total: 4.32 s
Wall time: 4.32 s


In [339]:
pd.Series(sents_count).value_counts()

1    61633
2    30433
3    11832
4     8384
dtype: int64

In [341]:
pd.Series(big_letter).value_counts()

1    75977
0    36305
dtype: int64

### Punctuation count in comment

In [346]:
def punctuation_counts(comments, pattern=r'\(+', partion=False):
    """Detect or measure occurrences of a regex `pattern` in every comment.

    Parameters
    ----------
    comments : iterable of str
        Comment texts.
    pattern : str
        Regular expression to look for (default: one or more opening parentheses).
    partion : bool, default False
        False - return 1/0 flags telling whether the pattern occurs at all.
        True  - return the share of the comment's characters covered by matches.

    Returns
    -------
    list
        One value per comment; in share mode an empty comment yields 0.0
        instead of raising ZeroDivisionError.
    """
    regex = re.compile(pattern)
    if partion:
        shares = []
        for c in comments:
            # Measure the whole match (group 0) so that patterns containing
            # capturing groups are measured by the text they actually cover.
            covered = sum(len(m.group(0)) for m in regex.finditer(c))
            shares.append(covered / len(c) if c else 0.0)
        return shares
    return [1 if regex.search(c) else 0 for c in comments]

In [347]:
commas = punctuation_counts(comments_list, pattern='[\.]{2,}', partion=False)

In [348]:
pd.Series(commas).value_counts()

0    93994
1    18288
dtype: int64

### Mean word length

In [322]:
def mean_word_length(comments):
    """Average length of the Cyrillic words (hyphens allowed) in every comment.

    The text is lower-cased and every character other than а-я, ё and '-' is
    treated as a separator. A comment without any such word gets the value 1.
    """
    def _mean(comment):
        words = re.sub('[^а-яё\-]', ' ', comment.lower()).split()
        if not words:
            return 1
        return sum(map(len, words)) / len(words)

    return [_mean(comment) for comment in comments]

In [324]:
mean_length = mean_word_length(comments_list)

### Caps WORD

In [349]:
def caps_words(comments, partion=False):
    """Detect words written in capitals (runs of 4+ uppercase letters/hyphens).

    Parameters
    ----------
    comments : iterable of str
        Comment texts.
    partion : bool, default False
        False - return 1/0 flags (does the comment contain such a run).
        True  - return the number of runs per whitespace-separated word, in
                percent; a comment without words falls back to the 1/0 flag.

    Returns
    -------
    list
        One value per comment.
    """
    caps = []
    for comment in comments:
        runs = re.findall(r'[А-ЯЁA-Z\-]{4,}', comment)
        # The character class also accepts '-', so a bare '----' would match;
        # only runs that contain at least one letter are counted.
        count = sum(1 for run in runs if run.strip('-'))
        total = len(comment.split())
        if partion and total != 0:
            caps.append(count / total * 100)
        else:
            caps.append(1 if count > 0 else 0)
    return caps

In [350]:
caps = caps_words(comments_list, False)

In [351]:
pd.Series(caps).value_counts()

0    108995
1      3287
dtype: int64

### English words

In [352]:
def eng_words(comments, partion=False):
    """Detect Latin-script words (runs of 3+ Latin letters/hyphens, any case).

    Parameters
    ----------
    comments : iterable of str
        Comment texts.
    partion : bool, default False
        False - return 1/0 flags (does the comment contain such a run).
        True  - return the number of runs per whitespace-separated word, in
                percent; a comment without words falls back to the 1/0 flag.

    Returns
    -------
    list
        One value per comment.
    """
    engs = []
    for comment in comments:
        runs = re.findall(r'[a-z\-]{3,}', comment, flags=re.IGNORECASE)
        # The character class also accepts '-', so a bare '---' would match;
        # only runs that contain at least one letter are counted. The former
        # `!= 'имя'` filter is dropped: a Cyrillic word can never match [a-z].
        count = sum(1 for run in runs if run.strip('-'))
        total = len(comment.split())
        if partion and total != 0:
            engs.append(count / total * 100)
        else:
            engs.append(1 if count > 0 else 0)
    return engs

In [353]:
engs = eng_words(comments_list, False)

In [363]:
pd.Series(engs).value_counts()

0    110628
1      1654
dtype: int64

### Total words

In [421]:
def total_words(comments):
    """Number of whitespace-separated words in every comment, capped at 25."""
    return [min(len(com.split()), 25) for com in comments]

In [422]:
words_count = total_words(comments_list)

In [423]:
pd.Series(words_count).value_counts()

25    13788
5     13100
6     11924
7     10207
8      8874
9      7538
10     6474
11     5563
12     4982
13     4441
14     3844
15     3470
16     2980
17     2691
18     2438
19     2152
20     1858
21     1763
22     1503
23     1387
24     1305
dtype: int64

### Total chars

In [432]:
def total_chars(comments):
    """Number of characters in every comment, capped at 100."""
    return [min(len(com), 100) for com in comments]

In [433]:
chars_count = total_chars(comments_list)

In [434]:
pd.Series(chars_count).value_counts()

100    28340
33      1799
35      1767
38      1750
36      1735
34      1727
37      1696
30      1693
31      1682
32      1679
41      1659
39      1641
28      1635
42      1612
40      1584
43      1557
29      1552
44      1515
45      1485
46      1466
27      1414
47      1409
26      1392
48      1361
49      1327
50      1297
52      1296
51      1273
53      1270
25      1226
       ...  
84       656
80       656
81       635
85       623
87       604
86       603
83       592
90       590
88       569
95       533
91       528
89       526
94       506
93       505
97       484
92       480
20       474
96       456
98       455
99       417
19       335
18       234
17       139
16        78
15        28
14        18
13         6
11         2
12         1
9          1
dtype: int64

### All comments features together

In [435]:
def get_comments_features(coms):
    """Assemble the hand-crafted per-comment features into one DataFrame.

    Most columns are computed from `coms` by the helper functions of this
    notebook. Four columns are NOT derived from `coms`: the two emoji
    proportions and `abc_proportion` are copied from the global `comments`
    DataFrame, and `adj_proportion` is the global list filled by
    `clear_comments`. `coms` therefore has to be row-aligned with both.
    """
    big_letter, sents_count = sentence_stat(coms)
    # (column name, regex) pairs for the 0/1 punctuation flags, in column order.
    punct_columns = [
        ('punct_)', '\)+'),
        ('punct_(', '\(+'),
        ('punct_?', '\?+'),
        ('punct_!', '\!+'),
        ('punct_..', '[\.]{2,}'),
        ('punct_1-9', '[0-9]{1,}'),
        ('punct_"', '".+"'),
    ]

    features = pd.DataFrame()
    features['with_emoji'] = with_emoji(coms)
    features['big_letter'] = big_letter
    features['sents_count'] = sents_count
    for column, regex in punct_columns:
        features[column] = punctuation_counts(coms, pattern=regex)
    features['eng_words'] = eng_words(coms, True)
    features['mean_word_len'] = mean_word_length(coms)
    features['caps'] = caps_words(coms)
    # Columns taken from notebook-level state rather than from `coms`.
    for column in ('em_proportion_rep', 'em_proportion_no_rep'):
        features[column] = comments[column].values
    features['adj_proportion'] = adj_proportion
    features['abc_proportion'] = comments['abc_proportion'].values
    features['words_count'] = total_words(coms)
    features['chars_count'] = total_chars(coms)
    return features

In [436]:
comment_features = get_comments_features(comments_list)

In [437]:
comment_features.describe()

Unnamed: 0,with_emoji,big_letter,sents_count,punct_),punct_(,punct_?,punct_!,punct_..,punct_1-9,"punct_""",eng_words,mean_word_len,caps,em_proportion_rep,em_proportion_no_rep,adj_proportion,abc_proportion,words_count,chars_count
count,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0,112282.0
mean,0.08753,0.676662,1.705803,0.246495,0.046045,0.199043,0.127696,0.162876,0.127776,0.055236,0.248619,4.651674,0.029275,0.00262,0.002025,0.09567,0.837887,12.379651,63.908489
std,0.282611,0.467752,0.930818,0.430972,0.209583,0.399283,0.333753,0.369254,0.333842,0.228441,2.870331,0.910977,0.168576,0.01114,0.007918,0.096873,0.111049,6.678896,27.59794
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5,5.0,9.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.058824,0.0,0.0,0.0,0.0,0.75641,7.0,39.0
50%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.6,0.0,0.0,0.0,0.085714,0.857143,10.0,60.0
75%,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.181818,0.0,0.0,0.0,0.153846,0.933333,17.0,100.0
max,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,110.0,19.2,1.0,0.15,0.15,1.0,1.0,25.0,100.0


In [369]:
from sklearn import ensemble

In [375]:
rf = ensemble.GradientBoostingClassifier()

In [438]:
X = comment_features.values

In [439]:
baseline_scores = cross_val_score(lr, X, y, cv=5)
print(baseline_scores.mean(), baseline_scores.std())

0.501309096063 0.0132788987626


In [377]:
baseline_scores

array([ 0.48975866,  0.47844674,  0.51126648,  0.51603135,  0.51937121])

In [378]:
from sklearn.feature_selection import VarianceThreshold

In [440]:
sel = VarianceThreshold(0.4)

In [441]:
X_ = sel.fit_transform(X)

In [442]:
comment_features.columns.values[sel.get_support()]

array(['sents_count', 'eng_words', 'mean_word_len', 'words_count',
       'chars_count'], dtype=object)

In [443]:
X_.shape

(112282, 5)

In [444]:
baseline_scores = cross_val_score(lr, X_, y, cv=5)
print(baseline_scores.mean(), baseline_scores.std())

0.539596989232 0.0110092089374


In [445]:
lr.fit(X_, y)

LogisticRegressionCV(Cs=[0.1, 1, 10, 100], class_weight=None, cv=None,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)

In [447]:
lr.coef_

array([[ 0.09238615, -0.00393661, -0.10605297, -0.03285124,  0.00487407]])