In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import linear_model as lm
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
import re
import pickle

# Loading data

In [36]:
# Read the raw comments table from the gzip-compressed CSV (pandas infers the
# compression from the ".gz" suffix). Later cells read the columns `text`,
# `likes`, `from_id` and `is_gum` from this frame.
data = pd.read_csv('data/comments_vrn.csv.gz')

In [39]:
data.shape

(193539, 6)

## Choose the best

In [39]:
data.shape

(193539, 6)

In [207]:
# Thresholds for selecting the "best" comments subset.
min_likes = 4  # keep comments with at least this many likes (inclusive)
max_len = 150  # keep texts strictly shorter than this many characters
min_len = 10  # keep texts strictly longer than this many characters

In [208]:
# Select well-liked comments of moderate length and keep only their text.
text_len = data.text.str.len()
is_liked = data.likes >= min_likes
is_moderate_len = (text_len > min_len) & (text_len < max_len)
best_data = data[is_liked & is_moderate_len][['text']].reset_index(drop=True)

In [190]:
data.text.str.len().quantile(0.9)

158.0

In [209]:
best_data.shape

(10867, 1)

In [219]:
print(best_data.text.sample().values[0])

Фотка в посте просто огонь🔥🔥🔥🔥


In [221]:
best_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10867 entries, 0 to 10866
Data columns (total 1 columns):
text    10867 non-null object
dtypes: object(1)
memory usage: 85.0+ KB


In [281]:
# Persist the selected comments frame as a pickle (presumably consumed by a
# separate app, judging by the directory name - confirm). The target directory
# must already exist: open() does not create it.
with open('data_for_app/best_comments.pkl', 'wb') as f:
    pickle.dump(best_data, f)

In [282]:
%%time
# Round-trip check: read the pickle written above back into a scratch
# variable; the cell magic on the first line reports how long the load takes.
with open('data_for_app/best_comments.pkl', 'rb') as f:
    b = pickle.load(f)

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 16.4 ms


# Preprocessing

## Words count

In [227]:
# Count whitespace-separated tokens per comment, then keep the comments that
# have between 2 and 20 tokens (both bounds inclusive).
tokens_per_comment = [len(text.split()) for text in data.text.values]
lenghts_word = np.array(tokens_per_comment)
has_several_words = lenghts_word > 1
is_short_enough = lenghts_word <= 20
comments = data[has_several_words & is_short_enough]
comments.shape

(149563, 6)

In [226]:
comments.shape

(145819, 3)

## Links

In [228]:
# Drop every comment whose text contains a substring that hints at a URL.
link_markers = ('http', 'www', '.ru', '.com')
without_link = [not any(marker in c for marker in link_markers)
                for c in comments.text.values]
comments = comments[without_link]

### Dropping duplicated (spam) comments

In [229]:
# Texts that occur more than once are treated as spam and removed entirely
# (every copy is dropped, not just the repeats).
text_counts = comments.text.value_counts()
spam_comments = text_counts[text_counts > 1].index
is_spam = comments.text.isin(spam_comments)
comments = comments[~is_spam]
comments_list = comments.text.values

### Replacing id

In [230]:
def replace_id(df):
    """Blank out user-id mentions in ``df.text``, modifying ``df`` in place.

    Every comment is split on whitespace, then:

    * a leading token that starts with ``'[id'`` (a reply mention such as
      ``[id123|Name],``) is replaced by an empty string;
    * any token of the form ``id<digit>...`` (a bare user id such as
      ``id123``) is replaced by an empty string.

    The tokens are re-joined with single spaces, so a blanked token leaves its
    separator behind (e.g. a leading space after a removed reply mention).

    The original implementation collected the filtered tokens in a separate
    list but joined the unfiltered one, so bare ids were never removed. Its
    test was a plain ``startswith('id')``; that is narrowed to ``id`` followed
    by a digit so ordinary words such as "idea" survive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column of strings.

    Returns
    -------
    None
        The ``text`` column of ``df`` is overwritten.
    """
    cleaned = []
    for comment in df.text.values:
        tokens = comment.split()
        # `tokens` is empty for an empty / whitespace-only comment; guard the
        # first-token access instead of raising IndexError.
        if tokens and tokens[0].startswith('[id'):
            tokens[0] = ''
        tokens = ['' if (t[:2] == 'id' and t[2:3].isdigit()) else t
                  for t in tokens]
        cleaned.append(' '.join(tokens))
    # dtype=object keeps the column a plain string column, also when empty.
    df['text'] = np.array(cleaned, dtype=object)

In [231]:
# Strip user-id mentions. replace_id assigns to the `text` column of the frame
# it receives, so `comments` is modified in place and re-running this cell
# operates on already-cleaned text.
replace_id(comments)

# Cross validation method

## Grouping by user

In [232]:
def make_df_balanced(df, by_col):
    """Balance ``df`` on a binary column by oversampling the smaller class.

    Rows of the minority class are drawn at random with replacement (using the
    global ``np.random`` state) and appended until both classes have the same
    number of rows; the result is then shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    by_col : str
        Name of a column whose values are the labels 0 and 1.

    Returns
    -------
    pandas.DataFrame
        A shuffled frame with equal counts of both classes. Index labels are
        carried over from ``df``, so they may repeat.

    Raises
    ------
    ValueError
        If one class has no rows at all while the other does, since there is
        then nothing to oversample from.
    """
    counts = df[by_col].value_counts()
    n_zero = int(counts.get(0, 0))
    n_one = int(counts.get(1, 0))

    if n_zero < n_one:
        big_class, small_class = 1, 0
    else:
        big_class, small_class = 0, 1

    delta = max(n_zero, n_one) - min(n_zero, n_one)
    if delta > 0:
        minority = df[df[by_col] == small_class]
        if len(minority) == 0:
            raise ValueError(
                'Cannot balance by {!r}: class {} has no rows to oversample'
                .format(by_col, small_class))
        # randint's upper bound is exclusive: use len(minority), not
        # len(minority) - 1, so the last row can be drawn and a one-row
        # minority does not produce an empty range.
        extra_positions = np.random.randint(0, len(minority), delta)
        df = pd.concat((df, minority.iloc[extra_positions]))

    # shuffle after adding
    df = df.iloc[np.random.permutation(df.shape[0])]
    return df

In [233]:
class Model:
    """Minimal interface shared by the text classifiers in this notebook.

    Subclasses are expected to override ``fit``, ``predict`` and
    ``predict_proba``; the base implementations raise ``NotImplementedError``.
    (``NotImplemented`` is a non-callable sentinel for binary operators, so
    calling it raised a ``TypeError`` instead.)
    """

    def __init__(self, name='-'):
        # Free-form label used to tell models apart.
        self.name = name

    def fit(self, X, y):
        """Train the model on ``X`` with targets ``y``."""
        raise NotImplementedError()

    def predict(self, X):
        """Return predicted labels for ``X``."""
        raise NotImplementedError()

    def predict_proba(self, X):
        """Return predicted probabilities for ``X``."""
        raise NotImplementedError()

In [234]:
class LrModelCount(Model):
    """Logistic regression on top of ``CountVectorizer`` n-gram counts.

    ``fit`` / ``predict`` / ``predict_proba`` all take a frame-like object and
    read the raw strings from its ``text`` column.
    """

    def __init__(self, name='-', max_features=1000, analyzer='word', ngram_range=(1, 1), penalty='l2', C=1):
        super().__init__(name)
        self.vectorizer = CountVectorizer(
            max_features=max_features,
            analyzer=analyzer,
            ngram_range=ngram_range,
        )
        self.model = lm.LogisticRegression(penalty=penalty, C=C)
        self._fitted = False

    def _features(self, X):
        """Vectorize ``X.text`` with the fitted vocabulary; refuse if unfitted."""
        if not self._fitted:
            raise Exception('Not fitted yet')
        return self.vectorizer.transform(X.text.values)

    def fit(self, X, y):
        """Learn the vocabulary from ``X.text`` and fit the regression on ``y``."""
        features = self.vectorizer.fit_transform(X.text.values)
        self.model.fit(features, y)
        self._fitted = True

    def predict(self, X):
        """Return the class labels predicted by the regression for ``X.text``."""
        return self.model.predict(self._features(X))

    def predict_proba(self, X):
        """Return the regression's per-class probability matrix for ``X.text``."""
        return self.model.predict_proba(self._features(X))

In [235]:
class LrModelTfidf(Model):
    """Logistic regression on top of ``TfidfVectorizer`` features.

    ``fit`` / ``predict`` / ``predict_proba`` all take a frame-like object and
    read the raw strings from its ``text`` column.
    """

    def __init__(self, name='-', max_features=1000, penalty='l2', C=1):
        super().__init__(name)
        self.vectorizer = TfidfVectorizer(max_features=max_features)
        self.model = lm.LogisticRegression(penalty=penalty, C=C)
        self._fitted = False

    def _features(self, X):
        """Vectorize ``X.text`` with the fitted vocabulary; refuse if unfitted."""
        if not self._fitted:
            raise Exception('Not fitted yet')
        return self.vectorizer.transform(X.text.values)

    def fit(self, X, y):
        """Learn the vocabulary from ``X.text`` and fit the regression on ``y``."""
        features = self.vectorizer.fit_transform(X.text.values)
        self.model.fit(features, y)
        self._fitted = True

    def predict(self, X):
        """Return the class labels predicted by the regression for ``X.text``."""
        return self.model.predict(self._features(X))

    def predict_proba(self, X):
        """Return the regression's per-class probability matrix for ``X.text``."""
        return self.model.predict_proba(self._features(X))

In [288]:
import gzip

In [289]:
class AverageModel():
    """Ensemble that combines several sub-models exposing fit/predict/predict_proba.

    * ``predict`` takes the median of the sub-models' hard labels and returns 1
      where that median exceeds 0.5, else 0 (an exact tie therefore yields 0).
    * ``predict_proba`` returns a 1-D array: the mean over sub-models of
      column 1 of their probability matrix (or of the only column when a
      sub-model returns a single one).
    """

    def __init__(self, models):
        self.models = models
        self._fitted = False

    def _ensure_fitted(self):
        """Raise unless ``fit`` or ``load`` has been called."""
        if self._fitted:
            return
        raise Exception('Not fitted yet')

    def fit(self, X, y):
        """Fit every sub-model on the same ``X`` and ``y``."""
        for sub_model in self.models:
            sub_model.fit(X, y)
        self._fitted = True

    def predict(self, X):
        """Return 0/1 integer labels decided by the median sub-model vote."""
        self._ensure_fitted()

        votes = [np.expand_dims(sub_model.predict(X), -1) for sub_model in self.models]
        vote_matrix = np.hstack(votes)
        majority = np.median(vote_matrix, axis=1) > 0.5
        return majority.astype(int)

    def predict_proba(self, X):
        """Return the per-row mean of the sub-models' column-1 probabilities."""
        self._ensure_fitted()

        per_model = []
        for sub_model in self.models:
            proba = sub_model.predict_proba(X)
            # Two or more columns: take column 1; a single column is used as is.
            positive = proba[:, 1] if proba.shape[1] > 1 else proba.ravel()
            per_model.append(positive)

        stacked = np.hstack([np.expand_dims(column, -1) for column in per_model])
        return np.mean(stacked, axis=1)

    def save(self, filename):
        """Pickle the list of sub-models (and nothing else) to ``filename``."""
        with open(filename, 'wb') as sink:
            pickle.dump(self.models, sink)

    def load(self, filename):
        """Replace the sub-models with the pickled list and mark as fitted."""
        with open(filename, 'rb') as source:
            self.models = pickle.load(source)
        self._fitted = True

In [290]:
# Ensemble of five logistic regressions that differ only in their text
# features: one TF-IDF model, two word n-gram count models and two character
# n-gram count models. All share the same L2 penalty and C.
av_model = AverageModel([
    LrModelTfidf(name='lr_tfidf_5k', max_features=5000, penalty='l2', C=0.05),

    LrModelCount(name='lr_count_5k_word_12', max_features=5000,
                 ngram_range=(1, 2), penalty='l2', C=0.05),
    LrModelCount(name='lr_count_10k_word_13', max_features=10000,
                 ngram_range=(1, 3), penalty='l2', C=0.05),

    LrModelCount(name='lr_count_5k_char_23', max_features=5000,
                 analyzer='char', ngram_range=(2, 3), penalty='l2', C=0.05),
    LrModelCount(name='lr_count_2k_char_23', max_features=2000,
                 analyzer='char', ngram_range=(2, 3), penalty='l2', C=0.05),
])

In [262]:
# Split by author so that no user contributes comments to both train and test.
n_coms = 11  # users with at least this many comments take part in the split

# value_counts() sorts by descending frequency, so `unique_ids` lists the most
# active users first. Compute it once and reuse it for both id sets.
comments_per_user = comments.from_id.value_counts()
unique_ids = comments_per_user[comments_per_user >= n_coms].index.values
additional_ids = comments_per_user[comments_per_user < n_coms].index.values

# The first 80% of the active users (the most active ones) go to train and the
# remaining 20% to test.
n_train_users = int(len(unique_ids) * 0.8)
train_idxs = unique_ids[:n_train_users]
test_idxs = unique_ids[n_train_users:]

# Series.isin does a hashed membership test; the previous
# `[i in ids for i in comments.from_id]` scanned the whole id array per row.
train_comments = comments[comments.from_id.isin(train_idxs)]
# Comments of low-activity users are used for training only.
additional_comments = comments[comments.from_id.isin(additional_ids)]
train_comments = pd.concat((train_comments.reset_index(drop=True), additional_comments.reset_index(drop=True)))

test_comments = comments[comments.from_id.isin(test_idxs)]

# Only the training part is balanced; the test part keeps its class ratio.
# NOTE(review): make_df_balanced draws from the global np.random state and no
# seed is set in this notebook, so the training frame differs between runs.
train_comments = make_df_balanced(train_comments, 'is_gum')

X_train, X_test = train_comments, test_comments
y_train, y_test = train_comments.is_gum.values, test_comments.is_gum.values

Используется усреднение вероятностей, а не выбор самого частого предсказанного класса, т.к. второй вариант не учитывает, насколько каждая модель уверена в своём ответе.

In [263]:
test_comments.is_gum.value_counts()

0.0    2158
1.0    1718
Name: is_gum, dtype: int64

In [271]:
# Decision cutoff applied to the averaged probability (`pr > threshold`).
# NOTE(review): 0.493 looks hand-tuned; if it was picked on the test users the
# reported accuracy is optimistic - confirm how it was chosen.
threshold = 0.493

In [291]:
%%time
av_model.fit(X_train, y_train)

CPU times: user 1min 45s, sys: 1.2 s, total: 1min 46s
Wall time: 1min 25s


In [292]:
# 1-D array with one averaged probability per test comment (mean over the
# sub-models, see AverageModel.predict_proba).
pr = av_model.predict_proba(X_test)

In [274]:
# Turn probabilities into hard labels with the cutoff and score them against
# the labels of the held-out users.
accuracy_score(y_test, pr > threshold)

0.53689370485036114

In [294]:
# Pickle the fitted sub-models; AverageModel.save stores only the `models`
# list, so loading requires the model classes to be defined/importable.
av_model.save('data_for_app/av_model.pkl')

In [295]:
another_model = AverageModel([])

In [287]:
%%time
another_model.load('data_for_app/av_model.pkl')

CPU times: user 1.22 s, sys: 92 ms, total: 1.31 s
Wall time: 1.3 s


In [299]:
%%time
another_model.load('data_for_app/av_model.pkl')

CPU times: user 1.8 s, sys: 80 ms, total: 1.88 s
Wall time: 1.88 s


In [279]:
pr = another_model.predict_proba(X_test)

In [280]:
accuracy_score(y_test, pr > threshold)

0.53689370485036114

In [244]:
comments.sample(1).text.values

array(['Ох нытики, опять вам не так'], dtype=object)

In [245]:
# Hand-written sample comments used below as a quick sanity check of the
# ensemble's predictions.
my_coms = ['купи права',
           'автомобиль в студию',
           'а ты сфоткай побольше мест)',
           'Собчак в президенты😂',
           'поэтому бери копейку👍',
           'в твоем сообщение',
           'ну если ты купил диплом',
           'танж?',
           'а ты выглядишь как олень',
           'но если у тебя мозга, тебе нечем делать выводы',
           'блядь',
           'б**дь',
           ' дура'
          ]

In [254]:
my_coms_df = pd.DataFrame(my_coms, columns=['text'])

In [255]:
av_model.predict_proba(my_coms_df)

array([ 0.47520671,  0.41391705,  0.52521049,  0.48708607,  0.46064265,
        0.51721634,  0.44965737,  0.47391301,  0.48048024,  0.49664874,
        0.47562766,  0.46891131,  0.49851683])

In [256]:
av_model.predict_proba(my_coms_df) > threshold

array([False, False,  True, False, False,  True, False, False, False,
        True, False, False,  True], dtype=bool)

In [257]:
# prob = av_model.predict_proba(my_coms_df).mean()

In [258]:
# Median of the boolean "above cutoff" flags, i.e. a majority vote over the
# sample comments: 1.0 when more than half exceed the cutoff, 0.0 when fewer
# than half do, 0.5 on an exact tie.
prob = np.median(av_model.predict_proba(my_coms_df) > threshold)

In [259]:
print('Ваша оценка: {:.2f}'.format(prob))

Ваша оценка: 0.00
