In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

## Load data

In [24]:
df = pd.read_csv('vk-crawl.csv', sep=',')
df = df.dropna()
df = df[df['text'].str.contains('[a-zA-Zа-яА-Я]')]
df = df.sample(frac=1).reset_index(drop=True)

df.head()

Unnamed: 0,id,text,sex,city,age,source
0,181158,Счастье - личный выбор каждого!,1,2,33,1
1,33761,Работа моя. Работушка. Работонька! Кормилица т...,2,2,30,1
2,429047370,"В любимом человеке нравятся даже недостатки, а...",1,2,17,1
3,236834920,"некоторым не то что-нет надо было сказать,а во...",1,2,21,1
4,294621065,РАБОТА! НУЖНЫ ОРГАНИЗАТОРЫ ПО УПРАВЛЕНИЮ ИНТЕР...,1,2,21,1


## Predict sex

In [165]:
idxs = []
m = min(df['sex'].value_counts())
for a in range(1,3):
    idxs.extend(df.index[df['sex'] == a].tolist()[:m])
dfs = df.loc[idxs]
print('Dataset size: {}'.format(dfs.shape[0]))

Dataset size: 302880


In [166]:
y = dfs['sex'].values - 1
corpus = dfs['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [27]:
y_train_probas = []
y_test_probas = []

In [45]:
# word case
vect = TfidfVectorizer(ngram_range=(1,2), min_df=3, analyzer='word')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

In [46]:
## Naive Bayes
from sklearn.naive_bayes import MultinomialNB

sex_predictor = MultinomialNB()
sex_predictor.fit(X_train, y_train)

y_train_probas.append(sex_predictor.predict_proba(X_train))
y_test_probas.append(sex_predictor.predict_proba(X_test))

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [47]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

Train 0.724833267301
Test 0.637925911252
Base mean 0.5


In [48]:
sex_predictor = LogisticRegression(C=1.0)
sex_predictor.fit(X_train, y_train)

y_train_probas.append(sex_predictor.predict_proba(X_train))
y_test_probas.append(sex_predictor.predict_proba(X_test))

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [49]:
inv_vocab = {v: k for k, v in vect.vocabulary_.items()}
df_sex_demo = pd.DataFrame()
df_sex_demo['F'] = [inv_vocab[i] for i in np.argsort(sex_predictor.coef_[0])[:20]] 
df_sex_demo['M'] = [inv_vocab[i] for i in np.argsort(sex_predictor.coef_[0])[-20:]]

In [50]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

Train 0.748068541997
Test 0.653146460644
Base mean 0.5


In [62]:
# char case
vect = TfidfVectorizer(ngram_range=(1,5), min_df=10, analyzer='char')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

In [103]:
sex_predictor = MultinomialNB(alpha=1e-5)
sex_predictor.fit(X_train, y_train)

# y_train_probas.append(sex_predictor.predict_proba(X_train))
# y_test_probas.append(sex_predictor.predict_proba(X_test))

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [104]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

('Train', 0.6772176631496678)
('Test', 0.64347287781474893)
('Base mean', 0.50000773814129851)


In [63]:
sex_predictor = LogisticRegression(C=1.0)
sex_predictor.fit(X_train, y_train)

y_train_probas.append(sex_predictor.predict_proba(X_train))
y_test_probas.append(sex_predictor.predict_proba(X_test))

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [64]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

Train 0.753734977549
Test 0.670199418912
Base mean 0.5


In [43]:
y_train__stacked = np.argmax(np.mean(np.c_[y_train_probas], axis=0), axis=1)
y_test__stacked = np.argmax(np.mean(np.c_[y_test_probas], axis=0), axis=1)

In [44]:
print('Train', accuracy_score(y_train__stacked, y_train))
print('Test', accuracy_score(y_test__stacked, y_test))

Train 0.757573131273
Test 0.664124405705


In [185]:
df_sex_demo.head(20)

Unnamed: 0,F,M
0,счастлива,хоккей
1,рада,счастлив
2,счастливая,цска
3,счастливой,футбол
4,мальчики,влюблен
5,мужа,влюблён
6,наращивание,пацаны
7,парня,крут
8,замужем,стал
9,маникюр,свободен


In [167]:
class CModel:
    def __init__(self):
        self.word_vect = TfidfVectorizer(ngram_range=(1,2), min_df=3, analyzer='word')
        self.char_vect = TfidfVectorizer(ngram_range=(1,5), min_df=10, analyzer='char')
        self.word_predictor = MultinomialNB()
        self.char_predictor = LogisticRegression()
    
    def fit(self, X, y):
        self.word_vect.fit(X)
        X_train = self.word_vect.transform(X)
        self.word_predictor.fit(X_train, y)
        
        self.char_vect.fit(X)
        X_train = self.char_vect.transform(X)
        self.char_predictor.fit(X_train, y)
    
    def predict(self, X):
        y_probas = []
        
        X_target = self.word_vect.transform(X)
        y_probas.append(self.word_predictor.predict_proba(X_target))
        X_target = self.char_vect.transform(X)
        y_probas.append(self.char_predictor.predict_proba(X_target))
        
        return np.argmax(np.mean(np.c_[y_probas], axis=0), axis=1)

In [77]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True)

train_scores, test_scores = [], []

for train, test in kf.split(corpus, y):
    cmodel = CModel()
    cmodel.fit(corpus[train], y[train])
    train_scores.append(accuracy_score(cmodel.predict(corpus[train]), y[train]))
    test_scores.append(accuracy_score(cmodel.predict(corpus[test]), y[test]))
    print('Train: {}, Test: {}'.format(train_scores[-1], test_scores[-1]))

print('Mean. Train: {}, Test: {}'.format(np.mean(train_scores), np.mean(test_scores)))

KeyboardInterrupt: 

nb
Train: 0.7617827192287374, Test: 0.6695390913893291
Train: 0.7622532025884838, Test: 0.6705460908610671
Train: 0.7615846209720021, Test: 0.6731378763866878
Train: 0.7624678090332805, Test: 0.6708432382461701
Train: 0.7618116085578447, Test: 0.6720813523507659
Mean. Train: 0.7619799920760697, Test: 0.671229529846804

In [164]:
dfe = pd.read_csv('vk-crawl_v2.csv', sep=',')
dfe = dfe[dfe['source'] == 2]
dfe = dfe.dropna()
dfe = dfe[dfe['text'].str.contains('[a-zA-Zа-яА-Я]')]
dfe = dfe.sample(frac=1).reset_index(drop=True)

dfe.head()

Unnamed: 0,age,city,id,sex,source,text
0,35,1,32114626,1,2,Зима пришла) урааа!😃У нас валит снег❄❄❄
1,36,1,342265186,1,2,"Дорогие друзья , ждём вас на нашем концерте в ..."
2,34,1,6288990,1,2,И ещё немного удовольствия от Сарьяна
3,18,1,101768973,1,2,"Если судьбе будет угодно,она не раз ещё сведёт..."
4,33,2,599885,1,2,Уииии


In [171]:
y_eval = dfe['sex'].values - 1
corpus_eval = dfe['text'].values

In [172]:
cmodel = CModel()
cmodel.fit(corpus, y)

In [178]:
print('Model:', accuracy_score(cmodel.predict(corpus_eval), y_eval))
print('Base:', accuracy_score(np.full_like(y_eval, 0), y_eval))

Model: 0.644951140065
Base: 0.505971769815


## Predict age

In [78]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true = y_true.astype('float32')
    y_pred = y_pred.astype('float32')
    return np.mean(np.abs((y_true - y_pred) / y_true))

In [79]:
dfa = df

In [127]:
y = dfa['age'].values
corpus = dfa['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [128]:
idxs = []
m = np.min(np.bincount(y_test)[16:])
for a in range(16, 41):
    ids = np.where(y_test == a)[0]
    np.random.shuffle(ids)
    idxs.extend(ids[:m])
corpus_test = corpus_test[idxs]
y_test = y_test[idxs]

In [129]:
# word case
vect = TfidfVectorizer(ngram_range=(1, 2), min_df=10, analyzer='word')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

In [130]:
ros = RandomOverSampler()
X_train_res, y_train_res = ros.fit_sample(X_train, y_train)

In [131]:
age_predictor = linear_model.Ridge(alpha = 8.0)
age_predictor.fit(X_train_res, y_train_res)

Ridge(alpha=8.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [132]:
y_train__ = age_predictor.predict(X_train_res)
y_test__ = age_predictor.predict(X_test)

In [133]:
inv_vocab = {v: k for k, v in vect.vocabulary_.items()}
df_age_demo = pd.DataFrame()
df_age_demo['Min Age'] = [inv_vocab[i] for i in np.argsort(age_predictor.coef_)[:20]] 
df_age_demo['Max Age'] = [inv_vocab[i] for i in np.argsort(age_predictor.coef_)[-20:]]

In [134]:
y_train__raw_word = age_predictor.predict(X_train_res)
y_test__raw_word =  age_predictor.predict(X_test)

In [135]:
print('Train', mean_absolute_error(y_train__, y_train_res))
print('Test', mean_absolute_error(y_test__, y_test))
print('Base mean', mean_absolute_error(np.full_like(y_test, np.mean(y_train_res)), y_test))

Train 5.34805978675
Test 5.69190681868
Base mean 6.24


In [136]:
# char case
vect = TfidfVectorizer(ngram_range=(1, 5), min_df=10, analyzer='char')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

In [137]:
ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_sample(X_train, y_train)

In [138]:
age_predictor = linear_model.Ridge(alpha = 5.0)
age_predictor.fit(X_train, y_train)

Ridge(alpha=5.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [139]:
y_train__ = age_predictor.predict(X_train)
y_test__ = age_predictor.predict(X_test)

In [140]:
y_train__raw_char = age_predictor.predict(X_train)
y_test__raw_char = age_predictor.predict(X_test)

In [141]:
print('Train', mean_absolute_error(y_train__, y_train))
print('Test', mean_absolute_error(y_test__, y_test))
print('Base mean', mean_absolute_error(np.full_like(y_test__, np.mean(y_train)), y_test))

Train 4.91524767501
Test 5.47351221962
Base mean 6.24


In [142]:
y_train__stacked = (y_train__raw_char + y_train__raw_word) / 2.0
y_train__stacked = (y_train__stacked)
y_test__stacked = (y_test__raw_char + y_test__raw_word) / 2.0
y_test__stacked = (y_test__stacked)

In [143]:
print('Train', mean_absolute_error(y_train__stacked, y_train))
print('Test', mean_absolute_error(y_test__stacked, y_test))

Train 5.06297357128
Test 5.52872973823


In [125]:
df_age_demo.head(20)

Unnamed: 0,Min Age,Max Age
0,инст,рада новым
1,аниме,мебель
2,не кто,продаже
3,парня,40 лет
4,крутая,бабушка
5,inst,относись
6,не когда,массаж
7,не чего,женщина
8,паркур,ирина
9,псих,elena


In [146]:
class RModel:
    def __init__(self):
        self.word_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10, analyzer='word')
        self.char_vect = TfidfVectorizer(ngram_range=(1,4), min_df=10, analyzer='char')
        self.word_predictor = linear_model.Ridge(alpha = 8.0)
        self.char_predictor = linear_model.Ridge(alpha = 5.0)
    
    def fit(self, X, y):
        self.word_vect.fit(X)
        X_train = self.word_vect.transform(X)
        X_train_res, y_train_res = RandomOverSampler().fit_sample(X_train, y)
        self.word_predictor.fit(X_train_res, y_train_res)
        
        self.char_vect.fit(X)
        X_train = self.char_vect.transform(X)
        X_train_res, y_train_res = RandomOverSampler().fit_sample(X_train, y)
        self.char_predictor.fit(X_train_res, y_train_res)
    
    def predict(self, X):
        y_preds = []
        
        X_target = self.word_vect.transform(X)
        y_preds.append(self.word_predictor.predict(X_target))
        X_target = self.char_vect.transform(X)
        y_preds.append(self.char_predictor.predict(X_target))
        
        return (y_preds[0] + y_preds[1]) / 2.0

In [147]:
kf = KFold(n_splits=5, shuffle=True)

train_scores, test_scores = [], []

for train, test in kf.split(corpus, y):
    corpus_train, corpus_test = corpus[train], corpus[test]
    y_train, y_test = y[train], y[test]
    idxs = []
    m = np.min(np.bincount(y_test)[16:])
    for a in range(16, 41):
        ids = np.where(y_test == a)[0]
        np.random.shuffle(ids)
        idxs.extend(ids[:m])
    corpus_test = corpus_test[idxs]
    y_test = y_test[idxs]
    
    rmodel = RModel()
    rmodel.fit(corpus_train, y_train)
    train_scores.append(mean_absolute_error(rmodel.predict(corpus_train), y_train))
    test_scores.append(mean_absolute_error(rmodel.predict(corpus_test), y_test))
    print('Train: {}, Test: {}'.format(train_scores[-1], test_scores[-1]))

print('Mean. Train: {}, Test: {}'.format(np.mean(train_scores), np.mean(test_scores)))

fit word vect
fit word pred
fit char vect
fit char pred
Train: 5.035100314914137, Test: 5.513734914611344
fit word vect
fit word pred
fit char vect
fit char pred
Train: 5.038292510043037, Test: 5.53921470832174
fit word vect
fit word pred
fit char vect
fit char pred
Train: 5.033940849829584, Test: 5.5500055880735895
fit word vect
fit word pred
fit char vect
fit char pred
Train: 5.046695941581583, Test: 5.517875851319246
fit word vect
fit word pred
fit char vect
fit char pred
Train: 5.037042268432221, Test: 5.533469802676617
Mean. Train: 5.038214376960112, Test: 5.530860173000507


## Eval

In [160]:
y_eval = dfe['age'].values
corpus_eval = dfe['text'].values

idxs = []
m = np.min(np.bincount(y_eval)[16:])
for a in range(16, 41):
    ids = np.where(y_eval == a)[0]
    np.random.shuffle(ids)
    idxs.extend(ids[:m])
corpus_eval = corpus_eval[idxs]
y_eval = y_eval[idxs]

In [162]:
rmodel = RModel()
rmodel.fit(corpus, y)

fit word vect
fit word pred
fit char vect
fit char pred


In [163]:
print('Model:', mean_absolute_error(rmodel.predict(corpus_eval), y_eval))
print('Base:', mean_absolute_error(np.full_like(y_eval, 28), y_eval))

Model: 6.11936811624
Base: 6.24
