In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

## Load data

In [2]:
df = pd.read_csv('vk-crawl.csv', sep=',')
df = df.dropna()
df = df[df['text'].str.contains('[a-zA-Zа-яА-Я]')]
df = df.sample(frac=1).reset_index(drop=True)

df.head()

Unnamed: 0,id,text,sex,city,age,source
0,371377313,🐺,1,1,21,1
1,180057341,"Чем больше привычек, тем меньше свободы",2,1,23,1
2,360477702,vk.com/app4236781_360477702,1,1,27,1
3,275323909,"Я тебя люблю, поэтому ревную и выпендриваюсь.",1,2,18,1
4,338504449,Добавляйтесь :*:*:*,1,1,24,1


## Predict sex

In [4]:
idxs = []
m = min(df['sex'].value_counts())
for a in range(1,3):
    idxs.extend(df.index[df['sex'] == a].tolist()[:m])
dfs = df.loc[idxs]
print('Dataset size: {}'.format(dfs.shape[0]))

Dataset size: 323074


In [5]:
y = dfs['sex'].values - 1
corpus = dfs['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [94]:
y_train_probas = []
y_test_probas = []

In [95]:
# word case
vect = TfidfVectorizer(ngram_range=(1,4), min_df=3, analyzer='word')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

In [97]:
## Naive Bayes
from sklearn.naive_bayes import MultinomialNB

sex_predictor = MultinomialNB()
sex_predictor.fit(X_train, y_train)

y_train_probas.append(sex_predictor.predict_proba(X_train))
y_test_probas.append(sex_predictor.predict_proba(X_test))

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [98]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

('Train', 0.72312823310467034)
('Test', 0.63031803760736671)
('Base mean', 0.50000773814129851)


In [99]:
sex_predictor = LogisticRegression(C=1.0)
sex_predictor.fit(X_train, y_train)

y_train_probas.append(sex_predictor.predict_proba(X_train))
y_test_probas.append(sex_predictor.predict_proba(X_test))

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [100]:
inv_vocab = {v: k for k, v in vect.vocabulary_.items()}
df_sex_demo = pd.DataFrame()
df_sex_demo['F'] = [inv_vocab[i] for i in np.argsort(sex_predictor.coef_[0])[:20]] 
df_sex_demo['M'] = [inv_vocab[i] for i in np.argsort(sex_predictor.coef_[0])[-20:]]

In [101]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

('Train', 0.73138099272998813)
('Test', 0.63330496014857229)
('Base mean', 0.50000773814129851)


In [102]:
# char case
vect = TfidfVectorizer(ngram_range=(1,4), min_df=10, analyzer='char')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

In [103]:
sex_predictor = MultinomialNB(alpha=1e-5)
sex_predictor.fit(X_train, y_train)

y_train_probas.append(sex_predictor.predict_proba(X_train))
y_test_probas.append(sex_predictor.predict_proba(X_test))

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [104]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

('Train', 0.6772176631496678)
('Test', 0.64347287781474893)
('Base mean', 0.50000773814129851)


In [105]:
sex_predictor = LogisticRegression(C=1.0)
sex_predictor.fit(X_train, y_train)

y_train_probas.append(sex_predictor.predict_proba(X_train))
y_test_probas.append(sex_predictor.predict_proba(X_test))

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [106]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

('Train', 0.72700892598052302)
('Test', 0.66797183316567366)
('Base mean', 0.50000773814129851)


In [183]:
y_train__stacked = np.argmax(np.mean(np.c_[[y_train_probas[1], y_train_probas[-1]]], axis=0), axis=1)
y_test__stacked = np.argmax(np.mean(np.c_[[y_test_probas[1], y_test_probas[-1]]], axis=0), axis=1)

In [184]:
print('Train', accuracy_score(y_train__stacked, y_train))
print('Test', accuracy_score(y_test__stacked, y_test))

('Train', 0.76033336041693267)
('Test', 0.67567902189893991)


In [185]:
df_sex_demo.head(20)

Unnamed: 0,F,M
0,счастлива,хоккей
1,рада,счастлив
2,счастливая,цска
3,счастливой,футбол
4,мальчики,влюблен
5,мужа,влюблён
6,наращивание,пацаны
7,парня,крут
8,замужем,стал
9,маникюр,свободен


## Predict age

In [3]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true = y_true.astype('float32')
    y_pred = y_pred.astype('float32')
    return np.mean(np.abs((y_true - y_pred) / y_true))

In [4]:
dfa = df

In [17]:
y = dfa['age'].values
corpus = dfa['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [18]:
# word case
vect = TfidfVectorizer(ngram_range=(1, 2), min_df=10, analyzer='word')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

In [19]:
ros = RandomOverSampler(random_state=0)
X_train_res, y_train_res = ros.fit_sample(X_train, y_train)

In [20]:
age_predictor = linear_model.Ridge(alpha = 8.0)
age_predictor.fit(X_train_res, y_train_res)

Ridge(alpha=8.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [21]:
y_train__ = age_predictor.predict(X_train_res)
y_test__ = age_predictor.predict(X_test)

In [22]:
inv_vocab = {v: k for k, v in vect.vocabulary_.items()}
df_age_demo = pd.DataFrame()
df_age_demo['Min Age'] = [inv_vocab[i] for i in np.argsort(age_predictor.coef_)[:20]] 
df_age_demo['Max Age'] = [inv_vocab[i] for i in np.argsort(age_predictor.coef_)[-20:]]

In [23]:
y_train__raw_word = age_predictor.predict(X_train_res)
y_test__raw_word =  age_predictor.predict(X_test)

In [24]:
print('Train', mean_absolute_error(y_train__, y_train_res))
print('Test', mean_absolute_error(y_test__, y_test))
print('Base mean', mean_absolute_error(np.full_like(y_test, np.mean(y_train_res)), y_test))

('Train', 5.3492191828784401)
('Test', 5.5179414695853524)
('Base mean', 6.1123908244553222)


In [25]:
# char case
vect = TfidfVectorizer(ngram_range=(1,4), min_df=10, analyzer='char')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

In [26]:
ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_sample(X_train, y_train)

In [27]:
age_predictor = linear_model.Ridge(alpha = 5.0)
age_predictor.fit(X_train, y_train)

Ridge(alpha=5.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [28]:
y_train__ = age_predictor.predict(X_train)
y_test__ = age_predictor.predict(X_test)

In [29]:
y_train__raw_char = age_predictor.predict(X_train)
y_test__raw_char = age_predictor.predict(X_test)

In [30]:
print('Train', mean_absolute_error(y_train__, y_train))
print('Test', mean_absolute_error(y_test__, y_test))
print('Base mean', mean_absolute_error(np.full_like(y_test__, np.mean(y_train)), y_test))

('Train', 5.0005220229158116)
('Test', 5.2847301352748017)
('Base mean', 6.1123908244553222)


In [31]:
y_train__stacked = (y_train__raw_char + y_train__raw_word) / 2.0
y_train__stacked = (y_train__stacked)
y_test__stacked = (y_test__raw_char + y_test__raw_word) / 2.0
y_test__stacked = (y_test__stacked)

In [32]:
print('Train', mean_absolute_error(y_train__stacked, y_train))
print('Test', mean_absolute_error(y_test__stacked, y_test))

('Train', 5.1211978653222348)
('Test', 5.3405843895835794)


In [33]:
df_age_demo.head(20)

Unnamed: 0,Min Age,Max Age
0,инст,женщина
1,аниме,господи
2,не кто,инструктор
3,не когда,партнеров
4,крутая,консультации
5,inst,хорошая но
6,ребят,olga
7,некогда не,светлана
8,не чего,относись
9,по вам,гадание
