In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

## Load data

In [2]:
df = pd.read_csv('vk-crawl.csv', sep=',')
df = df.dropna()
df = df[df['text'].str.contains('[a-zA-Zа-яА-Я]')]
df = df.sample(frac=1).reset_index(drop=True)

df.head()

Unnamed: 0,id,text,sex,city,age,source
0,28888394,Хэлпабудетсёня? Σ ▼▲,2,1,21,1
1,8075277,Один из главных признаков счастья и гармонии -...,1,2,35,1
2,166835532,кому нужен админ пишите в ЛС),2,2,27,1
3,253434067,Тхэквондо. Кикбоксинг. клуб единоборств Легион...,2,1,38,1
4,411487814,Сеть стоматологических клиник | Москва | м. Ор...,2,1,35,1


## Predict sex

In [3]:
idxs = []
m = min(df['sex'].value_counts())
for a in range(1,3):
    idxs.extend(df.index[df['sex'] == a].tolist()[:m])
dfs = df.loc[idxs]
print('Dataset size: {}'.format(dfs.shape[0]))

Dataset size: 323074


In [4]:
from sklearn.model_selection import train_test_split

y = dfs['sex'].values - 1
corpus = dfs['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [5]:
# word case
vect = TfidfVectorizer(ngram_range=(1,4), min_df=3, analyzer='word')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

sex_predictor = LogisticRegression(C=1.0)
sex_predictor.fit(X_train, y_train)

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [6]:
inv_vocab = {v: k for k, v in vect.vocabulary_.items()}
df_sex_demo = pd.DataFrame()
df_sex_demo['F'] = [inv_vocab[i] for i in np.argsort(sex_predictor.coef_[0])[:20]] 
df_sex_demo['M'] = [inv_vocab[i] for i in np.argsort(sex_predictor.coef_[0])[-20:]]

In [7]:
y_train__raw_word = sex_predictor.decision_function(X_train)
y_test__raw_word =  sex_predictor.decision_function(X_test)

In [8]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

('Train', 0.73069229548980685)
('Test', 0.63240733575795094)
('Base mean', 0.50000773814129851)


In [9]:
# char case
vect = TfidfVectorizer(ngram_range=(1,4), min_df=10, analyzer='char')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

sex_predictor = LogisticRegression(C=1.0)
sex_predictor.fit(X_train, y_train)

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [10]:
y_train__raw_char = sex_predictor.decision_function(X_train)
y_test__raw_char =  sex_predictor.decision_function(X_test)

In [11]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

('Train', 0.72653302844938661)
('Test', 0.66820397740462745)
('Base mean', 0.50000773814129851)


In [12]:
y_train__stacked = np.heaviside(y_train__raw_char + y_train__raw_word, 0)
y_test__stacked = np.heaviside(y_test__raw_char + y_test__raw_word, 0)

In [13]:
print('Train', accuracy_score(y_train__stacked, y_train))
print('Test', accuracy_score(y_test__stacked, y_test))

('Train', 0.75934674358408882)
('Test', 0.67662307513735198)


In [14]:
df_sex_demo.head(20)

Unnamed: 0,F,M
0,счастлива,влюблен
1,рада,крут
2,счастливая,зенит
3,мужа,брат
4,счастливой,хоккей
5,замужем,ремонт
6,парня,крутой
7,маникюр,уверен
8,наращивание,отец
9,мальчики,стал


## Predict age

In [232]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true = y_true.astype('float32')
    y_pred = y_pred.astype('float32')
    return np.mean(np.abs((y_true - y_pred) / y_true))

In [328]:
# vac = df['age'].value_counts().sort_index()
# Ns = vac.values
# As = vac.index.values

# rho = (Ns * As).min()
# clips = rho / As
# age_clip = np.c_[As, clips]

In [329]:
# idxs = []
# for age, clip in age_clip:
#     idxs.extend(df.index[df['age'] == age].tolist()[:clip])
# dfa = df.loc[idxs]
# print('Dataset size: {}'.format(dfa.shape[0]))

Dataset size: 159746


In [371]:
idxs = []
m = df['age'].value_counts().values.min()
for age in df['age'].unique():
    idxs.extend(df.index[df['age'] == age].tolist()[:m])
dfa = df.loc[idxs]
print('Dataset size: {}'.format(dfa.shape[0]))

Dataset size: 103975


In [372]:
y = dfa['age'].values
corpus = dfa['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [380]:
# word case
vect = TfidfVectorizer(ngram_range=(1, 2), min_df=10, analyzer='word')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

# n = np.bincount(y_test)[16:].min()
# mask = np.hstack([np.random.choice(np.where(y_test == l)[0], n, replace=False)
#                       for l in np.unique(y_test)])
# X_test = X_test[mask, :]
# y_test = y_test[mask]

age_predictor = linear_model.Ridge(alpha = 5.0)
# age_predictor.fit(X_train, np.log(y_train))
age_predictor.fit(X_train, y_train)

Ridge(alpha=5.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [381]:
# y_train__ = np.exp(age_predictor.predict(X_train))
# y_test__ = np.exp(age_predictor.predict(X_test))

y_train__ = age_predictor.predict(X_train)
y_test__ = age_predictor.predict(X_test)

In [382]:
inv_vocab = {v: k for k, v in vect.vocabulary_.items()}
df_age_demo = pd.DataFrame()
df_age_demo['Min Age'] = [inv_vocab[i] for i in np.argsort(age_predictor.coef_)[:20]] 
df_age_demo['Max Age'] = [inv_vocab[i] for i in np.argsort(age_predictor.coef_)[-20:]]

In [383]:
y_train__raw_word = age_predictor.predict(X_train)
y_test__raw_word =  age_predictor.predict(X_test)

In [384]:
print('Train', mean_squared_error(y_train__, y_train))
print('Test', mean_squared_error(y_test__, y_test))
base_mean = np.median(np.unique(y_train))
print('Base mean', mean_squared_error(np.full_like(y_test, 28), y_test))

('Train', 42.366005601333747)
('Test', 46.254127450659219)
('Base mean', 51.994517912959843)


In [402]:
# char case
vect = TfidfVectorizer(ngram_range=(1,4), min_df=10, analyzer='char')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

# age_predictor = LinearRegression()
age_predictor = linear_model.Ridge(alpha = 5.0)
# age_predictor.fit(X_train, np.log(y_train))
age_predictor.fit(X_train, y_train)

Ridge(alpha=5.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [403]:
# y_train__ = np.exp(age_predictor.predict(X_train))
# y_test__ = np.exp(age_predictor.predict(X_test))

y_train__ = age_predictor.predict(X_train)
y_test__ = age_predictor.predict(X_test)

In [404]:
y_train__raw_char = age_predictor.predict(X_train)
y_test__raw_char = age_predictor.predict(X_test)

In [405]:
print('Train', mean_squared_error(y_train__, y_train))
print('Test', mean_squared_error(y_test__, y_test))
print('Base mean', mean_squared_error(np.full_like(y_test__, np.mean(y_train)), y_test))

('Train', 37.997581817382972)
('Test', 43.88164595788583)
('Base mean', 51.994519083664073)


In [406]:
# y_train__stacked = (y_train__raw_char + y_train__raw_word) / 2.0
# y_train__stacked = np.exp(y_train__stacked)
# y_test__stacked = (y_test__raw_char + y_test__raw_word) / 2.0
# y_test__stacked = np.exp(y_test__stacked)

y_train__stacked = (y_train__raw_char + y_train__raw_word) / 2.0
y_train__stacked = (y_train__stacked)
y_test__stacked = (y_test__raw_char + y_test__raw_word) / 2.0
y_test__stacked = (y_test__stacked)

In [407]:
print('Train', mean_squared_error(y_train__stacked, y_train))
print('Test', mean_squared_error(y_test__stacked, y_test))

('Train', 39.251523787994564)
('Test', 44.057238494635662)


In [408]:
df_age_demo.head(20)

Unnamed: 0,Min Age,Max Age
0,inst,поздравления
1,парня,рада новым
2,17,сезон
3,лол,ольга
4,инст,поиске
5,вопросам сотрудничества,мария
6,минут назад,за поздравления
7,беги,новым друзьям
8,не кто,помощь
9,киса,консультирование
