In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

## Load data

In [2]:
df = pd.read_csv('vk-crawl.csv', sep=',')
df = df.dropna()
df = df[df['text'].str.contains('[a-zA-Zа-яА-Я]')]
df = df.sample(frac=1).reset_index(drop=True)

df.head()

Unnamed: 0,id,text,sex,city,age,source
0,132475635,а че случилось?,2,1,35,1
1,208617149,Лучшая подруга - это когда у вас есть такие сл...,1,1,20,1
2,493072,Капитан Очевидность должен сделать в жизни три...,1,1,37,1
3,365796530,Всем привееееееееет!,1,1,21,1
4,430739402,"Чему бы жизнь нас не учила, а сердце верит в ч...",1,1,20,1


## Predict sex

In [3]:
idxs = []
m = min(df['sex'].value_counts())
for a in range(1,3):
    idxs.extend(df.index[df['sex'] == a].tolist()[:m])
dfs = df.loc[idxs]

In [101]:
from sklearn.model_selection import train_test_split

y = dfs['sex'].values - 1
corpus = dfs['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [102]:
# word case
vect = TfidfVectorizer(ngram_range=(1,4), min_df=3, analyzer='word')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

sex_predictor = LogisticRegression(C=1.0)
sex_predictor.fit(X_train, y_train)

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [103]:
inv_vocab = {v: k for k, v in vect.vocabulary_.items()}
df_sex_demo = pd.DataFrame()
df_sex_demo['F'] = [inv_vocab[i] for i in np.argsort(sex_predictor.coef_[0])[:20]] 
df_sex_demo['M'] = [inv_vocab[i] for i in np.argsort(sex_predictor.coef_[0])[-20:]]

In [105]:
y_train__raw_word = sex_predictor.decision_function(X_train)
y_test__raw_word =  sex_predictor.decision_function(X_test)

In [106]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

('Train', 0.73118753844903839)
('Test', 0.63220614408419096)
('Base mean', 0.49999226185870155)


In [48]:
# char case
vect = TfidfVectorizer(ngram_range=(1,4), min_df=10, analyzer='char')
vect.fit(corpus_train)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

sex_predictor = LogisticRegression(C=1.0)
sex_predictor.fit(X_train, y_train)

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [49]:
y_train__raw_char = sex_predictor.decision_function(X_train)
y_test__raw_char =  sex_predictor.decision_function(X_test)

In [50]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

('Train', 0.72665683918919444)
('Test', 0.67140756790218992)
('Base mean', 0.49999226185870155)


In [51]:
y_train__stacked = np.heaviside(y_train__raw_char + y_train__raw_word, 0)
y_test__stacked = np.heaviside(y_test__raw_char + y_test__raw_word, 0)

In [52]:
print('Train', accuracy_score(y_train__stacked, y_train))
print('Test', accuracy_score(y_test__stacked, y_test))

('Train', 0.75934674358408882)
('Test', 0.67789213031029949)


In [104]:
df_sex_demo.head(20)

Unnamed: 0,F,M
0,счастлива,счастлив
1,рада,спартак
2,счастливая,клан
3,счастливой,ремонт
4,мужа,честь
5,парня,волк
6,мальчики,люблю её
7,наращивание,вернулся
8,замужем,хоккей
9,маникюр,влюблен


## Predict age

In [53]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true))

In [54]:
idxs = []
m = min(df['age'].value_counts())
for a in range(16,41):
    idxs.extend(df.index[df['age'] == a].tolist()[:m])
dfa = df.loc[idxs]

In [55]:
y = dfa['age'].values
corpus = dfa['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [97]:
# word case
vect = TfidfVectorizer(ngram_range=(1,2), min_df=2, analyzer='word')
vect.fit(corpus)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

# age_predictor = LinearRegression()
age_predictor = linear_model.Ridge(alpha = 3.0)
age_predictor.fit(X_train, np.log(y_train))

y_train__ = np.exp(age_predictor.predict(X_train))
y_test__ = np.exp(age_predictor.predict(X_test))

In [98]:
inv_vocab = {v: k for k, v in vect.vocabulary_.items()}
df_age_demo = pd.DataFrame()
df_age_demo['Min Age'] = [inv_vocab[i] for i in np.argsort(age_predictor.coef_)[:20]] 
df_age_demo['Max Age'] = [inv_vocab[i] for i in np.argsort(age_predictor.coef_)[-20:]]

In [81]:
y_train__raw_word = age_predictor.predict(X_train)
y_test__raw_word =  age_predictor.predict(X_test)

In [83]:
print('Train', mean_absolute_percentage_error(y_train__, y_train))
print('Test', mean_absolute_percentage_error(y_test__, y_test))
print('Base mean', mean_absolute_percentage_error(np.full_like(y_test, np.mean(y_train)), y_test))

('Train', 0.18616729270707263)
('Test', 0.21542272596903164)
('Base mean', 0.4799230584275066)


In [92]:
# char case
vect = TfidfVectorizer(ngram_range=(1,4), min_df=10, analyzer='char')
vect.fit(corpus)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

# age_predictor = LinearRegression()
age_predictor = linear_model.Ridge(alpha = 3.0)
age_predictor.fit(X_train, np.log(y_train))

y_train__ = np.exp(age_predictor.predict(X_train))
y_test__ = np.exp(age_predictor.predict(X_test))

In [93]:
y_train__raw_char = age_predictor.predict(X_train)
y_test__raw_char =  age_predictor.predict(X_test)

In [94]:
print('Train', mean_absolute_percentage_error(y_train__, y_train))
print('Test', mean_absolute_percentage_error(y_test__, y_test))
print('Base mean', mean_absolute_percentage_error(np.full_like(y_test__, np.mean(y_train)), y_test))

('Train', 0.18622596859456805)
('Test', 0.20772561864931444)
('Base mean', 0.22283983306143404)


In [95]:
y_train__stacked = (y_train__raw_char + y_train__raw_word) / 2.0
y_train__stacked = np.exp(y_train__stacked)
y_test__stacked = (y_test__raw_char + y_test__raw_word) / 2.0
y_test__stacked = np.exp(y_test__stacked)

In [96]:
print('Train', mean_absolute_percentage_error(y_train__stacked, y_train))
print('Test', mean_absolute_percentage_error(y_test__stacked, y_test))
# не сработало :(

('Train', 0.18418496988564506)
('Test', 0.20924604539339678)


In [99]:
df_age_demo.head(20)

Unnamed: 0,Min Age,Max Age
0,не чего,ремонт
1,кек,женщине
2,17,рада новым
3,cs,консультант
4,люблю всех,отпуск
5,топ,правильно
6,inst,консультации
7,дратути,наталья
8,хай,стилист
9,лю,психолог
