In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

## Load data

In [2]:
df = pd.read_csv('vk-crawl.csv', sep=',')
df = df.dropna()
df = df[df['text'].str.contains('[a-zA-Zа-яА-Я]')]
df = df.sample(frac=1).reset_index(drop=True)

df.head()

Unnamed: 0,id,text,sex,city,age,source
0,356783654,читер!!! читер!!! ахаха \nЛюблю когда называют...,2,1,19,1
1,139035920,Люблю тебя❤,2,1,24,1
2,251609296,Память полезно иногда терять - тогда радуешься...,1,2,26,1
3,86610358,"Отличное начало, кажется, пора заканчивать...",1,1,30,1
4,314595249,Н_И_К_О_М_У Н_Е Н_У_Ж_Е_Н,2,1,17,1


## Predict sex

In [3]:
idxs = []
m = min(df['sex'].value_counts())
for a in range(1,3):
    idxs.extend(df.index[df['sex'] == a].tolist()[:m])
dfs = df.loc[idxs]

In [4]:
from sklearn.model_selection import train_test_split

y = dfs['sex'].values - 1
corpus = dfs['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [5]:
vect = TfidfVectorizer(ngram_range=(1,4), min_df=3)
vect.fit(corpus)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

sex_predictor = LogisticRegression(C=1.0)
sex_predictor.fit(X_train, y_train)

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [6]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base mean', accuracy_score(np.full_like(y_test, 0), y_test))

Train 0.763639890386
Test 0.656580163761
Base mean 0.5


In [7]:
inv_vocab = {v: k for k, v in vect.vocabulary_.items()}

In [13]:
df_sex_demo = pd.DataFrame()
df_sex_demo['F'] = [inv_vocab[i] for i in np.argsort(sex_predictor.coef_[0])[:20]] 
df_sex_demo['M'] = [inv_vocab[i] for i in np.argsort(sex_predictor.coef_[0])[-20:]]
df_sex_demo.head(20)

Unnamed: 0,F,M
0,счастлива,крутой
1,рада,влюблен
2,счастливая,какой есть
3,счастливой,уверен
4,мужа,футбол
5,мальчики,ремонт
6,парня,свободен
7,замужем,стал
8,наращивание,честь
9,маникюр,отец


## Predict age

In [14]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true))

In [15]:
idxs = []
m = min(df['age'].value_counts())
for a in range(16,41):
    idxs.extend(df.index[df['age'] == a].tolist()[:m])
dfa = df.loc[idxs]

In [16]:
y = dfa['age'].values
corpus = dfa['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [17]:
vect = TfidfVectorizer(ngram_range=(1,2), min_df=2)
vect.fit(corpus)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

# age_predictor = LinearRegression()
age_predictor = linear_model.Ridge(alpha = 3.0)
age_predictor.fit(X_train, np.log(y_train))

y_train__ = np.exp(age_predictor.predict(X_train))
y_test__ = np.exp(age_predictor.predict(X_test))

In [18]:
print('Train', mean_absolute_percentage_error(y_train__, y_train))
print('Test', mean_absolute_percentage_error(y_test__, y_test))
print('Base mean', mean_absolute_percentage_error(np.full_like(y_test, np.mean(y_train)), y_test))

Train 0.184076408167
Test 0.216070454012
Base mean 0.232591846442


In [19]:
inv_vocab = {v: k for k, v in vect.vocabulary_.items()}

In [20]:
df_age_demo = pd.DataFrame()
df_age_demo['Min Age'] = [inv_vocab[i] for i in np.argsort(age_predictor.coef_)[:20]] 
df_age_demo['Max Age'] = [inv_vocab[i] for i in np.argsort(age_predictor.coef_)[-20:]]
df_age_demo.head(20)

Unnamed: 0,Min Age,Max Age
0,парня,бизнес
1,чс,вам
2,киса,женщины
3,inst,рыбалка
4,не кто,кофе
5,крут,улыбайтесь
6,аниме,елена
7,крутая,elena
8,11 лет,консультации
9,нечего,алексей
