In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


## Load data

In [2]:
df = pd.read_csv('vk-crawl.csv', sep=',')

df = df.dropna()

df = df[df['text'].str.contains('[a-zA-Zа-яА-Я]')]

df = df.sample(frac=1).reset_index(drop=True)
# idxs = []
# for a in range(16,41):
#     idxs.extend(df.index[df['age'] == a].tolist()[:4000])
# df = df.loc[idxs]

print(df.shape)
df.head()

IOError: File ../vk-crawl.csv does not exist

## Predict sex

In [195]:
from sklearn.model_selection import train_test_split

y = df['sex'].values - 1
corpus = df['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [196]:
vect = TfidfVectorizer(ngram_range=(1,4), min_df=3)
vect.fit(corpus)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

sex_predictor = LogisticRegression(C=1.0)
sex_predictor.fit(X_train, y_train)

y_train__ = sex_predictor.predict(X_train)
y_test__ = sex_predictor.predict(X_test)

In [197]:
print('Train', accuracy_score(y_train__, y_train))
print('Test', accuracy_score(y_test__, y_test))
print('Base', accuracy_score(np.full_like(y_test, 0), y_test))

Train 0.762267741213
Test 0.660403545951
Base 0.547980777841


## Predict age

In [193]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true))

In [137]:
from sklearn import linear_model

y = df['age'].values
corpus = df['text'].values

corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

In [191]:
vect = TfidfVectorizer(ngram_range=(1,3), min_df=2)
vect.fit(corpus)

X_train = vect.transform(corpus_train)
X_test = vect.transform(corpus_test)

# age_predictor = LinearRegression()
age_predictor = linear_model.Ridge(alpha = 2.0)
age_predictor.fit(X_train, np.log(y_train))

y_train__ = np.exp(age_predictor.predict(X_train))
y_test__ = np.exp(age_predictor.predict(X_test))

In [194]:
print('Train', mean_absolute_percentage_error(y_train__, y_train))
print('Test', mean_absolute_percentage_error(y_test__, y_test))
print('Base', mean_absolute_percentage_error(np.full_like(y_test, 28), y_test))

Train 0.170113117722
Test 0.203996264524
Base 0.215835682092
