In [577]:
import sqlite3
from tqdm import tqdm
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt

from tqdm import tqdm
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


%matplotlib inline
from jupyterthemes import jtplot
jtplot.style(theme='onedork')

## Model 1 - Texts as they are

In [456]:
#load psychological data
cols = ['id', 'sex', 'HEX1_eX', 'HEX2_A', 'HEX3_C', 'HEX4_E', 'HEX5_O', 'HEX6_H']
traits = pd.read_csv('data/survey_data.csv', sep=';', decimal=',', usecols=cols)
traits.shape

(152, 8)

In [457]:
#get text data from db
conn = sqlite3.connect('ud.db')
c = conn.cursor()
query = 'SELECT DISTINCT owner_id, text FROM posts WHERE text IS NOT NULL AND text != "";'
texts = pd.read_sql(query, conn)
texts.shape

(38375, 2)

Filter out short texts

In [458]:
lens = np.array([len(str(t)) for t in texts.text])
print('Mean:', lens.mean(),
      '\nMedian:', np.median(lens), '\nMin:', min(lens), '\nMax:', max(lens))

Mean: 189.076039088 
Median: 61.0 
Min: 1 
Max: 16384


In [459]:
trsh_up, trsh_lo = 700, 200
print('{:.2f}% shorter than {}'.format(lens[lens<trsh_up].shape[0]/lens.shape[0]*100, trsh_up))
print('{:.2f}% longer than {}'.format(lens[lens>trsh_lo].shape[0]/lens.shape[0]*100, trsh_lo))

94.32% shorter than 700
16.88% longer than 200


In [460]:
lens = np.array([len(str(t)) for t in texts.text])
texts = texts[(lens < trsh_up) & (lens > trsh_lo)]
texts.shape

(4298, 2)

In [461]:
#join data
data = pd.merge(texts, traits, how='left', left_on='owner_id', right_on='id')
data.text = data.text.str.lower()
data.shape

(4532, 10)

In [464]:
def mape(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def build_model(data, X, y, vectorizer, model):
    print("="*80)
    print('BUILDING MODEL FOR {}'.format(y))
    print("="*80)
    
    X_train, X_test, y_train, y_test = train_test_split(data[X], data[y], test_size=0.1)
    
    print('Train sample: {}\nTest sample: {}'.format(len(X_train), len(X_test)))
        
    train_vec = vectorizer.fit_transform(X_train)
    test_vec = vectorizer.transform(X_test)
     
    print('\nIncluded tokens ({})'.format(train_vec.shape[1]))
    print(np.array(vectorizer.get_feature_names())[np.random.randint(0, len(vectorizer.get_feature_names()), 20)])
    print('\nExcluded tokens ({})'.format(len(vectorizer.stop_words_)))
    print(np.array(list(vectorizer.stop_words_))[np.random.randint(0, len(vectorizer.stop_words_), 20)])
    
    model.fit(train_vec, y_train)
    
    y_train_pred = model.predict(train_vec)
    print('\nMAPE on training sample: {:.2f}%'.format(mape(y_train, y_train_pred)))
    print('R2 on training sample: {:.3f}'.format(r2_score(y_train, y_train_pred)))

    y_test_pred = model.predict(test_vec)
    print('\nMAPE on test sample: {:.2f}%'.format(mape(y_test, y_test_pred)))
    print('R2 on training sample: {:.3f}'.format(r2_score(y_test, y_test_pred)))
    
    print('\nHigh pole')
    #[print(a) for a in sorted(list(zip(model.coef_, vectorizer.get_feature_names())), reverse=True)[0:5]]
    print('\nLow pole')
    #[print(a) for a in sorted(list(zip(model.coef_, vectorizer.get_feature_names())))[0:5]]
    print()

In [465]:
%%time
for trait in ['HEX1_eX', 'HEX2_A', 'HEX3_C', 'HEX4_E', 'HEX5_O', 'HEX6_H']:
    lm = RandomForestRegressor()
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), 
                         analyzer='word', 
                         tokenizer=word_tokenize, 
                         min_df = 30, 
                         max_df = 0.7, 
                         max_features = 10000)
    build_model(data, X='text', y=trait, vectorizer=vectorizer, model=lm)

BUILDING MODEL FOR HEX1_eX
Train sample: 4078
Test sample: 454

Included tokens (1123)
['других' ', который' 'ей' 'тому' ', из' 'том ,' 'интересно ,'
 '# цитаты_из_прочитанного' 'кстати' '—' 'вопросы' ', из' '=' 'им' '— это'
 'этом' 'мира' ', это' '! ! !' 'свою']

Excluded tokens (438187)
['не денут ,' 'games' 'балконе' 'них и' 'чистотой' 'виной твоим коленям'
 ', задавать' 'ценный .' 'инициатива' 'только я' 'в себя ли'
 'таким уродцем..' 'ощущений ?' 'и придумали одно' 'что словарь'
 'деталях показать процесс' 'заслушалась' 'попала в предпиковую'
 ', после которой' 'отсутствие и негативность']

MAPE on training sample: 6.54%
R2 on training sample: 0.872

MAPE on test sample: 15.83%
R2 on training sample: 0.268

High pole

Low pole

BUILDING MODEL FOR HEX2_A
Train sample: 4078
Test sample: 454

Included tokens (1128)
['моего' 'ибо' 'сильно' 'двух' 'но и' 'сентября' 'словно' 'которым'
 'времени' 'минут' 'минут' 'деле' 'одну' '. )' 'знаю' 'точно' 'там'
 'человека' 'все ,' 'последний']

E

## Model 2 - concatenated texts

In [466]:
#load psychological data
cols = ['id', 'sex', 'HEX1_eX', 'HEX2_A', 'HEX3_C', 'HEX4_E', 'HEX5_O', 'HEX6_H']
traits = pd.read_csv('data/survey_data.csv', sep=';', decimal=',', usecols=cols)
traits.shape

(152, 8)

In [467]:
#get text data from db
conn = sqlite3.connect('ud.db')
c = conn.cursor()
query = 'SELECT DISTINCT owner_id, text FROM posts WHERE text IS NOT NULL AND text != "";'
texts = pd.read_sql(query, conn)
texts.shape

(38375, 2)

In [468]:
texts['text'] = texts['text'].apply(str).apply(str.lower)
texts_conc = texts.groupby('owner_id')['text'].apply(lambda x: ' <ps> '.join(x))
texts_conc = pd.DataFrame(texts_conc.reset_index())
texts_conc.shape

(106, 2)

In [469]:
lens = np.array([len(str(t)) for t in texts_conc.text])
print('Mean:', lens.mean(),
      '\nMedian:', np.median(lens), '\nMin:', min(lens), '\nMax:', max(lens))

Mean: 70617.0566038 
Median: 26465.5 
Min: 85 
Max: 918554


In [470]:
trsh_up, trsh_lo = 10**10, 1000
print('{:.2f}% shorter than {}'.format(lens[lens<trsh_up].shape[0]/lens.shape[0]*100, trsh_up))
print('{:.2f}% longer than {}'.format(lens[lens>trsh_lo].shape[0]/lens.shape[0]*100, trsh_lo))

100.00% shorter than 10000000000
95.28% longer than 1000


In [471]:
lens = np.array([len(str(t)) for t in texts_conc.text])
texts_conc = texts_conc[(lens < trsh_up) & (lens > trsh_lo)]
texts_conc.shape

(101, 2)

In [472]:
#join data
data2 = pd.merge(texts_conc, traits, how='left', left_on='owner_id', right_on='id')
data2.shape

(103, 10)

In [473]:
%%time
for trait in ['HEX1_eX', 'HEX2_A', 'HEX3_C', 'HEX4_E', 'HEX5_O', 'HEX6_H']:
    lm = RandomForestRegressor()
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), 
                         analyzer='word', 
                         tokenizer=word_tokenize, 
                         min_df = 10, 
                         max_df = 0.7, 
                         max_features = 1000)
    build_model(data2, X='text', y=trait, vectorizer=vectorizer, model=lm)

BUILDING MODEL FOR HEX1_eX
Train sample: 92
Test sample: 11

Included tokens (1000)
['него' 'невозможно' 'решил' 'that' 'целый' 'хороший' 'фото' 'кроме'
 'мнение' 'благодаря' 'мозг' 'дом' 'хотя' 'сначала' 'почти' 'сторону'
 'постоянно' 'цветы' 'могли' 'говорят']

Excluded tokens (133131)
['бить' 'ничтожного' 'знакомо-то' 'саше' 'растаман' 'впечатляет' 'соседняя'
 'аргентину' 'сашулей' 'скрытая' 'знакомыми' 'анализировали' 'просыпается'
 'универсистеского' 'налил' 'гораций' 'сравнивала' 'dumb' 'клан' 'среднюю']

MAPE on training sample: 6.48%
R2 on training sample: 0.833

MAPE on test sample: 16.32%
R2 on training sample: 0.324

High pole

Low pole

BUILDING MODEL FOR HEX2_A
Train sample: 92
Test sample: 11

Included tokens (1000)
['so' 'вконтакте' 'хорошего' 'like' 'прекрасный' 'результаты' 'наконец-то'
 'группы' '***' 'всеми' '@' 'кого-то' 'об' 'кажется' 'делают' 'xd'
 'настроение' 'днем' 'наше' 'ветер']

Excluded tokens (109938)
['погибает' 'альтернативный' 'ярослав' 'замело' 'блогер

## Model 3 - Nominal traits

In [490]:
#load psychological data
cols = ['id', 'sex', 'HEX1_eX', 'HEX2_A', 'HEX3_C', 'HEX4_E', 'HEX5_O', 'HEX6_H']
traits = pd.read_csv('data/survey_data.csv', sep=';', decimal=',', usecols=cols)
traits.shape

(152, 8)

In [491]:
def set_groups(x, dev=1, M=50, SD=10):
    if x > M+dev*SD:
        return 'high'
    elif x < M-dev*SD:
        return 'low'
    else:
        return 'average'

In [502]:
for trait in ['HEX1_eX', 'HEX2_A', 'HEX3_C', 'HEX4_E', 'HEX5_O', 'HEX6_H']:
    scale = trait + '_nom'
    traits[scale] = traits[trait].apply(set_groups, dev=0.5)
    print(trait)
    print(traits[scale].value_counts())

HEX1_eX
high       53
average    51
low        48
Name: HEX1_eX_nom, dtype: int64
HEX2_A
average    58
high       51
low        43
Name: HEX2_A_nom, dtype: int64
HEX3_C
average    53
high       52
low        47
Name: HEX3_C_nom, dtype: int64
HEX4_E
high       57
average    48
low        47
Name: HEX4_E_nom, dtype: int64
HEX5_O
high       56
average    50
low        46
Name: HEX5_O_nom, dtype: int64
HEX6_H
average    54
high       50
low        48
Name: HEX6_H_nom, dtype: int64


In [504]:
#get text data from db
conn = sqlite3.connect('ud.db')
c = conn.cursor()
query = 'SELECT DISTINCT owner_id, text FROM posts WHERE text IS NOT NULL AND text != "";'
texts = pd.read_sql(query, conn)
texts.shape

(38375, 2)

In [506]:
trsh_up, trsh_lo = 700, 200
print('{:.2f}% shorter than {}'.format(lens[lens<trsh_up].shape[0]/lens.shape[0]*100, trsh_up))
print('{:.2f}% longer than {}'.format(lens[lens>trsh_lo].shape[0]/lens.shape[0]*100, trsh_lo))

94.32% shorter than 700
16.88% longer than 200


In [512]:
lens = np.array([len(str(t)) for t in texts.text])
texts = texts[(lens < trsh_up) & (lens > trsh_lo)]
texts.shape

(4298, 2)

In [582]:
def cleanse(s):
    rgxp = '[\`\)\(\|©~^<>/\'\"\«№#$&\*.,;=+?!\—_@:\]\[%\{\}\\n]'
    return re.sub(' +', ' ', re.sub(rgxp, ' ', s.lower()))

In [588]:
#join data
data3 = pd.merge(texts, traits, how='left', left_on='owner_id', right_on='id')
data3.text = data3.text.str.lower().apply(cleanse)
data3.shape
data3.text.head()

0     мастер ты говорил что если я познаю кто я то ...
1    однажды солдат охранявший дорогу остановил буд...
2    люди я хочу извиниться перед вами…перед всеми ...
3    what i plainly see before my eyes makes me fin...
4    здравствуй мальчик бананан если ты нормальный ...
Name: text, dtype: object

In [608]:
def build_model2(data, X, y, vectorizer, model):
    print("="*40)
    print('BUILDING MODEL FOR {}'.format(y))
    print("="*40)
    X_train, X_test, y_train, y_test = train_test_split(data[X], data[y], test_size=0.1)
    print('Train sample: {}\nTest sample: {}'.format(len(X_train), len(X_test)))
    train_vec = vectorizer.fit_transform(X_train)
    test_vec = vectorizer.transform(X_test)
    print('\nIncluded tokens ({})'.format(train_vec.shape[1]))
    print(np.array(vectorizer.get_feature_names())[np.random.randint(0, len(vectorizer.get_feature_names()), 20)])
    print('\nExcluded tokens ({})'.format(len(vectorizer.stop_words_)))
    print(np.array(list(vectorizer.stop_words_))[np.random.randint(0, len(vectorizer.stop_words_), 20)])
    model.fit(train_vec, y_train)
    y_train_pred = model.predict(train_vec)
    print('\nAccuracy on training sample: {:.2f}%'.format(accuracy_score(y_train, y_train_pred)))
    print(classification_report(y_train, y_train_pred))
    y_test_pred = model.predict(test_vec)
    print('Accuracy on test sample: {:.2f}%'.format(accuracy_score(y_test, y_test_pred)))
    print(classification_report(y_test, y_test_pred))
    print()

In [611]:
%%time
for trait in ['HEX1_eX_nom', 'HEX2_A_nom', 'HEX3_C_nom', 'HEX4_E_nom', 'HEX5_O_nom', 'HEX6_H_nom']:
    lm = RandomForestClassifier(n_estimators=500, max_features='log2', 
                                min_samples_leaf=20, oob_score = True)  
    lm = LogisticRegression()
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), 
                         analyzer='word', 
                         tokenizer=word_tokenize, 
                         min_df = 30, 
                         max_df = 0.3, 
                         max_features = 10000)
    build_model2(data3, X='text', y=trait, vectorizer=vectorizer, model=lm)

BUILDING MODEL FOR HEX1_eX_nom
Train sample: 4078
Test sample: 454

Included tokens (832)
['важно' 'слишком' 'будто' 'раз' 'кого' 'is' 'душе' 'оказывается' 'лет'
 'не было' 'дела' 'перед' 'даже' 'но не' 'взгляд' 'очень' 'так как' 'себе'
 'стороны' 'в том']

Excluded tokens (398142)
['презентацию' 'кое-какими' 'насладиться этим помедитировать'
 'буду писать о' 'войной и' 'отливов' 'то приходится' 'студентом и не'
 'сравнивают положение своих' 'накурено очень' 'из них видимо'
 'порывистого мира' 'могу быть там' 'p p s' 'id1775145 кирилла'
 'пара штучек лишних' 'твиттере очень' 'obshem minutka tragizma'
 'или читать' 'тратьте плюсы у']

Accuracy on training sample: 0.64%
             precision    recall  f1-score   support

    average       0.61      0.71      0.66      1536
       high       0.66      0.76      0.70      1516
        low       0.66      0.35      0.46      1026

avg / total       0.64      0.64      0.63      4078

Accuracy on test sample: 0.55%
             precision  

## Model 4 - Naive model

Have we achieved anything at all? Are our models even "napolshischechki" better than naive model?

In [572]:
y_naive = np.random.choice(['high', 'average', 'low'], size=(len(data3),), p=[0.3, 0.4, 0.3])
for trait in ['HEX1_eX_nom', 'HEX2_A_nom', 'HEX3_C_nom', 'HEX4_E_nom', 'HEX5_O_nom', 'HEX6_H_nom']:
    print("="*80)
    print('NAIVE MODEL FOR {}'.format(trait))
    print("="*80)  
    print('\nAccuracy of naive: {:.2f}%'.format(accuracy_score(data[trait], y_naive)))
    print(classification_report(data[trait], y_naive))

NAIVE MODEL FOR HEX1_eX_nom

Accuracy of naive: 0.35%
             precision    recall  f1-score   support

    average       0.39      0.42      0.40      1699
       high       0.39      0.32      0.35      1704
        low       0.25      0.30      0.28      1129

avg / total       0.36      0.35      0.35      4532

NAIVE MODEL FOR HEX2_A_nom

Accuracy of naive: 0.35%
             precision    recall  f1-score   support

    average       0.40      0.40      0.40      1781
       high       0.29      0.32      0.31      1273
        low       0.34      0.30      0.32      1478

avg / total       0.35      0.35      0.35      4532

NAIVE MODEL FOR HEX3_C_nom

Accuracy of naive: 0.33%
             precision    recall  f1-score   support

    average       0.26      0.41      0.32      1155
       high       0.40      0.32      0.35      1760
        low       0.36      0.30      0.33      1617

avg / total       0.35      0.33      0.34      4532

NAIVE MODEL FOR HEX4_E_nom

Accuracy

Seems yes. Our model ~two times more precise than naive