In [None]:
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

## Loading

### Train

In [None]:
# Load the status-update corpus. Every column is read as a plain object so the
# positional row repair below runs before any dtype conversion.
train = pd.read_csv('wcpr_mypersonality.csv', dtype=object, encoding='cp1252')

# Repair row 6399: copy the values at positions 12..second-to-last one column
# to the right, then blank position 12.
# NOTE(review): presumably this row has no date in the CSV, which left every
# later value one column too far left -- confirm against the raw file.
train.iloc[6399, 13:] = train.iloc[6399, 12:-1]
train.iloc[6399, 12] = None  # Missing date

# Rename -> relabel -> cast -> aggregate, expressed as a single chain.
train = (
    train
    # Readable snake_case column names
    .rename(columns={
        '#AUTHID': 'id',
        'STATUS': 'status',
        'sEXT': 'extraversion_score',
        'sNEU': 'neuroticism_score',
        'sAGR': 'agreeableness_score',
        'sCON': 'conscientiousness_score',
        'sOPN': 'openness_score',
        'cEXT': 'extraversion_class',
        'cNEU': 'neuroticism_class',
        'cAGR': 'agreeableness_class',
        'cCON': 'conscientiousness_class',
        'cOPN': 'openness_class',
        'DATE': 'date',
        'NETWORKSIZE': 'network_size',
        'BETWEENNESS': 'betweenness_raw',
        'NBETWEENNESS': 'betweenness_normalized',
        'DENSITY': 'density',
        'BROKERAGE': 'brokerage_raw',
        'NBROKERAGE': 'brokerage_normalized',
        'TRANSITIVITY': 'transitivity'
    })
    # Descriptive labels in place of the y/n class flags
    .replace({
        'extraversion_class': {'y': 'sociable', 'n': 'shy'},
        'neuroticism_class': {'y': 'neurotic', 'n': 'calm'},
        'agreeableness_class': {'y': 'friendly', 'n': 'uncooperative'},
        'conscientiousness_class': {'y': 'organized', 'n': 'careless'},
        'openness_class': {'y': 'insightful', 'n': 'unimaginative'}
    })
    # Concrete dtypes for every column
    .astype({
        'id': 'category',
        'status': 'string',
        'extraversion_score': 'float32',
        'neuroticism_score': 'float32',
        'agreeableness_score': 'float32',
        'conscientiousness_score': 'float32',
        'openness_score': 'float32',
        'extraversion_class': 'category',
        'neuroticism_class': 'category',
        'agreeableness_class': 'category',
        'conscientiousness_class': 'category',
        'openness_class': 'category',
        'date': 'datetime64[ns]',
        'network_size': 'uint16',
        'betweenness_raw': 'float32',
        'betweenness_normalized': 'float32',
        'density': 'float32',
        'brokerage_raw': 'uint32',
        'brokerage_normalized': 'float32',
        'transitivity': 'float32'
    })
    # One row per (author, neuroticism label): that author's statuses are
    # concatenated into a single space-separated document
    .groupby(['id', 'neuroticism_class'], observed=True)['status']
    .agg(' '.join)
    .reset_index()
)

# Keep only the model input (text) and the target label
train = train[['status', 'neuroticism_class']]

# Show train
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   status             250 non-null    string  
 1   neuroticism_class  250 non-null    category
dtypes: category(1), string(1)
memory usage: 2.4 KB


### Test

In [None]:
# Load the essays corpus and normalise it in a single chain:
# read -> rename -> relabel -> cast.
test = (
    pd.read_csv('wcpr_essays.csv', dtype=object, encoding='cp1252')
    # Readable snake_case column names
    .rename(columns={
        '#AUTHID': 'id',
        'TEXT': 'text',
        'cEXT': 'extraversion_class',
        'cNEU': 'neuroticism_class',
        'cAGR': 'agreeableness_class',
        'cCON': 'conscientiousness_class',
        'cOPN': 'openness_class'
    })
    # Same descriptive class labels as used for the training frame
    .replace({
        'extraversion_class': {'y': 'sociable', 'n': 'shy'},
        'neuroticism_class': {'y': 'neurotic', 'n': 'calm'},
        'agreeableness_class': {'y': 'friendly', 'n': 'uncooperative'},
        'conscientiousness_class': {'y': 'organized', 'n': 'careless'},
        'openness_class': {'y': 'insightful', 'n': 'unimaginative'}
    })
    # Concrete dtypes for every column
    .astype({
        'id': 'string',
        'text': 'string',
        'extraversion_class': 'category',
        'neuroticism_class': 'category',
        'agreeableness_class': 'category',
        'conscientiousness_class': 'category',
        'openness_class': 'category'
    })
)

# Keep only the model input (text) and the target label
test = test[['text', 'neuroticism_class']]

# Show test
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2468 entries, 0 to 2467
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   text               2468 non-null   string  
 1   neuroticism_class  2468 non-null   category
dtypes: category(1), string(1)
memory usage: 21.9 KB


## Training

In [None]:
# Seed shared by the CV splitter and both tree ensembles so that a re-run
# reproduces the same search.
SEED = 641


def _vectorizer_grid():
    '''Return the vectorizer search space shared by every classifier grid.

    A new dict holding new vectorizer instances is built on every call, so
    the per-classifier grids below never share mutable objects.
    '''
    return {
        "vectorizer": [
            CountVectorizer(binary=True),
            CountVectorizer(),
            TfidfVectorizer()
        ],
        "vectorizer__ngram_range": [(1, 1), (2, 2)],
        "vectorizer__max_df": [0.5, 1.0],
        "vectorizer__min_df": [1, 2]
    }


# Create model: both steps are placeholders that the grids below fill in
model = Pipeline([
    ('vectorizer', None),
    ('classifier', None)
])

# Define hyperparameters: one grid per classifier family, each combined with
# the shared vectorizer search space
params = [
    {
        **_vectorizer_grid(),
        "classifier": [SVC()],
        "classifier__C": [0.01, 0.1, 1, 10, 100],
        "classifier__kernel": ["linear", "rbf"]
    },
    {
        **_vectorizer_grid(),
        "classifier": [KNeighborsClassifier()],
        "classifier__n_neighbors": [1, 3, 5, 7, 9],
        "classifier__metric": ["euclidean", "cosine"]
    },
    {
        **_vectorizer_grid(),
        "classifier": [RandomForestClassifier(random_state=SEED)]
    },
    {
        **_vectorizer_grid(),
        "classifier": [GradientBoostingClassifier(random_state=SEED)],
        "classifier__max_depth": [1, 2, 3, 4, 5]
    }
]

# Create folds: 3 stratified, shuffled splits
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

# Tune model: exhaustive search over all grids, scored by balanced accuracy
grid = GridSearchCV(model, params, scoring='balanced_accuracy', cv=cv)
grid.fit(train['status'], train['neuroticism_class'])

# Save results: the full cross-validation table, for later inspection
with open('results.pkl', 'wb') as file:
    pickle.dump(grid.cv_results_, file)

# Show results: the two steps of the winning pipeline
print(f'''Pipeline([
    {grid.best_estimator_.steps[0]},
    {grid.best_estimator_.steps[1]}
])''')

Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(2, 2)))
    ('classifier', KNeighborsClassifier(metric='euclidean', n_neighbors=9))
])


## Testing

In [None]:
# Score the tuned search object on the essays: true labels vs. the labels
# that grid.predict assigns to each essay text.
y_true = test['neuroticism_class']
y_pred = grid.predict(test['text'])

# Show results
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        calm       0.51      0.47      0.49      1235
    neurotic       0.51      0.55      0.53      1233

    accuracy                           0.51      2468
   macro avg       0.51      0.51      0.51      2468
weighted avg       0.51      0.51      0.51      2468



## Extra

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

class DenseTransformer(TransformerMixin, BaseEstimator):
    '''scikit-learn wrapper for scipy _spbase.toarray

    Stateless pipeline step that converts its input to a dense array, for
    use between a sparse-output vectorizer and an estimator that needs
    dense input.
    '''

    def fit(self, X, y=None):
        '''Do nothing and return self; there is nothing to learn.'''
        return self

    def transform(self, X):
        '''Return X as a dense array.

        Objects exposing ``toarray`` (e.g. scipy sparse matrices) are
        densified with it. Anything else is treated as already dense and
        passed through ``np.asarray`` instead of raising AttributeError.
        '''
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)

In [None]:
import nltk
import re
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus used by WordNetLemmatizer (quiet mode).
nltk.download('wordnet', quiet=True)

class LemmaTokenizer:
    '''Callable tokenizer: regex tokenization followed by lemmatization.

    Tokens are found with ``token_pattern`` and each one is passed through
    NLTK's WordNetLemmatizer.lemmatize with its default arguments.

    Parameters
    ----------
    token_pattern : str or None, keyword-only
        Regular expression that defines a token. When None, words of two
        or more word characters are matched.
    '''

    def __init__(self, *, token_pattern=None):
        pattern = r'(?u)\b\w\w+\b' if token_pattern is None else token_pattern
        # Bound methods are stored so that each call is a plain function call
        self.tokenizer = re.compile(pattern).findall
        self.lemmatizer = WordNetLemmatizer().lemmatize

    def __call__(self, x):
        '''Return the list of lemmatized tokens found in the string x.'''
        tokens = self.tokenizer(x)
        return list(map(self.lemmatizer, tokens))

In [None]:
from nltk.stem import PorterStemmer
import re

class StemTokenizer:
    '''Callable tokenizer: regex tokenization followed by Porter stemming.

    Tokens are found with ``token_pattern`` and each one is passed through
    NLTK's PorterStemmer.stem with ``to_lowercase=False``.

    Parameters
    ----------
    token_pattern : str or None, keyword-only
        Regular expression that defines a token. When None, words of two
        or more word characters are matched.
    '''

    def __init__(self, *, token_pattern=None):
        pattern = r'(?u)\b\w\w+\b' if token_pattern is None else token_pattern
        # Bound methods are stored so that each call is a plain function call
        self.tokenizer = re.compile(pattern).findall
        self.stemmer = PorterStemmer().stem

    def __call__(self, x):
        '''Return the list of stemmed tokens found in the string x.'''
        stems = []
        for token in self.tokenizer(x):
            stems.append(self.stemmer(token, to_lowercase=False))
        return stems

In [None]:
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.base import TransformerMixin
from gensim.models import Word2Vec

class Word2Vectorizer(TransformerMixin, BaseEstimator):
    '''scikit-learn wrapper for gensim Word2Vec

    ``fit`` trains a Word2Vec model on the analyzed documents; ``transform``
    represents each document by the mean of its tokens' word vectors.

    Parameters
    ----------
    analyzer : callable or None, keyword-only
        Maps one raw document to a list of tokens. When None, the default
        analyzer of scikit-learn's HashingVectorizer is used.

    Attributes
    ----------
    model_ : gensim Word2Vec
        The trained model. It exists only after ``fit``, so scikit-learn's
        fitted-state check (which looks for trailing-underscore attributes)
        reports an unfitted instance correctly.
    '''

    def __init__(self, *, analyzer=None):
        if analyzer is None:
            analyzer = HashingVectorizer().build_analyzer()
        self.analyzer = analyzer

    def fit(self, X, y=None):
        '''Train Word2Vec on the tokenized documents in X and return self.'''
        # Materialized as a list because the corpus is handed to Word2Vec whole
        sentences = [self.analyzer(x) for x in X]
        self.model_ = Word2Vec(sentences)
        return self

    def transform(self, X):
        '''Return one row per document: the mean vector of its tokens.

        Raises scikit-learn's NotFittedError (a subclass of both ValueError
        and AttributeError) when called before ``fit``.
        '''
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'model_')
        return np.vstack([self.model_.wv.get_mean_vector(self.analyzer(x)) for x in X])

In [None]:
class _Regenerator:
    '''Utility for Word2Vectorizer'''

    def __init__(self, callback, iterable):
        self.callback = callback
        self.iterable = iterable
        self.generator = None

    def __iter__(self):
        self.generator = (self.callback(x) for x in self.iterable)
        return self

    def __next__(self):
        return next(self.generator)

In [None]:
from sklearn.base import BaseEstimator
from gensim.models.doc2vec import Doc2Vec
from sklearn.feature_extraction.text import HashingVectorizer
from gensim.models.doc2vec import TaggedDocument
from sklearn.base import TransformerMixin

class Doc2Vectorizer(TransformerMixin, BaseEstimator):
    '''scikit-learn wrapper for gensim Doc2Vec

    ``fit`` trains a Doc2Vec model on the analyzed documents, each tagged
    with its position in X; ``transform`` infers one vector per document.

    Parameters
    ----------
    analyzer : callable or None, keyword-only
        Maps one raw document to a list of tokens. When None, the default
        analyzer of scikit-learn's HashingVectorizer is used.

    Attributes
    ----------
    model_ : gensim Doc2Vec
        The trained model. It exists only after ``fit``, so scikit-learn's
        fitted-state check (which looks for trailing-underscore attributes)
        reports an unfitted instance correctly.
    '''

    def __init__(self, *, analyzer=None):
        if analyzer is None:
            analyzer = HashingVectorizer().build_analyzer()
        self.analyzer = analyzer

    def fit(self, X, y=None):
        '''Train Doc2Vec on the tokenized documents in X and return self.'''
        # Each document is tagged with its positional index in X
        corpus = [TaggedDocument(self.analyzer(x), [i]) for i, x in enumerate(X)]
        self.model_ = Doc2Vec(corpus)
        return self

    def transform(self, X):
        '''Return one row per document: the vector inferred for its tokens.

        Raises scikit-learn's NotFittedError (a subclass of both ValueError
        and AttributeError) when called before ``fit``.
        '''
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'model_')
        # NOTE(review): gensim's infer_vector is presumably randomized, so
        # repeated calls may return slightly different vectors -- confirm
        # if determinism matters here.
        return np.vstack([self.model_.infer_vector(self.analyzer(x)) for x in X])

In [None]:
class _ERegenerator:
    '''Utility for Doc2Vectorizer'''

    def __init__(self, callback, iterable):
        self.callback = callback
        self.iterable = iterable
        self.generator = None

    def __iter__(self):
        self.generator = (self.callback(x, i) for i, x in enumerate(self.iterable))
        return self

    def __next__(self):
        return next(self.generator)