In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import spacy

In [None]:
# Load the personality dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('data/personality.csv')

In [None]:
# Display the freshly loaded frame for a first visual inspection.
data

In [None]:
# Sanity check: compare each post_* column with its neighbour. The expression is
# True only when post_extrovert, post_feeling, post_judging and post_sensing hold
# the same values in every row (all() short-circuits on the first mismatch).
all(
    data[left].equals(data[right])
    for left, right in (
        ('post_extrovert', 'post_feeling'),
        ('post_feeling', 'post_judging'),
        ('post_judging', 'post_sensing'),
    )
)

In [None]:
# Keep a single text column: drop three of the post_* columns and rename the
# remaining one (post_extrovert) to `post`.
data = (
    data
    .drop(columns=['post_feeling', 'post_judging', 'post_sensing'])
    .rename(columns={'post_extrovert': 'post'})
)


In [None]:
# Cast the four trait columns (extrovert, feeling, judging, sensing) to int
# with a single astype mapping instead of one assignment per column.
data = data.astype({
    'extrovert': int,
    'feeling': int,
    'judging': int,
    'sensing': int,
})

In [None]:
# Build a `personality` code by concatenating the four trait columns as text,
# in the order extrovert, feeling, judging, sensing. Each column is converted
# to str first so the values are joined character-wise rather than added.
data['personality'] = data['extrovert'].astype(str).str.cat(
    data[['feeling', 'judging', 'sensing']].astype(str)
)

In [None]:
# Print a summary of the current frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Assign a unique integer id to each distinct value of the personality column.
# Casting to 'category' alone keeps the original string labels; the integer id
# of each category is exposed through `.cat.codes` (categories are ordered by
# sorted label, so equal labels always receive the same id).
data['personality_id'] = data['personality'].astype('category').cat.codes
data

In [None]:
# Encode the personality strings as integer class ids. The encoder is fitted
# first and applied second so the fitted `le` object stays available for
# mapping ids back to labels later.
le = LabelEncoder()
le.fit(data['personality'])
data['personality_id'] = le.transform(data['personality'])
data


In [None]:
# Count how many rows fall into each concatenated personality code.
data['personality'].value_counts()

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible. Features are the raw posts, targets the encoded ids.
X_train, X_test, y_train, y_test = train_test_split(
    data['post'],
    data['personality_id'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a TF-IDF vocabulary on the training posts; min_df=3 drops terms that
# appear in fewer than 3 documents.
tfidf = TfidfVectorizer(min_df=3)
# NOTE(review): this rebinds X_train from the raw posts to the fitted TF-IDF
# matrix, so the raw training text is no longer reachable through X_train after
# this cell runs. X_test is left as raw text and must be passed through
# tfidf.transform() before it is given to a model trained on X_train.
X_train = tfidf.fit_transform(X_train)

In [None]:
# Train a logistic-regression baseline on the training features using all
# available cores. fit() returns the estimator itself, so the fitted model is
# bound directly and then echoed as the cell output.
lr = LogisticRegression(n_jobs=-1).fit(X_train, y_train)
lr

In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the predictions second. Accuracy happens to be
# symmetric, but keeping the documented order avoids silently wrong results if
# this cell is later adapted to an asymmetric metric (precision, recall, ...).
train_accuracy = accuracy_score(y_train, lr.predict(X_train))
# X_test still holds raw text, so it is vectorized with the fitted tfidf first.
test_accuracy = accuracy_score(y_test, lr.predict(tfidf.transform(X_test)))

# Printed as "<train accuracy> <test accuracy>".
print(train_accuracy, test_accuracy)


In [None]:
class Tokenizer:
    """Thin wrapper around a spaCy pipeline that turns a document into a list of strings.

    The pipeline is loaded once in ``__init__`` and stored on ``self.nlp``;
    both methods call it on the given document and read one attribute per token.
    """

    def __init__(self):
        """Load the ``en_core_web_sm`` spaCy model and keep it on the instance."""
        self.nlp = spacy.load('en_core_web_sm')

    def tokenize(self, doc):
        """Return the ``text`` of every token produced by ``self.nlp(doc)``, in order."""
        surface_forms = []
        for token in self.nlp(doc):
            surface_forms.append(token.text)
        return surface_forms

    def lemmatize(self, doc):
        """Return the ``lemma_`` of every token produced by ``self.nlp(doc)``, in order."""
        lemmas = []
        for token in self.nlp(doc):
            lemmas.append(token.lemma_)
        return lemmas

# Shared tokenizer instance; constructing it loads the spaCy model once.
tok = Tokenizer()

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Text-classification pipeline: spaCy-tokenized TF-IDF features (with
# sublinear term-frequency scaling) feeding a multinomial Naive Bayes model.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(sublinear_tf=True, tokenizer=tok.tokenize)),
    ("clf", MultinomialNB()),
])

# Hyper-parameters explored by the grid search (2 x 2 = 4 combinations).
parameters = {
    "tfidf__ngram_range": ((1, 1), (1, 2)),  # unigrams or uni+bigrams
    "tfidf__lowercase": [True, False],       # lowercase yes / no
}

# The pipeline starts with its own TfidfVectorizer, so it must be fitted on raw
# text. X_train was rebound to an already-vectorized sparse matrix in an earlier
# cell, which makes every fit fail. Recover the raw training posts from `data`
# through the row labels that train_test_split preserved on y_train.
X_train_text = data.loc[y_train.index, "post"]
assert len(X_train_text) == len(y_train), "raw posts and labels are misaligned"

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
grid_search.fit(X_train_text, y_train)
best_parameters = grid_search.best_estimator_.get_params()

print("Best parameter settings:")
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Pipeline refitted on the full training set with the best parameter combination.
model = grid_search.best_estimator_