In [12]:
import os
import numpy as np
import pandas as pd
import csv
import joblib
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


# Both directories are empty strings, so os.path.join() leaves the bare file
# name and every path resolves against the current working directory.
DATA_DIR = ""  # location of the train_<letter>.csv / test_<letter>.csv files
MODELS_DIR = ""  # destination of accuracy reports and pickled pipelines

# One two-letter code per binary classification task.
DIMENSIONS = [
    "IE",
    "NS",
    "FT",
    "PJ",
]

TOP_WORDS = 2500  # vocabulary cap handed to the Keras Tokenizer
MAX_POST_LENGTH = 40  # maxlen handed to pad_sequences

for k in range(len(DIMENSIONS)):
    # Fresh containers for every dimension. Posts of the first letter of the
    # pair are labelled 0, posts of the second letter are labelled 1.
    x_train = []
    y_train = []
    x_test = []
    y_test = []

    ### Read in data
    # Every cell of every CSV row is treated as one post. Files are visited in
    # the order train/first letter, train/second letter, test/first letter,
    # test/second letter.
    for split, posts, labels in (
        ("train", x_train, y_train),
        ("test", x_test, y_test),
    ):
        # Each dimension code is two letters, so this yields labels 0 and 1.
        for label, letter in enumerate(DIMENSIONS[k]):
            path = os.path.join(DATA_DIR, "{}_{}.csv".format(split, letter))
            # newline="" leaves line-ending handling to the csv module, which
            # it needs in order to parse quoted fields containing line breaks.
            with open(path, "r", encoding="utf-8", newline="") as f:
                for row in csv.reader(f):
                    posts.extend(row)
                    labels.extend([label] * len(row))

    # The sixteen MBTI type codes whose mentions `lemmatize` removes from the
    # posts. The order below is the order in which they are removed.
    MBTI_TYPES = (
        "INFJ ENTP INTP INTJ ENTJ ENFJ INFP ENFP "
        "ISFP ISTP ISFJ ISTJ ESTP ESFP ESTJ ESFJ"
    ).split()

    # NLTK resources used by `lemmatize`.
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words("english")

    # Keras tokenizer consumed by `preprocess`; filters="" means no characters
    # are stripped from the text.
    # NOTE(review): the vocabulary is built from the raw train and test posts
    # together, while `preprocess` feeds the tokenizer lemmatized text --
    # confirm that this is intended.
    tokenizer = Tokenizer(filters="", num_words=TOP_WORDS)
    tokenizer.fit_on_texts(x_train + x_test)

    def lemmatize(x):
        """Lower-case, scrub and lemmatize a collection of posts.

        For each post: lower-case it, delete every occurrence of a space
        followed by an MBTI type code, split the result on single spaces,
        drop English stop words and lemmatize the remaining words.

        Args:
            x: iterable of post strings.

        Returns:
            A NumPy array holding one cleaned string per input post, in the
            same order.
        """
        # Loop invariants, built once per call instead of once per post/word:
        # the substrings to delete and a set for O(1) stop-word lookups.
        type_mentions = [" " + mbti_type.lower() for mbti_type in MBTI_TYPES]
        stop_word_set = set(stop_words)

        lemmatized = []
        for post in x:
            temp = post.lower()
            # NOTE(review): only mentions preceded by a space are removed, so
            # a type code at the very start of a post survives -- confirm
            # whether that is intended.
            for mention in type_mentions:
                temp = temp.replace(mention, "")
            lemmatized.append(
                " ".join(
                    lemmatizer.lemmatize(word)
                    for word in temp.split(" ")
                    if word not in stop_word_set
                )
            )
        return np.array(lemmatized)

    def preprocess(x):
        """Convert posts into padded integer sequences.

        The posts are cleaned by `lemmatize`, mapped to index sequences by
        the fitted `tokenizer`, and passed to `pad_sequences` with
        maxlen=MAX_POST_LENGTH.
        """
        index_sequences = tokenizer.texts_to_sequences(lemmatize(x))
        return sequence.pad_sequences(index_sequences, maxlen=MAX_POST_LENGTH)

    # Clean both splits up front; everything below works on cleaned text.
    x_train, x_test = lemmatize(x_train), lemmatize(x_test)

    ### Assign to dataframe
    df = pd.DataFrame({"text": x_train, "type": y_train})
    # frac=1 draws every row once, i.e. a random permutation of the training
    # set; the index is then renumbered from 0.
    df = df.sample(frac=1)
    df = df.reset_index(drop=True)

    ### Make pipeline
    # Bag-of-words counts -> TF-IDF weighting -> multinomial naive Bayes.
    steps = [
        ("vectorizer", CountVectorizer(stop_words="english")),
        ("transformer", TfidfTransformer()),
        ("classifier", MultinomialNB()),
    ]
    pipeline = Pipeline(steps)
    
   ### Test set classification (individual posts)
    # Fit on the shuffled training frame, then score the test posts.
    pipeline.fit(df["text"].values, df["type"].values)
    predictions = pipeline.predict(x_test)
    score = accuracy_score(y_test, predictions)
    confusion = confusion_matrix(y_test, predictions)

    # Plain-text report, one file per dimension.
    report_path = os.path.join(
        MODELS_DIR, "baseline_accuracy_{}.txt".format(DIMENSIONS[k])
    )
    report_parts = [
        "*** {}/{} TEST SET CLASSIFICATION (POSTS) ***\n".format(
            DIMENSIONS[k][0], DIMENSIONS[k][1]
        ),
        "Total posts classified: {}\n".format(len(x_test)),
        "Accuracy: {}\n".format(score),
        "Confusion matrix: \n",
        np.array2string(confusion, separator=", "),
    ]
    with open(report_path, "w", encoding="utf-8") as f:
        f.writelines(report_parts)
    print(
        f"Wrote training / test results for {DIMENSIONS[k]} here: {report_path}"
    )

    # Persist the fitted pipeline next to the report, then drop the local
    # reference before the next dimension is processed.
    pipeline_path = os.path.join(
        MODELS_DIR, "baseline_pipeline_{}.pkl".format(DIMENSIONS[k])
    )
    joblib.dump(pipeline, pipeline_path)
    del pipeline

Wrote training / test results for IE here: baseline_accuracy_IE.txt
[1]
Wrote training / test results for NS here: baseline_accuracy_NS.txt
[1]
Wrote training / test results for FT here: baseline_accuracy_FT.txt
[0]
Wrote training / test results for PJ here: baseline_accuracy_PJ.txt
[0]
