# In [1]:
import os
import numpy as np
import pandas as pd
import csv
import joblib
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


# --- Locations -------------------------------------------------------------
# DATA_DIR is joined with the CSV file names below; an empty string makes those
# paths relative to the current working directory.
DATA_DIR = ""
MODELS_DIR = ""

# --- Task definition -------------------------------------------------------
# One binary task per entry. For each two-letter entry the first letter names
# the files labelled 0 and the second letter the files labelled 1.
DIMENSIONS = ["IE", "NS", "FT", "PJ"]

# --- Text preprocessing limits ---------------------------------------------
TOP_WORDS = 2500  # passed to the Keras Tokenizer as num_words
MAX_POST_LENGTH = 40  # passed to pad_sequences as maxlen

# --- Run switches (not read by the code visible in this cell) ---------------
SAVE_MODEL = False
CROSS_VALIDATION = False

# The sixteen MBTI type codes. lemmatize() strips these from the posts.
# Defined once here because the list does not depend on the dimension loop.
MBTI_TYPES = [
    "INFJ",
    "ENTP",
    "INTP",
    "INTJ",
    "ENTJ",
    "ENFJ",
    "INFP",
    "ENFP",
    "ISFP",
    "ISTP",
    "ISFJ",
    "ISTJ",
    "ESTP",
    "ESFP",
    "ESTJ",
    "ESFJ",
]


def _read_posts(path):
    """Return every cell of the CSV file at ``path`` as a flat list of strings.

    Each cell of each row is treated as one post, in file order.
    """
    # newline="" is what the csv module documents for file objects it reads, so
    # line breaks inside quoted fields reach the parser untranslated.
    with open(path, "r", encoding="utf-8", newline="") as f:
        return [post for row in csv.reader(f) for post in row]


def _load_split(split, dimension):
    """Load the posts and 0/1 labels for one split of one dimension.

    Args:
        split: File-name prefix, "train" or "test".
        dimension: Two-letter entry of DIMENSIONS, e.g. "IE".

    Returns:
        A ``(posts, labels)`` pair of lists. Posts come from
        ``<split>_<first letter>.csv`` (label 0) followed by
        ``<split>_<second letter>.csv`` (label 1), both under DATA_DIR.
    """
    posts = []
    labels = []
    for label, letter in enumerate((dimension[0], dimension[1])):
        path = os.path.join(DATA_DIR, "{}_{}.csv".format(split, letter))
        letter_posts = _read_posts(path)
        posts.extend(letter_posts)
        labels.extend([label] * len(letter_posts))
    return posts, labels


# `k` stays bound to the dimension index, exactly as in the index-based loop.
for k, dimension in enumerate(DIMENSIONS):
    ### Read in data
    x_train, y_train = _load_split("train", dimension)
    x_test, y_test = _load_split("test", dimension)
    
    # NOTE: commented-out goslate translation snippet; nothing in the visible
    # code uses it.
    # import goslate
    # gs = goslate.Goslate()
    # text = input("please input the word you would like translated:\n")
    # print(gs.translate(text, 'Ar'))


    # Text tooling, rebuilt on every pass of the dimension loop.
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words("english")

    # filters="" turns off the Tokenizer's character filtering; num_words caps
    # the vocabulary used when converting texts to sequences.
    # NOTE(review): the vocabulary is fit on the raw (not yet lemmatized) train
    # AND test posts, while preprocess() tokenizes lemmatized text -- confirm
    # this mismatch and the use of test data here are intended.
    tokenizer = Tokenizer(num_words=TOP_WORDS, filters="")
    tokenizer.fit_on_texts([*x_train, *x_test])

    def lemmatize(x):
        """Normalize a collection of posts and return them as a NumPy array.

        For each post, in order:
          1. lowercase it;
          2. delete every occurrence of " <type>" (a space followed by a
             lowercased MBTI type code) -- a plain substring replace, so a type
             at the very start of the post is left in place and any characters
             directly after the code are kept;
          3. split on single spaces, drop tokens found in ``stop_words``, and
             lemmatize the rest with ``lemmatizer``;
          4. re-join the tokens with single spaces.

        Args:
            x: Iterable of post strings.

        Returns:
            ``np.array`` holding one processed string per input post.
        """
        # Loop-invariant work done once per call instead of once per post/token:
        # set membership is O(1) versus scanning the stop-word list, and the
        # lowercase replace tokens never change between posts.
        stop_set = set(stop_words)
        type_tokens = [" " + mbti_type.lower() for mbti_type in MBTI_TYPES]

        lemmatized = []
        for post in x:
            temp = post.lower()
            for token in type_tokens:
                temp = temp.replace(token, "")
            temp = " ".join(
                lemmatizer.lemmatize(word)
                for word in temp.split(" ")
                if word not in stop_set
            )
            lemmatized.append(temp)
        return np.array(lemmatized)

    def preprocess(x):
        """Lemmatize posts, map them to token sequences and pad them.

        Args:
            x: Iterable of post strings, passed straight to ``lemmatize``.

        Returns:
            The result of ``sequence.pad_sequences`` with
            ``maxlen=MAX_POST_LENGTH`` applied to the tokenizer's sequences.
        """
        token_ids = tokenizer.texts_to_sequences(lemmatize(x))
        return sequence.pad_sequences(token_ids, maxlen=MAX_POST_LENGTH)

    # Replace the raw post lists with their lemmatized NumPy-array versions
    # (train first, then test).
    x_train, x_test = lemmatize(x_train), lemmatize(x_test)
    

# Runs after the dimension loop has finished, so this shows the arrays left
# over from the last entry of DIMENSIONS only.
print(x_train, x_test, sep="\n")



["'i really close friend college stuff tore u apart friend group. year upon year passed made several attempt fix it, hurt let..."
 'northern light freaking love description this. helpful. would seriously cool meet someone similar me.        szartsky based profile...'
 "either get asked i'm angry told need smile least day. beyond annoying.   i'm also pretty even keeled content majority time long i'm allowed to..."
 ...
 'argument work better used describe problem type profile socionics/mbti problem enneagram. agree lot description se socionics is...'
 "i'm going bookmark thread, six month i'm going bump laugh quite sure people subsequently retyped something else."
 "easy. self-preservation 7w6. excitement, possibilities, freedom, independence, uncomfortable structure routine, us distraction avoid negative feeling like stress. lot positive...'"]
["'i got 593.  i've read enneagram i'm 953, though.  read somewhere lot 9's mistype 5's."
 'gtfo feeler!'
 'lot stuff read description applies t