In [45]:
import os
import numpy as np
import pandas as pd
import csv
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Presumably the directory for saved models; not referenced in the code visible here.
MODELS_DIR = ""
# Directory holding the train_<letter>.csv / test_<letter>.csv files.
# An empty string makes os.path.join() resolve them relative to the working directory.
DATA_DIR = ""
# Two-letter codes, one per classification task. The first letter names the
# file whose posts get label 0, the second letter the file whose posts get label 1.
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
# Passed to the Keras Tokenizer as `num_words`.
TOP_WORDS = 2500
# Passed to `sequence.pad_sequences` as `maxlen`.
MAX_POST_LENGTH = 40
# Neither flag is read in the code visible here — presumably used by later cells; confirm.
CROSS_VALIDATION = False
SAVE_MODEL = False

# Everything down to the loop is independent of the dimension being processed,
# so it is defined once instead of being rebuilt on every iteration.
MBTI_TYPES = [
    "INFJ",
    "ENTP",
    "INTP",
    "INTJ",
    "ENTJ",
    "ENFJ",
    "INFP",
    "ENFP",
    "ISFP",
    "ISTP",
    "ISFJ",
    "ISTJ",
    "ESTP",
    "ESFP",
    "ESTJ",
    "ESFJ",
]

stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()


def _read_posts(path, label, posts, labels):
    """Append every CSV cell of *path* to *posts* and one *label* per cell to *labels*."""
    with open(path, "r", encoding="utf-8") as f:
        for row in csv.reader(f):
            posts.extend(row)
            labels.extend([label] * len(row))


def lemmatize(x):
    """Normalize each post in *x* and return the results as a NumPy array.

    Each post is lower-cased, every lower-cased MBTI type code that is
    preceded by a space is removed (together with that space), the text is
    split on single spaces, English stop words are dropped and the remaining
    words are lemmatized and re-joined with single spaces.
    """
    # Built once per call so the per-word membership test is O(1).
    stop_set = set(stop_words)
    # Only occurrences preceded by a space are stripped (same as before).
    type_markers = [" " + mbti_type.lower() for mbti_type in MBTI_TYPES]

    lemmatized = []
    for post in x:
        temp = post.lower()
        for marker in type_markers:
            temp = temp.replace(marker, "")
        temp = " ".join(
            lemmatizer.lemmatize(word)
            for word in temp.split(" ")
            if word not in stop_set
        )
        lemmatized.append(temp)
    return np.array(lemmatized)


def preprocess(x):
    """Lemmatize *x*, map it to integer sequences and pad to MAX_POST_LENGTH.

    Uses the module-level `tokenizer`, i.e. the one fitted for the most
    recently processed dimension.
    """
    lemmatized = lemmatize(x)
    tokenized = tokenizer.texts_to_sequences(lemmatized)
    return sequence.pad_sequences(tokenized, maxlen=MAX_POST_LENGTH)


for k, dimension in enumerate(DIMENSIONS):
    x_train, y_train = [], []
    x_test, y_test = [], []

    ### Read in data
    # Same order as before: train files first, then test files; within each
    # split the first letter of the dimension is label 0, the second label 1.
    for split, posts, labels in (
        ("train", x_train, y_train),
        ("test", x_test, y_test),
    ):
        for label in (0, 1):
            csv_path = os.path.join(
                DATA_DIR, "{}_{}.csv".format(split, dimension[label])
            )
            _read_posts(csv_path, label, posts, labels)

    # NOTE(review): the tokenizer is fitted on the raw (un-lemmatized) train
    # and test posts together, while preprocess() feeds it lemmatized text.
    # Kept as-is to preserve behaviour; confirm this is intended.
    tokenizer = Tokenizer(num_words=TOP_WORDS, filters="")
    tokenizer.fit_on_texts(x_train + x_test)

    x_train = lemmatize(x_train)
    x_test = lemmatize(x_test)

# After the loop, x_train / x_test (and y_train / y_test, tokenizer) hold the
# data of the LAST entry in DIMENSIONS only.
    

# Dump both lemmatized splits, then a run of blank lines as a visual separator.
for split_posts in (x_train, x_test):
    print(split_posts)
print("\n\n\n\n\n\n\n\n\n\n")

## Sample representation
# Work on the first 1001 lemmatized training posts (slice end is exclusive).
document = x_train[0:1001]

# Create a Vectorizer Object and learn the vocabulary of the sample
vectorizer = CountVectorizer()
vectorizer.fit(document)
vocabulary = vectorizer.vocabulary_

# Printing the identified Unique words along with their indices
print("Vocabulary: ", vocabulary)
print("\n\n")

# Count vectorizer restricted to the vocabulary learned above, followed by a
# TF-IDF transformer; both are fitted on the same sample.
pipe = Pipeline([('vectorizer', CountVectorizer(vocabulary=vocabulary)),
                 ('transformer', TfidfTransformer())]).fit(document)

# Dense document-term count matrix and the per-term IDF weights.
countVectorM = pipe['vectorizer'].transform(document).toarray()
inverseDocFreq = pipe['transformer'].idf_
# The former bare `pipe.transform(document).shape` expression was dropped: it
# was not the last statement of the cell, so its value was computed and
# silently discarded.


# Write the IDF weights (one row per vocabulary index) to CSV. Despite the
# file names these are the transformer's `idf_` values, not TF-IDF scores.
pd.DataFrame(inverseDocFreq).to_csv("tf_idf_vector.csv")

# Same weights reduced to their sorted distinct values.
distinct_idf = np.unique(inverseDocFreq)
export = pd.DataFrame(distinct_idf)
export.to_csv("tf_idf_unique_vector.csv")

#print(countVectorM)
#print("\n\n")
#print(inverseDocFreq)
#print("\n\n")
#print(inverseDocFreq.shape)
#print("\n\n")
#print(np.unique(inverseDocFreq))

["'my friend tell u guy recently slept with. guy liked said name much fan. kinda sleep around finished said..."
 'wish drunk...its long evening'
 'usually try stay fights, get dragged end mediator' ...
 'dad an. could even explain amount senseless nitpicking? chores. swear could open door house see weed cut right...'
 "lol. case i'll stop trolling troll post let silly little way :p"
 "well xnfx type, guess mean oblivious reason like whine lot. likely trying interrupt someone trying think important thing so,...'"]
["'i got 593.  i've read enneagram i'm 953, though.  read somewhere lot 9's mistype 5's."
 'gtfo feeler!'
 'lot stuff read description applies tos well.  like mentioned share dominant function. honestly, think even thing descriptions...'
 ...
 "apart obvious thing like loved one's death/betrayal (basically thing, betray they're dead me), usually get triggered someone falsely accusing me. like,..."
 "hey lionfart! guess i'm gonna shorten lion too, otherwise sound weird call som