In [None]:
import os, sys, time
import multiprocessing
import pickle
import re, string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

## Preprocess

TODO:
* Add intermediate data storage
* Remove stop words first

In [None]:
# Load the dataset and the stop-word list from CSV files in the working directory.
data = pd.read_csv("mbti_1.csv")
# Each row of `data` is treated as one user; `posts` is that user's raw text column.
posts = data["posts"]
n_users = data.shape[0]
# Flatten the stop-word table into a 1-D array so it can be searched directly.
stopwords = pd.read_csv("stopwords.csv").to_numpy().ravel()

In [None]:
# Bar chart of how many rows (users) carry each value of the "type" column.
fig, ax = plt.subplots(figsize=(10,4))
type_val = data["type"].value_counts()  # sorted by count, most frequent first
labels = type_val.keys()
x = np.arange(len(labels))
ax.bar(x, type_val.values)
ax.set_xlabel("type")
ax.set_ylabel("# of people")
ax.set_title("Number of users per type")
ax.set_xticks(x)
# Rotation must be a number (degrees) or 'vertical'/'horizontal'; the string
# '45' is rejected with a ValueError by current matplotlib releases.
ax.set_xticklabels(labels, rotation=45)
# Draw the dashed horizontal grid lines behind the bars.
ax.set_axisbelow(True)
ax.yaxis.grid(color='gray', linestyle='dashed')
fig.tight_layout()
plt.show()

In [None]:
# Matches a URL up to and including the next space or quote. The `$`
# alternative also catches a URL that runs to the end of the text, which would
# otherwise survive and be shredded into junk tokens by the punctuation pass.
_URL_PATTERN = re.compile(r"""https?://.*?(?:[ '"]|$)""")
_DIGITS_PATTERN = re.compile(r"[0-9]+")
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_UNK_TOKEN = "<UNK>"


def _tokenize_post(raw_post, stop_set):
    """Clean one user's raw text and return its list of kept tokens."""
    # Pad the "|||" separators and commas with spaces first, so that a URL
    # directly followed by one of them still ends on a space for the URL regex.
    text = raw_post.replace("|||", " ||| ").replace(",", ", ")
    text = _URL_PATTERN.sub("", text)
    # Insert a space after periods and apostrophes so that words do not get
    # glued together once the punctuation itself is stripped.
    text = text.replace(".", ". ").replace("'", "' ")
    # Drop digits and ASCII punctuation (this also removes the "|||" markers).
    text = _DIGITS_PATTERN.sub("", text)
    text = text.translate(_PUNCTUATION_TABLE)
    # str.split() already collapses runs of whitespace and ignores the ends.
    return [
        word
        for word in text.lower().split()
        if len(word) != 1 and word not in stop_set
    ]


def preprocess(post_texts=None, stop_words=None, min_count=3):
    """Build a frequency-ranked vocabulary from the users' posts.

    Parameters
    ----------
    post_texts : iterable of str, optional
        One raw text per user. Defaults to the notebook-level ``posts``.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the notebook-level ``stopwords``.
    min_count : int, optional
        Words occurring fewer than this many times are dropped (default 3).

    Returns
    -------
    dict
        Maps each kept word to its integer index, where a lower index means a
        higher count. ``"<UNK>"`` is always present and always has index 0.
    """
    if post_texts is None:
        post_texts = posts
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership tests instead of scanning the whole
    # stop-word array for every token.
    stop_set = set(stop_words)
    n_texts = len(post_texts)

    # Count tokens user by user rather than first materialising every token of
    # the corpus in one giant list.
    word_counts = Counter()
    for uid, raw_post in enumerate(post_texts):
        word_counts.update(_tokenize_post(raw_post, stop_set))
        if uid * 100 % n_texts == 0:
            print("Done {}/{} = {}%".format(uid,n_texts,uid*100/n_texts))
    print("Finished generating word list")

    # make dictionary (bag of words, BOW)
    # <UNK> gets the highest count so that it sorts to index 0; `default=0`
    # keeps this working when no token survived the cleaning.
    word_counts[_UNK_TOKEN] = max(word_counts.values(), default=0) + 1
    # remove words that do not occur frequently enough
    print("# of words before:",len(word_counts))
    for word in list(word_counts): # copy the keys: the Counter shrinks while looping
        # <UNK> is exempt so that it survives even on a very small corpus.
        if word != _UNK_TOKEN and word_counts[word] < min_count:
            del word_counts[word]
    print("# of words after:",len(word_counts))
    # sort by count (descending), keeping only the word strings
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)

    # index words by their occurrence-frequency rank
    word_to_int = {word: index for index, word in enumerate(sorted_vocab)}
    print('Vocabulary size:', len(word_to_int))
    return word_to_int

# Keep the vocabulary for later cells; a bare call would discard it and dump
# the entire dict into the cell output.
word_to_int = preprocess()