In [None]:
import pandas as pd
import spacy
import regex as re
import nltk
import numpy as np
from nltk.corpus import stopwords
from collections import Counter
from string import punctuation
from nltk.tokenize import word_tokenize


In [None]:
# Load the raw dataset from disk
df = pd.read_csv('mbti_1.csv')

# Shared NLP resources read by the preprocessing helpers
for _nltk_pkg in ('stopwords', 'punkt'):
    nltk.download(_nltk_pkg)
nlp = spacy.load("en_core_web_sm")
cachedStopWords = stopwords.words("english")

# Distinct lower-cased values of the 'type' column
types = df['type'].tolist()
set_types = {label.lower() for label in types}
print(set_types)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'infj', 'enfj', 'intp', 'isfp', 'istj', 'estp', 'esfp', 'istp', 'entp', 'estj', 'esfj', 'isfj', 'infp', 'intj', 'enfp', 'entj'}


In [None]:
def remove_stop(row):
  """Drop stop words and type labels from a whitespace-separated string.

  Args:
    row: Text to filter. It is tokenised with ``str.split()`` and every token
      is compared verbatim (case-sensitively) against the module-level
      ``cachedStopWords`` and ``set_types`` collections.

  Returns:
    The surviving tokens, in their original order, joined by single spaces.
  """
  # Merge both block-lists into one set so each token needs a single O(1)
  # lookup instead of a linear scan of the stop-word list, and the text is
  # split once rather than twice. Only reads the globals, so no `global`
  # declaration is required.
  blocked = set(cachedStopWords) | set(set_types)
  return ' '.join(word for word in row.split() if word not in blocked)

In [None]:
def lemmatize(row):
  """Return `row` with each spaCy token replaced by its lemma, space-joined."""
  lemmas = []
  for token in nlp(row):
    lemmas.append(token.lemma_)
  return ' '.join(lemmas)

In [None]:
def get_keywords(text):
    """Keep only punctuation marks and proper-noun / adjective / noun tokens.

    Args:
        text: String to analyse; it is parsed with the module-level spaCy
            pipeline ``nlp``.

    Returns:
        The texts of the retained tokens, in document order, joined by
        single spaces.
    """
    kept_pos = {'PROPN', 'ADJ', 'NOUN'}
    # `punctuation` is a plain string, so `token.text in punctuation` is a
    # substring test that also accepts multi-character runs such as "./".
    # Compare against the individual punctuation characters instead.
    punctuation_marks = set(punctuation)
    # One combined condition guarantees that a token which is a punctuation
    # mark *and* carries a kept POS tag is emitted once rather than twice.
    return ' '.join(
        token.text
        for token in nlp(text)
        if token.text in punctuation_marks or token.pos_ in kept_pos
    )

In [None]:
def remove_unwanted_space(text):
    """Normalise sentence spacing around periods.

    Args:
        text: String whose sentences are delimited by '.'.

    Returns:
        The whitespace-trimmed, non-empty fragments of `text` joined by
        '. '. Empty or whitespace-only fragments (including the one that
        follows a trailing period) are dropped, so the result never ends
        with a period.
    """
    # Splitting on '.' can never produce a '.' fragment, so only empty
    # fragments have to be filtered out. A single generator pass replaces the
    # repeated list.remove() calls, which rescanned the list for every hit.
    fragments = (piece.strip() for piece in text.split('.'))
    return ". ".join(piece for piece in fragments if piece)

In [None]:
# Re-read the same CSV that the setup cell loaded, rebinding `df` to a fresh frame.
df = pd.read_csv('mbti_1.csv')

In [None]:
def process_text(df):
  """Clean the 'posts' column of `df` and return the same frame.

  Each entry is lower-cased, stripped of URLs, apostrophes and every
  character other than a-z, space and '.', then passed through
  ``remove_stop``, ``lemmatize``, ``get_keywords`` and
  ``remove_unwanted_space`` in that order.

  Args:
    df: DataFrame with a string column named 'posts'.

  Returns:
    `df` itself; its 'posts' column is overwritten in place.
  """
  def _clean(post):
    post = post.lower()
    # Turn the '|||' separator into whitespace *before* stripping URLs and
    # non-letters. Otherwise `http\S+` swallows the first word after a link
    # and the character filter glues the last word of one post to the first
    # word of the next (e.g. "pranks|||what" -> "prankswhat").
    # NOTE(review): assumes posts within a row are delimited by '|||', as in
    # the Kaggle MBTI dump - confirm for other inputs (harmless if absent).
    post = post.replace('|||', ' ')
    post = re.sub(r'http\S+', '', post)
    post = post.replace("'", "")
    post = re.sub(r'[^ a-z\.]+', '', post)
    post = remove_stop(post)
    post = lemmatize(post)
    post = get_keywords(post)
    return remove_unwanted_space(post)

  # One pass over the column instead of eight successive .apply() sweeps.
  df['posts'] = df['posts'].apply(_clean)
  return df

In [None]:
# Run the cleaning pipeline; process_text overwrites df['posts'] and returns the same frame.
df = process_text(df)

In [None]:
# Persist the cleaned frame; the row index is written as well because `index=False` is not passed.
df.to_csv('preprocessed.csv')

In [None]:
# Display the preprocessed frame.
df

Unnamed: 0,type,posts
0,INFJ,moment sportscenter top prankswhat lifechangin...
1,ENTP,lack post alarming.sex bore position . example...
2,INTP,good course blessing positive good friend amaz...
3,INTJ,dear conversation day . esoteric gabbing natur...
4,ENTJ,silly misconception . approach key unlocking e...
...,...,...
8670,ISFP,cat fi dom reason . website neo nazis perc.im ...
8671,ENFP,thread someplace hereooop movie watch thread t...
8672,INTP,many question thing . purple pill . win lotter...
8673,INFP,conflict child . maternal instinct . none clos...


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls
!pwd
print('Indexing word vectors.')

embeddings_index = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

--2022-11-03 16:48:36--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-11-03 16:48:36--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-11-03 16:48:36--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
def do_embedding(row):
  """Look up the `embeddings_index` vector of every token in `row`.

  Args:
    row: Either a string or an already-tokenised iterable of strings. A
      string is split into runs of non-space, non-period characters, and
      each '.' is kept as a token of its own.

  Returns:
    A list holding, in order, the `embeddings_index` entry of each token
    that has one; tokens without an entry are skipped.
  """
  # Iterating over a str yields single characters, so looping over the raw
  # post embedded individual letters instead of words. Tokenise strings first;
  # periods are split off so that "word." still matches the entry for "word".
  if isinstance(row, str):
    tokens = re.findall(r'[^\s.]+|\.', row)
  else:
    tokens = row
  # Explicit membership test instead of a bare `except`, which would also
  # have hidden unrelated failures.
  return [embeddings_index[token] for token in tokens if token in embeddings_index]

def word_embeddings(df):
  """Add a 'vectors' column holding do_embedding(post) for every df['posts'] entry.

  The frame is modified in place and the very same object is returned.
  """
  vectors = df['posts'].apply(do_embedding)
  df['vectors'] = vectors
  return df

In [None]:
# word_embeddings adds the 'vectors' column to `df` in place and returns it,
# so `vectorized_df` is another name for the same frame, not a copy.
vectorized_df = word_embeddings(df)
# Save the frame, including the per-row vector lists, as a pickle file.
vectorized_df.to_pickle("vectorized.pkl")

In [None]:
# Both names refer to the same frame (word_embeddings returns its argument),
# so the two lengths printed here are always equal.
print(len(df), len(vectorized_df))
vectorized_df

8675 8675


Unnamed: 0,type,posts,vectors
0,INFJ,moment sportscenter top prankswhat lifechangin...,"[[0.29492, 0.56874, -0.20245, 0.50244, -0.6829..."
1,ENTP,lack post alarming.sex bore position . example...,"[[-0.45433, 1.0234, 0.024278, -0.086367, -0.69..."
2,INTP,good course blessing positive good friend amaz...,"[[-0.37628, 0.37102, 0.32594, -0.085084, -0.55..."
3,INTJ,dear conversation day . esoteric gabbing natur...,"[[-0.91091, 0.50459, 0.058175, -0.78618, 0.088..."
4,ENTJ,silly misconception . approach key unlocking e...,"[[0.13739, 0.77891, 0.80054, 0.13819, -0.49792..."
...,...,...,...
8670,ISFP,cat fi dom reason . website neo nazis perc.im ...,"[[-0.11752, 0.97272, -0.29021, 0.25914, -0.426..."
8671,ENFP,thread someplace hereooop movie watch thread t...,"[[0.13482, 0.40224, -0.42266, -0.055631, -0.55..."
8672,INTP,many question thing . purple pill . win lotter...,"[[0.29492, 0.56874, -0.20245, 0.50244, -0.6829..."
8673,INFP,conflict child . maternal instinct . none clos...,"[[-0.11752, 0.97272, -0.29021, 0.25914, -0.426..."


In [None]:
# Read pickle
# Reloads the frame written by to_pickle("vectorized.pkl") earlier in this notebook.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
vectorized_df = pd.read_pickle("vectorized.pkl")