**Abstract**: This program uses Regular Expression (`re`) and Natural Language Toolkit (`nltk`) to clean raw post data and collect some features of the data. It uses object-oriented programming (OOP) strategy and creates father class `Data_to_Clean` and derived class `Data_to_Analyze` including various methods to clean and analyze data.

### Import modules and load data

In [376]:
# Import necessary modules

# Module to load raw data(CSV file)
import pandas as pd

# Modules for NLP
import re # Regular Expression
import string
from typing import List
import nltk # Natural Language Toolkit
from nltk.tokenize import word_tokenize # For text tokenization
from nltk.corpus import stopwords,wordnet # For stopwords removal
# For tokens part-of-speech tagging and lemmatization
from nltk import pos_tag 
from nltk.stem import WordNetLemmatizer
my_nltk_path="Data"
nltk.data.path.append(my_nltk_path)
import textstat # Evaluate text readability
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer # Evaluate text emotion
# Transformers model to evaluate text emotion
from transformers import pipeline
import torch

# Modules to read/write external files,etc.
import json
import pickle
import copy

from tqdm import tqdm

# Average function
def ave(l):
    return sum(l)/len(l)

# MBTI type dictionary
MBTI_types = [
    'istj', 'isfj', 'infj', 'intj', 
    'istp', 'isfp', 'infp', 'intp', 
    'estp', 'esfp', 'enfp', 'entp', 
    'estj', 'esfj', 'enfj', 'entj'
    ]

# Data loading and spliting 
raw_data=pd.read_csv("Data\\twitter_MBTI.csv",encoding='utf-8')
raw_data.drop(columns="Unnamed: 0",inplace=True)
raw_data.columns=["posts","type"]
# for i in raw_data.index:
#     temp=raw_data.loc[i,"text"]
#     temp=temp.split("|||")
#     raw_data.loc[i,"text"]=temp

In [377]:
raw_data.head(20)

Unnamed: 0,posts,type
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj
1,@Hispanthicckk Being you makes you look cute||...,intj
2,@Alshymi Les balles sont réelles et sont tirée...,intj
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj
5,Frances Farmer Will Have Her Revenge On Seattl...,intj
6,🤣🤭🤣🤭🤣🤭🤣🤭 https://t.co/2a0tICP1yk|||Blind faith...,intj
7,proud of this one it goes hard https://t.co/RQ...,intj
8,@Ieokuras so amazing!|||@hxhrats @ETTUKILLUG w...,intj
9,@JadMitri Good luck Jad!|||@ElsaYaghi A lawyer...,intj


In [378]:
raw_data["type"].value_counts()

type
infp    1282
infj    1057
intp     811
intj     781
enfp     729
entp     577
enfj     518
isfp     367
isfj     364
istp     327
entj     279
istj     259
esfp     174
esfj     105
estp     100
estj      81
Name: count, dtype: int64

In [379]:
for i in raw_data.index:
    raw_data.loc[i,"posts"]=raw_data.loc[i,"posts"].split("|||")

In [380]:
raw_data.head(20)

Unnamed: 0,posts,type
0,[@Pericles216 @HierBeforeTheAC @Sachinettiyil ...,intj
1,"[@Hispanthicckk Being you makes you look cute,...",intj
2,[@Alshymi Les balles sont réelles et sont tiré...,intj
3,"[I'm like entp but idiotic, Hey boy, do you wa...",intj
4,[@kaeshurr1 Give it to @ZargarShanif ... He ha...,intj
5,[Frances Farmer Will Have Her Revenge On Seatt...,intj
6,"[🤣🤭🤣🤭🤣🤭🤣🤭 https://t.co/2a0tICP1yk, Blind faith...",intj
7,[proud of this one it goes hard https://t.co/R...,intj
8,"[@Ieokuras so amazing!, @hxhrats @ETTUKILLUG w...",intj
9,"[@JadMitri Good luck Jad!, @ElsaYaghi A lawyer...",intj


### Create a class to clean data

In [381]:
class Data_to_Clean:

    # Load the contraction map in class
    with open(file="contractions.json",mode='r',encoding='utf-8') as f:
        contractions_map=json.load(f)
    def __init__(self,source=raw_data):
        #self.data should be ALL THE POSTS, type:pd.Series
        self.data=source
    
    # Remove "@Someone"
    def remove_mention_and_tag(self):
        def process_removal(post):
            post_without_mention=[]
            for sentence in post:
                # Use re to scan and substitute
                post_without_mention.append(
                    re.sub(
                        pattern=r'@\w+|#\w+',
                        repl=" ",
                        string=sentence
                    )
                )
            return post_without_mention
        self.data["posts"]=self.data["posts"].apply(process_removal)
        
    # Remove URL
    def remove_url(self):
        def process_remove_url(post):
            post_without_url=[]
            for sentence in post:
                # Use re to scan and substitute
                post_without_url.append(
                re.sub(
                    pattern=r'http\S+|www\S+|https\S+|\n',
                    repl='',
                    string=sentence,
                    flags=re.MULTILINE
                    )
                )
            return post_without_url
        self.data["posts"]=self.data["posts"].apply(process_remove_url)
    
    # Remove emoji
    def remove_emoji(self):
        def process_remove_emoji(post):
            post_without_emoji=[]
            for sentence in post:
                # Use re to scan and substitute
                emoji_pattern=re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
        "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed Characters, etc.
        "\U0001f926-\U0001f937"  # Supplemental Symbols and Pictographs
        "\U00010000-\U0010ffff"  # Broader range for some less common emojis
        "]+", flags=re.UNICODE
                )
                post_without_emoji.append(
                    emoji_pattern.sub(
                        repl=" ",
                        string=sentence
                    )
                )
            return post_without_emoji
        self.data["posts"]=self.data["posts"].apply(process_remove_emoji)
        

    # Expand contractions
    @staticmethod
    def text_expand(original_string, contraction_mapping=contractions_map):

        standardized_contraction_map = {k.lower(): v for k, v in contraction_mapping.items()}

        sorted_contractions = sorted(
            standardized_contraction_map.items(),
            key=lambda item: len(item[0]),
            reverse=True
        )

        pattern_parts = []
        for contraction, _ in sorted_contractions:
            pattern_parts.append(r'\b' + re.escape(contraction) + r'\b')

        if not pattern_parts:
            return original_string

        contractions_pattern = re.compile(
            '({})'.format('|'.join(pattern_parts)),
            flags=re.IGNORECASE
        )

        def text_mapping(match_obj):
            old_text = match_obj.group(0)
            new_text = standardized_contraction_map.get(old_text.lower())
            
            if new_text:
                return new_text + " "
            else:
                return old_text + " "
        
        expanded_string = contractions_pattern.sub(
            repl=text_mapping,
            string=original_string
        )
        final_result = expanded_string.strip()
        return final_result
    # Apply the function to dataset
    def expand_contractions(self):
        def process_expand_contractions(original_list):
            for idx in range(len(original_list)):
                original_list[idx]=Data_to_Clean.text_expand(original_list[idx])
            return original_list
        self.data["posts"]=self.data["posts"].apply(lambda x:process_expand_contractions(x))

    # Convert to lower case
    def tolower(self):
        def process_tolower(post):
            return [
                sentence.lower() for sentence in post
            ]
        self.data["posts"]=self.data["posts"].apply(process_tolower)
    
    # Remove punctuations
    def remove_punct(self):
        def process_remove_punct(post):
            post_without_punct=[]
            for sentence in post:
                post_without_punct.append(
                    re.sub(
                    pattern=r'[^a-zA-Z\s]',
                    repl=' ',
                    string=sentence
                    )
                )
            return post_without_punct
        self.data["posts"]=self.data["posts"].apply(process_remove_punct)
        
    # Remove empty string and whitespace characters
    def remove_whitespace(self):
        def process_remove_whitespace(post):
            return [
                sentence for sentence in post if sentence.strip()
            ]
        self.data["posts"]=self.data["posts"].apply(process_remove_whitespace)

    # Text tokenization
    def totokens(self):
        def process_totokens(post):
            post_totokens=[]
            for sentence in post:
                tokens=word_tokenize(sentence)
                post_totokens.append(tokens)
            return post_totokens
        self.data["posts"]=self.data["posts"].apply(process_totokens)
    
    # Remove stopwords in tokenized text
    def remove_stopwords(self):
        def process_remove_stopwords(post):
            stop_words=set(stopwords.words("english"))
            filtered_post=[]
            for sentence in post:
                filtered_sentence=[]
                for word in sentence:
                    if word not in stop_words:
                        filtered_sentence.append(word)
                filtered_post.append(filtered_sentence)
            return filtered_post
        self.data["posts"]=self.data["posts"].apply(process_remove_stopwords)

    # Lemmatization
    def post_lemmatize(self):
        def process_lemmatize(post):
            # Convert format of part-of-speech tags
            def get_wordnet_postag(old_postag):
                if old_postag.startswith('J'):  
                    return wordnet.ADJ 
                elif old_postag.startswith('V'):  
                    return wordnet.VERB
                elif old_postag.startswith('N'):  
                    return wordnet.NOUN  
                elif old_postag.startswith('R'):  
                    return wordnet.ADV  
                else:  
                    return wordnet.NOUN
            lemmatizer=WordNetLemmatizer()
            lemmatized_post=[]
            for tokens in post:
                lemmatized_tokens=[]
                # Part of speech tagging
                tagged_tokens=pos_tag(tokens)
                # Lemmatize tokens
                for word,tag in tagged_tokens:
                    lemmatized_tokens.append(lemmatizer.lemmatize(word,get_wordnet_postag(tag)))
                lemmatized_post.append(lemmatized_tokens)
            return lemmatized_post
        self.data["posts"]=self.data["posts"].apply(process_lemmatize)
        
    

### Create a derived class to analysis data

In [382]:
class Data_to_Analyze(Data_to_Clean):
    def __init__(self,type,source=raw_data):
        # First initialize an object of father class(Data_to_Clean)
        super().__init__(source)
        # self.data is of type pd.DataFrame, now specific the MBTI type
        self.data=self.data.loc[self.data["type"]==type].reset_index(drop=True)
        self.data_to_vec=None
        # Store bacic identities of the text
        self.basic_identities=pd.Series({

            "type":type,
            # Number of sentences in a post
            "sentence_quantity":[],
            "ave_sentence_quantity":None,
            # Number of words in a post
            "word_count":[],
            "ave_word_count":None,
            # Ratio of upper case characters in a post
            "upper_ratio":[],
            "ave_upper_ratio":None,
            # Two indicators of text readability: Flesch Reading Ease and Gunning Fog Index 
            "reading_ease":[],
            "ave_reading_ease":None,
            "GF_index":[],
            "ave_GF_index":None,
            # Overall text emotion indicator
            "overall_vader_score":None
        })

    # Design various methods to get identity data

    def get_sentence_quantity(self):
        for post in self.data["posts"].values:
            self.basic_identities["sentence_quantity"].append(len(post))
        self.basic_identities["ave_sentence_quantity"]=ave(self.basic_identities["sentence_quantity"])
    
    def get_word_count(self):
        for post in self.data["posts"].values:
            ans=0
            for sentence in post:
                ans+=len(sentence.split(" "))
            self.basic_identities["word_count"].append(ans)
        self.basic_identities["ave_word_count"]=ave(self.basic_identities["word_count"])
 
    def get_upper_ratio(self):
        for post in self.data["posts"].values:
            char_count=0;upper_count=0
            for sentence in post:
                for char in sentence:
                    if char.isalpha():
                        char_count+=1
                        if char.isupper():
                            upper_count+=1
            if char_count!=0:
                self.basic_identities["upper_ratio"].append(upper_count/char_count)
            else:
                continue
        self.basic_identities["ave_upper_ratio"]=ave(self.basic_identities["upper_ratio"])
    
    def get_readability(self):
        reading_ease=[];GF_idx=[]
        for post in self.data["posts"].values:
            concatenated_post=post[0]
            for idx in range(1,len(post)):
                concatenated_post+=post[idx]
            reading_ease.append(
                textstat.flesch_reading_ease(concatenated_post)
            )
            GF_idx.append(
                textstat.gunning_fog(concatenated_post)
            )
        self.basic_identities["reading_ease"]=reading_ease
        self.basic_identities["ave_reading_ease"]=ave(self.basic_identities["reading_ease"])
        self.basic_identities["GF_index"]=GF_idx
        self.basic_identities["ave_GF_index"]=ave(self.basic_identities["GF_index"])
    @staticmethod
    def concatenate_full_post(post):
                filtered_post=[sentence for sentence in post if not sentence.isspace()]
                return "".join(filtered_post)
    def get_vader_score(self):
        analyzer = SentimentIntensityAnalyzer()
        overall_vader_score={'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
        def addup_score_dict(new_dict,base_dict):
            for key in base_dict.keys():
                base_dict[key]+=new_dict[key]
        def ave_score_dict(base_dict,n):
            for key in base_dict.keys():
                base_dict[key]/=n
        def process_vader_score(post):
            post_vader_score={'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
            for sentence in post:
               addup_score_dict(analyzer.polarity_scores(sentence),base_dict=post_vader_score) 
            ave_score_dict(base_dict=post_vader_score,n=len(post))
            addup_score_dict(new_dict=post_vader_score,base_dict=overall_vader_score)
            return post_vader_score
        self.data["vader_score"]=self.data["posts"].apply(process_vader_score)
        ave_score_dict(overall_vader_score,len(self.data["posts"]))
        self.basic_identities["overall_vader_score"]=overall_vader_score
    
    def get_transformer_emotion():
        device=0 if torch.cuda.is_available() else -1
        emotion_pipeline=pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=device
        )


#### Create a function including all the procedures of data 

In [383]:
def analyze_data(TYPE):
    data=Data_to_Analyze(type=TYPE)
    data.remove_url()
    data.remove_mention_and_tag()
    # Some features like text readability need to be collected BEFORE the following cleaning procedures
    # Otherwise, they are NOT accurate
    data.get_sentence_quantity()
    data.get_word_count()
    data.get_upper_ratio()
    data.get_readability()
    data.expand_contractions()
    data.get_vader_score()
    print(data.basic_identities["type"],":",data.basic_identities["overall_vader_score"])
    # Continue to clean the data
    data.remove_emoji()
    data.tolower()
    data.remove_punct()
    data.remove_whitespace()
    data.totokens()
    data.data_to_vec = copy.deepcopy(data.data)
    data.post_lemmatize()
    data.remove_stopwords()
    # Save cleaned data to pickle binary files so that they can be loaded easily in other programs
    with open(f"Data\\cleaned_data\\{TYPE}_cleaned.pkl","wb") as f:
        pickle.dump(data,f)

# Analyze posts from all MBTI types

for T in tqdm(MBTI_types):
    analyze_data(T)

  0%|          | 0/16 [00:00<?, ?it/s]

istj : {'neg': 0.08179204052140848, 'neu': 0.7249500944389327, 'pos': 0.15388113119426872, 'compound': 0.10164875627896154}


  6%|▋         | 1/16 [00:16<04:09, 16.63s/it]

isfj : {'neg': 0.07307475726763728, 'neu': 0.7092950262050157, 'pos': 0.1780807564973477, 'compound': 0.15809072866646495}


 12%|█▎        | 2/16 [00:40<04:49, 20.68s/it]

infj : {'neg': 0.07584714731456584, 'neu': 0.7097540956016621, 'pos': 0.17042750349849217, 'compound': 0.1475314534570671}


 19%|█▉        | 3/16 [01:48<09:13, 42.58s/it]

intj : {'neg': 0.07646494130004093, 'neu': 0.7273120366006491, 'pos': 0.15560567437599288, 'compound': 0.1185340401071561}


 25%|██▌       | 4/16 [02:45<09:38, 48.22s/it]

istp : {'neg': 0.08758517103854414, 'neu': 0.7145789736745929, 'pos': 0.15005359596830448, 'compound': 0.08114281022963879}


 31%|███▏      | 5/16 [03:05<06:59, 38.16s/it]

isfp : {'neg': 0.08393759602287792, 'neu': 0.7119191720353625, 'pos': 0.16474647901787084, 'compound': 0.11191127362775574}


 38%|███▊      | 6/16 [03:28<05:27, 32.78s/it]

infp : {'neg': 0.08438535575798742, 'neu': 0.7115175027654902, 'pos': 0.1629096166660798, 'compound': 0.11520142388893198}


 44%|████▍     | 7/16 [04:48<07:15, 48.43s/it]

intp : {'neg': 0.08404421389439712, 'neu': 0.7236381252164173, 'pos': 0.14983393704772277, 'compound': 0.09258191667174276}


 50%|█████     | 8/16 [05:40<06:36, 49.50s/it]

estp : {'neg': 0.08534405874034974, 'neu': 0.7220947381219299, 'pos': 0.1494163043386319, 'compound': 0.09507354855255952}


 56%|█████▋    | 9/16 [05:47<04:12, 36.00s/it]

esfp : {'neg': 0.08613173727411579, 'neu': 0.7187609236585647, 'pos': 0.15637893001749342, 'compound': 0.09654172377714527}


 62%|██████▎   | 10/16 [05:58<02:50, 28.42s/it]

enfp : {'neg': 0.08002103480025655, 'neu': 0.7196480618480484, 'pos': 0.16231035237299019, 'compound': 0.12779401145885105}


 69%|██████▉   | 11/16 [06:50<02:57, 35.56s/it]

entp : {'neg': 0.08524583691141015, 'neu': 0.7267366885500482, 'pos': 0.1436696952854968, 'compound': 0.0789864217377744}


 75%|███████▌  | 12/16 [07:29<02:26, 36.60s/it]

estj : {'neg': 0.07382632168574729, 'neu': 0.718806844746287, 'pos': 0.16687541537553724, 'compound': 0.13803746344905554}


 81%|████████▏ | 13/16 [07:35<01:22, 27.35s/it]

esfj : {'neg': 0.07768482115947141, 'neu': 0.7092284482788589, 'pos': 0.16311898802018593, 'compound': 0.12895779971990678}


 88%|████████▊ | 14/16 [07:42<00:42, 21.31s/it]

enfj : {'neg': 0.0757417265354425, 'neu': 0.7092767349336974, 'pos': 0.1715985550279135, 'compound': 0.148586578688477}


 94%|█████████▍| 15/16 [08:17<00:25, 25.53s/it]

entj : {'neg': 0.08113876758749386, 'neu': 0.721488860130292, 'pos': 0.156149050706599, 'compound': 0.10743330928888349}


100%|██████████| 16/16 [08:37<00:00, 32.32s/it]


#### Demonstration of each step

In [348]:
infp=Data_to_Analyze("infp")

In [349]:
infp.data.head(30)

Unnamed: 0,posts,type
0,"[@ForestWellsDen Thanks Forest!, @chaotic_reso...",infp
1,[@AndrewNation13 @GameStopNFT Submission was m...,infp
2,[the only acceptable minion meme https://t.co/...,infp
3,"[NO FUCKING WAY? https://t.co/qzR600M29Y, SCRE...",infp
4,"[when ur mom says u lost weight 🫣🤭😍, i am insa...",infp
5,"[I’m on @maIagenre guys, Why do people keep fo...",infp
6,[Yea the drake album took me straight to the b...,infp
7,[Threads have been made about this chick on /b...,infp
8,[one person followed me and 2 people unfollowe...,infp
9,"[who wants to be my comfie, @bitter__mochi @ho...",infp


In [350]:
infp.remove_url()

In [351]:
infp.remove_mention_and_tag()
infp.data.head(30)

Unnamed: 0,posts,type
0,"[ Thanks Forest!, Thank you., Thanks!, ...",infp
1,"[ Submission was made with the name ""The Ma...",infp
2,"[the only acceptable minion meme , , WOW UR ...",infp
3,"[NO FUCKING WAY? , SCREMSING AND CRYING , SA...",infp
4,"[when ur mom says u lost weight 🫣🤭😍, i am insa...",infp
5,"[I’m on guys, Why do people keep following t...",infp
6,[Yea the drake album took me straight to the b...,infp
7,[Threads have been made about this chick on /b...,infp
8,[one person followed me and 2 people unfollowe...,infp
9,"[who wants to be my comfie, nerd, isn’...",infp


In [352]:
infp.remove_emoji()
infp.data.loc[5,"posts"]

['I’m on   guys',
 'Why do people keep following this account ',
 '',
 '',
 '  felt this resonate deep in my SOUL',
 'i swear something shifted.  ',
 'i feel like the phrase “4th gen leaders” doesn’t even begin to describe the impact skz has',
 'OTMYDYDGSGGWBWBD',
 'ON A MINI ALBUM?????? ',
 'GET A MF REAL JOB! BE A REAL MAN!',
 'fuck any grown white man who plays minecraft over the age of 14 ngl',
 '  i’ll have your back  ',
 'cis white man complex',
 'genuinely hope this man has a horrible day and life ',
 '  OHHH everything tbh i can’t remember specifics since i already finished it last semester but i just remember it was hard',
 '  is that not the right word',
 '  now trig is where it starts to get me…',
 '',
 'i hope y’all not only see me as han’s s/o, but as a friend too',
 '    YAS SLAY (for now)',
 'ooo follow my mainpriv too i’m more active on there   ',
 'bang chan is such a swell little guy',
 '  YOU KNOW I FANCY YOU',
 'proof that time isn’t real ',
 '  uh',
 '  psychologis

In [353]:
infp.expand_contractions()
infp.data.head(30)

Unnamed: 0,posts,type
0,"[Thanks Forest!, Thank you., Thanks!, Thank yo...",infp
1,"[Submission was made with the name ""The Matchs...",infp
2,"[the only acceptable minion meme, , WOW you ar...",infp
3,"[NO FUCKING WAY?, SCREMSING AND CRYING, SAYR C...",infp
4,"[when you are mom says you lost weight, i am...",infp
5,"[I am on guys, Why do people keep followin...",infp
6,[Yea the drake album took me straight to the b...,infp
7,[Threads have been made about this chick on /b...,infp
8,[one person followed me and 2 people unfollowe...,infp
9,"[who wants to be my comfie, nerd, isn’t she a ...",infp


In [354]:
infp.tolower()
infp.data.head(30)

Unnamed: 0,posts,type
0,"[thanks forest!, thank you., thanks!, thank yo...",infp
1,"[submission was made with the name ""the matchs...",infp
2,"[the only acceptable minion meme, , wow you ar...",infp
3,"[no fucking way?, scremsing and crying, sayr c...",infp
4,"[when you are mom says you lost weight, i am...",infp
5,"[i am on guys, why do people keep followin...",infp
6,[yea the drake album took me straight to the b...,infp
7,[threads have been made about this chick on /b...,infp
8,[one person followed me and 2 people unfollowe...,infp
9,"[who wants to be my comfie, nerd, isn’t she a ...",infp


In [355]:
infp.remove_punct()
infp.data.head(30)

Unnamed: 0,posts,type
0,"[thanks forest , thank you , thanks , thank yo...",infp
1,[submission was made with the name the matchs...,infp
2,"[the only acceptable minion meme, , wow you ar...",infp
3,"[no fucking way , scremsing and crying, sayr c...",infp
4,"[when you are mom says you lost weight, i am...",infp
5,"[i am on guys, why do people keep followin...",infp
6,[yea the drake album took me straight to the b...,infp
7,[threads have been made about this chick on b...,infp
8,[one person followed me and people unfollowe...,infp
9,"[who wants to be my comfie, nerd, isn t she a ...",infp


In [356]:
infp.remove_whitespace()
infp.data.head(30)

Unnamed: 0,posts,type
0,"[thanks forest , thank you , thanks , thank yo...",infp
1,[submission was made with the name the matchs...,infp
2,"[the only acceptable minion meme, wow you are ...",infp
3,"[no fucking way , scremsing and crying, sayr c...",infp
4,"[when you are mom says you lost weight, i am...",infp
5,"[i am on guys, why do people keep followin...",infp
6,[yea the drake album took me straight to the b...,infp
7,[threads have been made about this chick on b...,infp
8,[one person followed me and people unfollowe...,infp
9,"[who wants to be my comfie, nerd, isn t she a ...",infp


In [357]:
infp.totokens()
infp.data.loc[11,"posts"]

[['as',
  'soon',
  'as',
  'the',
  'butcher',
  'and',
  'maeve',
  'scene',
  'happened',
  'i',
  'was',
  'immediately',
  'like',
  'anna',
  'is',
  'going',
  'to',
  'absolutely',
  'scream'],
 ['oh', 'my', 'god', 'i', 'loved', 'it'],
 ['the',
  'way',
  'every',
  'week',
  'i',
  'watch',
  'an',
  'episode',
  'of',
  'the',
  'boys',
  'then',
  'immediately',
  'go',
  'to',
  'your',
  'profile',
  'to',
  'see',
  'what',
  'your',
  'thoughts',
  'are'],
 ['we', 'are', 'definitely', 'not', 'used', 'to', 'it', 'at', 'all'],
 ['agreed', 'more', 'din', 'djarin', 'please'],
 ['the',
  'heat',
  'is',
  'different',
  'here',
  'i',
  'swear',
  'we',
  'don',
  't',
  'have',
  'ac'],
 ['me', 'thinking', 'am'],
 ['oh',
  'my',
  'god',
  'what',
  'time',
  'are',
  'you',
  'getting',
  'there',
  'i',
  'want',
  'to',
  'get',
  'there',
  'super',
  'early',
  'but',
  'i',
  'don',
  't',
  'know',
  'what',
  'time',
  'laughing',
  'my',
  'ass',
  'off'],
 ['are', 

In [358]:
infp.post_lemmatize()
infp.data.head(30)

Unnamed: 0,posts,type
0,"[[thanks, forest], [thank, you], [thanks], [th...",infp
1,"[[submission, be, make, with, the, name, the, ...",infp
2,"[[the, only, acceptable, minion, meme], [wow, ...",infp
3,"[[no, fucking, way], [scremsing, and, cry], [s...",infp
4,"[[when, you, be, mom, say, you, lose, weight],...",infp
5,"[[i, be, on, guy], [why, do, people, keep, fol...",infp
6,"[[yea, the, drake, album, take, me, straight, ...",infp
7,"[[thread, have, be, make, about, this, chick, ...",infp
8,"[[one, person, follow, me, and, people, unfoll...",infp
9,"[[who, want, to, be, my, comfie], [nerd], [isn...",infp


In [359]:
infp.remove_stopwords()
infp.data.head(30)

Unnamed: 0,posts,type
0,"[[thanks, forest], [thank], [thanks], [thank, ...",infp
1,"[[submission, make, name, matchstickguy], [ple...",infp
2,"[[acceptable, minion, meme], [wow, xiao, real]...",infp
3,"[[fucking, way], [scremsing, cry], [sayr, cute...",infp
4,"[[mom, say, lose, weight], [insane], [worth, h...",infp
5,"[[guy], [people, keep, follow, account], [felt...",infp
6,"[[yea, drake, album, take, straight, beach, dr...",infp
7,"[[thread, make, chick, biz, year], [incels, re...",infp
8,"[[one, person, follow, people, unfollowed, aut...",infp
9,"[[want, comfie], [nerd], [junior], [laugh, wha...",infp
