**Abstract**: This program uses Regular Expression (`re`) and Natural Language Toolkit (`nltk`) to clean raw post data and collect some features of the data. It uses object-oriented programming (OOP) strategy and creates father class `Data_to_Clean` and derived class `Data_to_Analyze` including various methods to clean and analyze data.

### Import modules and load data

In [51]:
# Import necessary modules

# Module to load raw data(CSV file)
import pandas as pd

# Modules for NLP
import re # Regular Expression
import string
from typing import List
import nltk # Natural Language Toolkit
from nltk.tokenize import word_tokenize # For text tokenization
from nltk.corpus import stopwords,wordnet # For stopwords removal
# For tokens part-of-speech tagging and lemmatization
from nltk import pos_tag 
from nltk.stem import WordNetLemmatizer
my_nltk_path="Data"
nltk.data.path.append(my_nltk_path)
import textstat # Evaluate text readability
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer # Evaluate text emotion

# Modules to read/write external files,etc.
import json
import pickle
import copy

# Average function
def ave(l):
    return sum(l)/len(l)

# MBTI type dictionary
MBTI_types = [
    "ISTJ", "ISFJ", "INFJ", "INTJ",
    "ISTP", "ISFP", "INFP", "INTP",
    "ESTP", "ESFP", "ENFP", "ENTP",
    "ESTJ", "ESFJ", "ENFJ", "ENTJ"
]

# Data loading and spliting 
raw_data=pd.read_csv("Data\\mbti_1.csv")
for i in raw_data.index:
    temp=raw_data.loc[i,"posts"]
    temp=temp.split("|||")
    raw_data.loc[i,"posts"]=temp

### Create a class to clean data

In [52]:
class Data_to_Clean:

    # Load the contraction map in class
    with open(file="contractions.json",mode='r',encoding='utf-8') as f:
        contractions_map=json.load(f)
    def __init__(self,source=raw_data):
        #self.data should be ALL THE POSTS, type:pd.Series
        self.data=source
        
    # Remove URL
    def remove_url(self):
        def process_remove_url(post):
            post_without_url=[]
            for sentence in post:
                # Use re to scan and substitute
                post_without_url.append(
                re.sub(
                    pattern=r'http\S+|www\S+|https\S+',
                    repl='',
                    string=sentence,
                    flags=re.MULTILINE
                    )
                )
            return post_without_url
        self.data["posts"]=self.data["posts"].apply(process_remove_url)
    
    # Expand contractions
    @staticmethod
    def text_expand(original_string,contraction_mapping=contractions_map):
        # Compile an re pattern
        contractions_pattern = re.compile(
            '({})'.format('|'.join(contraction_mapping.keys())),
            flags=re.IGNORECASE|re.DOTALL
            )
        # Map original string to expanded string
        def text_mapping(text_matched):
            old_text=text_matched.group(0)
            new_text=contraction_mapping.get(old_text.lower())
            if not new_text:
                new_text=contraction_mapping.get(old_text)
                if not new_text:
                    return old_text
            return new_text
        # Use re.sub() to scan and substitute
        expanded_string=contractions_pattern.sub(
            repl=lambda m:text_mapping(m),
            string=original_string
        )
        return expanded_string
    # Apply the function to dataset
    def expand_contractions(self):
        def process_expand_contractions(original_list):
            for idx in range(len(original_list)):
                original_list[idx]=Data_to_Clean.text_expand(original_list[idx])
            return original_list
        self.data["posts"]=self.data["posts"].apply(lambda x:process_expand_contractions(x))

    # Convert to lower case
    def tolower(self):
        def process_tolower(post):
            return [
                sentence.lower() for sentence in post
            ]
        self.data["posts"]=self.data["posts"].apply(process_tolower)
    
    # Remove punctuations
    def remove_punct(self):
        def process_remove_punct(post):
            post_without_punct=[]
            for sentence in post:
                post_without_punct.append(
                    re.sub(
                    pattern=r'[^a-zA-Z\s]',
                    repl=' ',
                    string=sentence
                    )
                )
            return post_without_punct
        self.data["posts"]=self.data["posts"].apply(process_remove_punct)
        
    # Remove empty string and whitespace characters
    def remove_whitespace(self):
        def process_remove_whitespace(post):
            return [
                sentence for sentence in post if sentence.strip()
            ]
        self.data["posts"]=self.data["posts"].apply(process_remove_whitespace)

    # Text tokenization
    def totokens(self):
        def process_totokens(post):
            post_totokens=[]
            for sentence in post:
                tokens=word_tokenize(sentence)
                post_totokens.append(tokens)
            return post_totokens
        self.data["posts"]=self.data["posts"].apply(process_totokens)
    
    # Remove stopwords in tokenized text
    def remove_stopwords(self):
        def process_remove_stopwords(post):
            stop_words=set(stopwords.words("english"))
            filtered_post=[]
            for sentence in post:
                filtered_sentence=[]
                for word in sentence:
                    if word not in stop_words:
                        filtered_sentence.append(word)
                filtered_post.append(filtered_sentence)
            return filtered_post
        self.data["posts"]=self.data["posts"].apply(process_remove_stopwords)

    # Lemmatization
    def post_lemmatize(self):
        def process_lemmatize(post):
            # Convert format of part-of-speech tags
            def get_wordnet_postag(old_postag):
                if old_postag.startswith('J'):  
                    return wordnet.ADJ 
                elif old_postag.startswith('V'):  
                    return wordnet.VERB
                elif old_postag.startswith('N'):  
                    return wordnet.NOUN  
                elif old_postag.startswith('R'):  
                    return wordnet.ADV  
                else:  
                    return wordnet.NOUN
            lemmatizer=WordNetLemmatizer()
            lemmatized_post=[]
            for tokens in post:
                lemmatized_tokens=[]
                # Part of speech tagging
                tagged_tokens=pos_tag(tokens)
                # Lemmatize tokens
                for word,tag in tagged_tokens:
                    lemmatized_tokens.append(lemmatizer.lemmatize(word,get_wordnet_postag(tag)))
                lemmatized_post.append(lemmatized_tokens)
            return lemmatized_post
        self.data["posts"]=self.data["posts"].apply(process_lemmatize)
        
    

### Create a derived class to analysis data

In [53]:
class Data_to_Analyze(Data_to_Clean):
    def __init__(self,type,source=raw_data):
        # First initialize an object of father class(Data_to_Clean)
        super().__init__(source)
        # self.data is of type pd.DataFrame, now specific the MBTI type
        self.data=self.data.loc[self.data["type"]==type].reset_index(drop=True)
        self.data_to_vec=None
        # Store bacic identities of the text
        self.basic_identities=pd.Series({

            "type":type,
            # Number of sentences in a post
            "sentence_quantity":[],
            "ave_sentence_quantity":None,
            # Number of words in a post
            "word_count":[],
            "ave_word_count":None,
            # Ratio of upper case characters in a post
            "upper_ratio":[],
            "ave_upper_ratio":None,
            # Two indicators of text readability: Flesch Reading Ease and Gunning Fog Index 
            "reading_ease":[],
            "ave_reading_ease":None,
            "GF_index":[],
            "ave_GF_index":None,
            # Overall text emotion indicator
            "overall_vader_score":None
        })

    # Design various methods to get identity data

    def get_sentence_quantity(self):
        for post in self.data["posts"].values:
            self.basic_identities["sentence_quantity"].append(len(post))
        self.basic_identities["ave_sentence_quantity"]=ave(self.basic_identities["sentence_quantity"])
    
    def get_word_count(self):
        for post in self.data["posts"].values:
            ans=0
            for sentence in post:
                ans+=len(sentence.split(" "))
            self.basic_identities["word_count"].append(ans)
        self.basic_identities["ave_word_count"]=ave(self.basic_identities["word_count"])
 
    def get_upper_ratio(self):
        for post in self.data["posts"].values:
            char_count=0;upper_count=0
            for sentence in post:
                for char in sentence:
                    if char.isalpha():
                        char_count+=1
                        if char.isupper():
                            upper_count+=1
            if char_count!=0:
                self.basic_identities["upper_ratio"].append(upper_count/char_count)
            else:
                continue
        self.basic_identities["ave_upper_ratio"]=ave(self.basic_identities["upper_ratio"])
    
    def get_readability(self):
        reading_ease=[];GF_idx=[]
        for post in self.data["posts"].values:
            concatenated_post=post[0]
            for idx in range(1,len(post)):
                concatenated_post+=post[idx]
            reading_ease.append(
                textstat.flesch_reading_ease(concatenated_post)
            )
            GF_idx.append(
                textstat.gunning_fog(concatenated_post)
            )
        self.basic_identities["reading_ease"]=reading_ease
        self.basic_identities["ave_reading_ease"]=ave(self.basic_identities["reading_ease"])
        self.basic_identities["GF_index"]=GF_idx
        self.basic_identities["ave_GF_index"]=ave(self.basic_identities["GF_index"])
    def get_vader_score(self):
        analyzer = SentimentIntensityAnalyzer()
        overall_vader_score={'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
        def process_vader_score(post):
            def concatenate_full_post(post):
                filtered_post=[sentence for sentence in post if not sentence.isspace()]
                return "".join(filtered_post)
            post_string=concatenate_full_post(post)
            scores=analyzer.polarity_scores(post_string)
            overall_vader_score["neg"]+=scores["neg"]
            overall_vader_score["neu"]+=scores["neu"]
            overall_vader_score["pos"]+=scores["pos"]
            overall_vader_score["compound"]+=scores["compound"]
            return scores
        self.data["vader_score"]=self.data["posts"].apply(process_vader_score)
        overall_vader_score["neg"]/=len(self.data)
        overall_vader_score["neu"]/=len(self.data)
        overall_vader_score["pos"]/=len(self.data)
        overall_vader_score["compound"]/=len(self.data)
        self.basic_identities["overall_vader_score"]=overall_vader_score


#### Test the class

#### Create a function including all the procedures of data 

In [54]:
def analyze_data(TYPE):
    data=Data_to_Analyze(type=TYPE)
    data.remove_url()
    # Some features like text readability need to be collected BEFORE the following cleaning procedures
    # Otherwise, they are NOT accurate
    data.get_vader_score()
    data.get_sentence_quantity()
    data.get_word_count()
    data.get_upper_ratio()
    data.get_readability()
    print(data.basic_identities)
    # Continue to clean the data
    data.expand_contractions()
    data.tolower()
    data.remove_punct()
    data.remove_whitespace()
    data.totokens()
    data.data_to_vec = copy.deepcopy(data.data)
    data.remove_stopwords()
    data.post_lemmatize()
    # Save cleaned data to pickle binary files so that they can be loaded easily in other programs
    with open(f"Data\\cleaned_data\\{TYPE}_cleaned.pkl","wb") as f:
        pickle.dump(data,f)

# Analyze posts from all MBTI types

for T in MBTI_types:
    analyze_data(T)

#analyze_data("INFP")

type                                                                  ISTJ
sentence_quantity        [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 47, 3...
ave_sentence_quantity                                            48.356098
word_count               [936, 1289, 1498, 1392, 1532, 1698, 1921, 1355...
ave_word_count                                                 1302.160976
upper_ratio              [0.05190708345282478, 0.04737864077669903, 0.0...
ave_upper_ratio                                                   0.053204
reading_ease             [76.52, 76.93, 77.53, 79.67, 74.9, 73.98, 76.3...
ave_reading_ease                                                 73.097854
GF_index                 [7.08, 7.67, 6.93, 6.38, 8.22, 8.45, 7.16, 7.3...
ave_GF_index                                                      7.710293
overall_vader_score      {'neg': 0.07765365853658539, 'neu': 0.77300000...
dtype: object
type                                                                  ISFJ
sentence_qu