In [153]:
import re
from string import punctuation

import neattext as nt
import neattext.functions as nfx
# import nltk
import nltk
import numpy as np
import pandas as pd
# I guess i need to plot for presentation in case.
import plotly.express as px
from neattext.explainer import emoji_explainer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [154]:
# MBTI dataset downloaded from Kaggle: https://www.kaggle.com/datasets/datasnaek/mbti-type
# Forward slashes are accepted by Windows as well as POSIX systems, so the
# notebook no longer depends on Windows-only backslash separators.
df = pd.read_csv('../data/data/mbti_1.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,'😂😂😂😂 http://www.youtube.com/watch?v=qsXHcwe3k...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [155]:
# Drop every row that has a missing value, then renumber the rows 0..n-1 so
# that later index-based lookups and joins stay aligned even when rows were
# actually removed here.
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [156]:
# Sanity check: four binary MBTI axes give 2**4 = 16 possible types, so the
# dataset should contain exactly 16 distinct labels.
types = df['type'].unique()
print(types, len(types), sep='\n')

['INFJ' 'ENTP' 'INTP' 'INTJ' 'ENTJ' 'ENFJ' 'INFP' 'ENFP' 'ISFP' 'ISTP'
 'ISFJ' 'ISTJ' 'ESTP' 'ESFP' 'ESTJ' 'ESFJ']
16


In [157]:
# Number of rows per MBTI type, printed as a table and reused for the chart.
type_counts = df['type'].value_counts()
print(type_counts)
# Bar chart of the same counts; the figure object is kept in `fig`.
fig = px.bar(type_counts, title='Distribution of MBTI types in the dataset')

type
INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: count, dtype: int64


In [158]:
# Render the MBTI type distribution bar chart held in `fig`.
fig

INFP is the most common type in the dataset, and the classes are heavily imbalanced (1,832 INFP rows versus 39 ESTJ rows). Next, clean the posts.

In [159]:
# English stop-word list used to filter tokens. The corpus reader is reached
# through the nltk package because this assignment rebinds the imported name
# `stopwords` to a plain list; the previous `stopwords.words(...)` form raised
# AttributeError as soon as the cell was executed a second time.
stopwords = nltk.corpus.stopwords.words('english')
# Shared WordNet lemmatizer instance.
lemmatizer = nltk.WordNetLemmatizer()

In [160]:
# Cache for the emoji lookup used by cleantext(); filled on first use.
_EMOJI_CHARS = None


def _emoji_chars():
    """Return the cached set of keys that cleantext() treats as emoji."""
    global _EMOJI_CHARS
    if _EMOJI_CHARS is None:
        # Same membership set as the keys of the inverted neattext table that
        # was previously rebuilt on every single call.
        _EMOJI_CHARS = frozenset(nt.explainer.__EMOJI_TO_NAME_DICT.values())
    return _EMOJI_CHARS


def cleantext(text):
    """Normalise the raw posts of one row into a lowercase, noise-free string.

    Steps, in order: strip links, digits and the "|||" entry separator, replace
    each recognised emoji with its textual description, lowercase the text and
    finally let neattext remove punctuation, special characters, stop words and
    any remaining URLs, e-mails, numbers and emojis.

    Args:
        text: Raw posts as one string, individual entries separated by "|||".

    Returns:
        The cleaned text as a single string.
    """
    # Remove links first; a link ends at the next space or pipe character.
    post = re.sub(r'''(https?:\/\/[^| ]+|www\.[^| ]+)''', ' ', text)
    # Remove all the numbers too since they are not useful here.
    post = re.sub(r'''[0-9]+''', ' ', post)
    # The Kaggle data source says each entry is separated by "|||" (3 pipe
    # characters), so the separator is removed and the whole text is processed
    # as one chunk. Processing each entry separately may be worth considering later.
    post = re.sub(r'\|\|\|', ' ', post)

    # Replace every recognised emoji with its description. str.replace already
    # substitutes all occurrences, so each distinct character is visited once,
    # in order of first appearance.
    known_emoji = _emoji_chars()
    for char in dict.fromkeys(post):
        if char in known_emoji:
            post = post.replace(char, emoji_explainer(char) + " ")

    # Lowercase the text
    post = post.lower()

    # Whatever is left is handed to neattext for the final clean-up.
    post = nfx.clean_text(post, puncts=True, special_char=True, stopwords=True, urls=True, emails=True, numbers=True, emojis=True)

    return post
    
def tokenize_and_lemmatize(text):
    """Tokenize `text`, skip stop words and lemmatize the remaining tokens.

    Args:
        text: String to split into word tokens.

    Returns:
        List of lemmatized tokens in their original order, excluding every
        token found in the module-level `stopwords` collection.
    """
    lemmas = []
    for token in word_tokenize(text):
        if token in stopwords:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [161]:
# Smoke test of the text pipeline on the first row: raw text, cleaned text,
# then the tokens produced from the *cleaned* text, mirroring the order in
# which the two steps are applied to the whole dataframe.
# .iloc[0] selects the first row by position regardless of the index labels.
sample_post = df['posts'].iloc[0]
sample_clean = cleantext(sample_post)
print(sample_post)
print(sample_clean)
print(tokenize_and_lemmatize(sample_clean))

'😂😂😂😂 http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg

In [162]:
# Clean every row's posts, then tokenize and lemmatize the cleaned text.
df['clean_posts'] = df['posts'].map(cleantext)
df['tokens'] = df['clean_posts'].map(tokenize_and_lemmatize)

In [163]:
# Re-join each token list into one space-separated string per row.
df_tokens = df['tokens'].map(' '.join)
df_tokens.head()

0    face tear joy face tear joy face tear joy face...
1    im finding lack post alarming sex boring posit...
2    good course know thats blessing curse absolute...
3    dear intp enjoyed conversation day esoteric ga...
4    youre fired thats silly misconception approach...
Name: tokens, dtype: object

In [168]:
# Sentiment analysis for the dataset using VADER.
# The VADER lexicon must be available; fetch it once with
# nltk.download('vader_lexicon') if it is missing.
# NOTE(review): scores are computed on the stop-word-free, lemmatized tokens;
# confirm this is intended rather than scoring the original post text.
sia = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Return the VADER polarity scores computed by `sia` for `text`."""
    return sia.polarity_scores(text)


# Build the score columns on the same index as the rows they were computed
# from, so the join cannot pair scores with the wrong rows when the index is
# not a plain 0..n-1 range.
sentiment_scores = pd.DataFrame(
    df_tokens.map(get_sentiment).tolist(),
    index=df_tokens.index,
)
# Dropping score columns left over from an earlier run keeps this cell
# re-runnable; a plain join raises when column names overlap.
df = df.drop(columns=sentiment_scores.columns, errors='ignore').join(sentiment_scores)

In [169]:
# Convert the MBTI types to binary indicator columns, one per axis
# (1 = first letter of the pair, 0 = second letter):
#   is_E: Extrovert / Introvert
#   is_S: Sensing / Intuition
#   is_T: Thinking / Feeling
#   is_J: Judging / Perceiving
# The vectorised .str accessor compares the whole column at once instead of
# calling a Python lambda for every row; int64 matches the previous dtype.
df['is_E'] = df['type'].str[0].eq('E').astype('int64')
df['is_S'] = df['type'].str[1].eq('S').astype('int64')
df['is_T'] = df['type'].str[2].eq('T').astype('int64')
df['is_J'] = df['type'].str[3].eq('J').astype('int64')


In [171]:
# Preview the first rows of the dataframe with the derived columns.
df.head()

Unnamed: 0,type,posts,clean_posts,tokens,is_E,is_S,is_T,is_J,neg,neu,pos,compound
0,INFJ,'😂😂😂😂 http://www.youtube.com/watch?v=qsXHcwe3k...,face tears joy face tears joy face tears joy f...,"[face, tear, joy, face, tear, joy, face, tear,...",0,0,0,1,0.118,0.573,0.309,0.9967
1,ENTP,'I'm finding the lack of me in these posts ver...,im finding lack posts alarming sex boring posi...,"[im, finding, lack, post, alarming, sex, borin...",1,0,1,0,0.145,0.575,0.28,0.9979
2,INTP,'Good one _____ https://www.youtube.com/wat...,good course know thats blessing curse absolute...,"[good, course, know, thats, blessing, curse, a...",0,0,1,0,0.129,0.55,0.321,0.9983
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear intp enjoyed conversation day esoteric ga...,"[dear, intp, enjoyed, conversation, day, esote...",0,0,1,1,0.096,0.636,0.268,0.9984
4,ENTJ,'You're fired.|||That's another silly misconce...,youre fired thats silly misconception approach...,"[youre, fired, thats, silly, misconception, ap...",1,0,1,1,0.206,0.529,0.265,0.9714


In [179]:
# Plot how many rows fall on each side of the four binary MBTI axes.
melted_df = pd.melt(df[['is_E', 'is_S', 'is_T', 'is_J']])
value_counts_df = melted_df.value_counts().reset_index()
value_counts_df.columns = ['Variable', 'Value', 'Count']  # Rename columns appropriately
# Plotly Express maps a numeric colour column to a continuous colour scale,
# which produces a single trace that barmode='group' cannot split. Casting the
# 0/1 flag to str makes it categorical, so each value gets its own bar and the
# bars are drawn side by side.
value_counts_df['Value'] = value_counts_df['Value'].astype(str)
fig = px.bar(value_counts_df, x='Variable', y='Count', color='Value', barmode='group',
             title='Distribution of MBTI types in the dataset')
fig.show()