In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv('mbti_1(C).csv')

In [3]:
df['posts'] = df['posts'].apply(lambda x: re.sub(r'http\S+', 'URL', x))

In [4]:
def remove_url(text):
    return text.replace('URL', '')

df['posts'] = df['posts'].apply(remove_url)

In [5]:
print(df.head(6))

   type                                              posts
0  INFJ  ' and intj moments    sportscenter not top ten...
1  ENTP  'I'm finding the lack of me in these posts ver...
2  INTP  'Good one  _____    course, to which I say I k...
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...
4  ENTJ  'You're fired.|||That's another silly misconce...
5  INTJ  '18/37 @.@|||Science  is not perfect. No scien...


In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df['posts'] = df['posts'].replace(r'\|\|\|', ' ', regex=True)

In [8]:
df.replace(r'[^a-zA-Z\s]', '', regex=True, inplace=True)

In [9]:
print(df.head(6))

   type                                              posts
0  INFJ   and intj moments    sportscenter not top ten ...
1  ENTP  Im finding the lack of me in these posts very ...
2  INTP  Good one      course to which I say I know tha...
3  INTJ  Dear INTP   I enjoyed our conversation the oth...
4  ENTJ  Youre fired Thats another silly misconception ...
5  INTJ    Science  is not perfect No scientist claims ...


In [10]:
mbti_regex = re.compile(r'\b(?:ENFJ|ENFP|ENTJ|ENTP|ESFJ|ESFP|ESTJ|ESTP|INFJ|INFP|INTJ|INTP|ISFJ|ISFP|ISTJ|ISTP|enfj|enfp|entj|entp|esfj|esfp|estj|estp|infj|infp|intj|intp|isfj|isfp|istj|istp)\b')

df['posts'] = df['posts'].apply(lambda x: re.sub(mbti_regex, '', x))


In [11]:
import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


In [12]:
def remove_stopwords(text):
    text = ' '.join(word for word in text.split() if word.lower() not in stop_words)
    return text

df['posts'] = df['posts'].apply(remove_stopwords)

In [13]:
print(df.head())

   type                                              posts
0  INFJ  moments sportscenter top ten plays pranks life...
1  ENTP  Im finding lack posts alarming Sex boring posi...
2  INTP  Good one course say know thats blessing curse ...
3  INTJ  Dear enjoyed conversation day Esoteric gabbing...
4  ENTJ  Youre fired Thats another silly misconception ...


In [14]:
df['posts'] = df['posts'].str.lower()

In [15]:
print(df.head())

   type                                              posts
0  INFJ  moments sportscenter top ten plays pranks life...
1  ENTP  im finding lack posts alarming sex boring posi...
2  INTP  good one course say know thats blessing curse ...
3  INTJ  dear enjoyed conversation day esoteric gabbing...
4  ENTJ  youre fired thats another silly misconception ...


In [18]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  


def lemmatize_text(text):
    
    words = nltk.word_tokenize(text)
    
    tagged_words = nltk.pos_tag(words)
    
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged_words]
    
    return ' '.join(lemmatized_words)


df['posts'] = df['posts'].apply(lemmatize_text)


In [19]:
print(df.head())

   type                                              posts
0  INFJ  moment sportscenter top ten play prank lifecha...
1  ENTP  im find lack post alarm sex boring position of...
2  INTP  good one course say know thats bless curse abs...
3  INTJ  dear enjoyed conversation day esoteric gabbing...
4  ENTJ  youre fire thats another silly misconception a...


In [20]:
df.to_csv('Clear_mbti.csv', index=False)