In [1]:
import os
import re
import string
import numpy as np
import pandas as pd
import nltk

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
training = pd.read_excel("dataset/mbti_training_dataset.xlsx")

###TASK 1: Dataset Inspection###
#inspecting the datasets
def inspect_dataset(df,name="Dataset"):
  print(f"/n====={name} INSPECTION=====")

#looking through shape and info about the datasets
  print(f"Shape: {df.shape}")
  print(f"Columns: {df.columns}")
  print("Info:")
  print(df.info())

  print("\nHead:")
  print(df.head())
#missing values
  print("\nMissing Values per Column:")
  print(df.isnull().sum())

#summary statistics
  print("\nSummary Statistics:(numeric columns):")
  print(df.describe())

  #for checking outliers
  print("\nOutlier Check (based on IQR, numeric columns):")
  numeric_cols = df.select_dtypes(include=[np.number]).columns
  for col in numeric_cols:
      q1 = df[col].quantile(0.25)
      q3 = df[col].quantile(0.75)
      iqr = q3 - q1

      outliers = df[(df[col] < (q1 - 1.5 * iqr)) | (df[col] > (q3 + 1.5 * iqr))]
      print(f"{col}: {len(outliers)} potential outliers")

  print("="*50)

# Inspecting all the datasets
inspect_dataset(training, "MBTI Training Dataset")



/n=====MBTI Training Dataset INSPECTION=====
Shape: (18861, 2)
Columns: Index(['type', 'posts'], dtype='object')
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18861 entries, 0 to 18860
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    18861 non-null  object
 1   posts   18861 non-null  object
dtypes: object(2)
memory usage: 294.8+ KB
None

Head:
        type                                              posts
0  <UNKNOWN>  BBC|||I only have a social software like Weibo...
1  <UNKNOWN>  # Yang Yang is the first love #|||This is fun|...
2  <UNKNOWN>  I've just taken a video of the video|||those w...
3  <UNKNOWN>  I've just given a flower to Zhong Han-Lian, an...
4  <UNKNOWN>  Zenium|||If you don't, you don't have long hai...

Missing Values per Column:
type     0
posts    0
dtype: int64

Summary Statistics:(numeric columns):
             type  posts
count       18861  18861
unique          1  16728
top     <UNKNO

In [3]:
### TASK 2: Clean data, remove stopwords and tokenize text ###
# Prepare stop words (combine sklearn's stop words with NLTK's if available)

stop_words = set(ENGLISH_STOP_WORDS)
try:
    from nltk.corpus import stopwords as nltk_stop
    stop_words = stop_words.union(set(nltk_stop.words('english')))
except Exception:
    # NLTK stopwords may not be downloaded in this environment; it's ok to proceed
    pass

def clean_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [w for w in tokens if w not in stop_words]
    return ' '.join(tokens)

training['cleaned_text'] = training.get('posts', '') .apply(clean_text) if 'posts' in training.columns else ''

# TF-IDF vectorizer (only run if cleaned text present)
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=5)
if 'cleaned_text' in training.columns and training['cleaned_text'].astype(bool).any():
    X = vectorizer.fit_transform(training['cleaned_text'])
    y = training['type'] if 'type' in training.columns else None
else:
    X = None
    y = None

training.head()

Unnamed: 0,type,posts,cleaned_text
0,<UNKNOWN>,BBC|||I only have a social software like Weibo...,bbci social software like weibo ive recently i...
1,<UNKNOWN>,# Yang Yang is the first love #|||This is fun|...,yang yang love funi shared whats aboutpoison i...
2,<UNKNOWN>,I've just taken a video of the video|||those w...,ive just taken video videothose rebut microblo...
3,<UNKNOWN>,"I've just given a flower to Zhong Han-Lian, an...",ive just given flower zhong hanlian ive earned...
4,<UNKNOWN>,"Zenium|||If you don't, you don't have long hai...",zeniumif dont dont long hair right humming whi...
