In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Machine Learning models
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import LinearSVC
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)
# Evaluation
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install dependencies (run once)
# pip install kagglehub[pandas-datasets]

import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file inside the dataset, and the dataset it belongs to.
file_path = "mbti_1.csv"
dataset_handle = "datasnaek/mbti-type"

# Load that file through kagglehub's pandas adapter.
df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, dataset_handle, file_path)

In [None]:
# Preview the first ten rows of the freshly loaded frame.
df.head(n=10)

In [None]:
# Count how many users fall under each personality type.
df["type"].value_counts()

In [None]:
# Bar chart of the number of users per personality type.
plt.figure(figsize=(20,10))
# seaborn >= 0.12 accepts only `data` positionally, so a bare Series would be
# taken as `data` instead of the variable to count. Name the column via `x=`.
sns.countplot(data=df, x="type")
# The x axis carries the type names and the y axis the counts; label them so.
plt.xlabel('Personality type')
plt.ylabel('Number of users');

In [None]:
# Clean the post text in a single chained pass. The URL pattern runs before
# the punctuation pattern, so "http..." tokens are still intact when matched.
df['posts'] = (
    df['posts']
    # 1. Turn every '|||' into a single space.
    .str.replace(r'\|\|\|', ' ', regex=True)
    # 2. Drop anything starting with "http" up to the next whitespace.
    .str.replace(r"http\S+", "", regex=True)
    # 3. Strip the listed punctuation characters and all digits.
    .str.replace(r"[-/@.?!_,:;()|0-9]", "", regex=True)
    # 4. Collapse whitespace runs to one space and trim both ends.
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

In [None]:
# Inspect the first thirty rows after text cleaning.
df.head(n=30)

In [None]:
# Collect the distinct personality types that occur in the `type` column.
labels = df["type"].unique()
labels

In [None]:
# Map each personality type to an integer id (its position in `labels`).
# The loop variable must not be called `labels`: doing so overwrote the global
# array with the last type string, so re-running this cell enumerated the
# characters of that string and produced a wrong mapping.
labels2 = list(labels)  # index -> personality type, in the same order
label_rep = {mbti_type: index for index, mbti_type in enumerate(labels2)}
label_rep

In [None]:
# Add the integer class id for each row's personality type.
# `Series.map` performs a plain dictionary lookup and yields an integer column
# directly. `Series.replace` with a str -> int mapping relies on implicit
# downcasting of the object column, which recent pandas versions deprecate.
df['label'] = df['type'].map(label_rep)
df.head(10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, capped at the 5000 highest-ranked terms and
# ignoring terms that occur in fewer than 5 documents.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5
)

# Learn the vocabulary and IDF weights from the training rows only. Fitting on
# every row lets statistics from the held-out rows leak into the features.
# The split arguments mirror the train/test split applied to X and y in the
# following cell (test_size=0.2, random_state=42, stratified on the label). The
# partition depends only on the number of rows, those arguments and the
# stratify labels, so the same rows end up in the training fold there.
train_rows, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
tfidf.fit(df['posts'].iloc[train_rows])

# Transform every row so X keeps one row per user, aligned with y.
X = tfidf.transform(df['posts'])
y = df['label']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Linear SVM on the TF-IDF features with class-balanced weights.
# `random_state` is fixed so repeated runs are reproducible, matching the
# LinearSVC cells later in the notebook that already pass random_state=42.
svm = LinearSVC(class_weight='balanced', random_state=42)
svm.fit(X_train, y_train)

# Predicted class ids for the held-out rows.
y_pred_svm = svm.predict(X_test)

In [None]:
# Headline scores for the linear SVM. The macro-averaged variants are used
# because the MBTI classes are imbalanced.
f1_macro = f1_score(y_test, y_pred_svm, average="macro")
recall_macro = recall_score(y_test, y_pred_svm, average="macro")
precision_macro = precision_score(y_test, y_pred_svm, average="macro")
acc = accuracy_score(y_test, y_pred_svm)

svm_summary_lines = (
    "SVM Performance",
    "------------------------------",
    f"Accuracy        : {acc:.4f}",
    f"Macro Precision : {precision_macro:.4f}",
    f"Macro Recall    : {recall_macro:.4f}",
    f"Macro F1-score  : {f1_macro:.4f}",
)
for summary_line in svm_summary_lines:
    print(summary_line)

# Per-class breakdown.
print("\nSVM Classification Report", "------------------------------", sep="\n")
print(classification_report(y_test, y_pred_svm))

In [None]:
# Confusion matrix of the SVM predictions, drawn as an annotated heatmap.
cm_svm = confusion_matrix(y_test, y_pred_svm)

plt.figure(figsize=(10, 8))
svm_heatmap_style = dict(annot=True, fmt="d", cmap="Blues", cbar=False)
sns.heatmap(cm_svm, **svm_heatmap_style)

plt.title("Confusion Matrix — SVM")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.tight_layout()
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features with class-balanced weights.
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# RandomForestClassifier accepts sparse matrices directly, so the TF-IDF
# matrices are passed as-is. Converting them with .toarray() only allocated
# large dense copies of mostly-zero data.
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [None]:
# Accuracy plus macro-averaged precision / recall / F1 for the random forest.
f1_rf = f1_score(y_test, y_pred_rf, average="macro")
recall_rf = recall_score(y_test, y_pred_rf, average="macro")
precision_rf = precision_score(y_test, y_pred_rf, average="macro")
acc_rf = accuracy_score(y_test, y_pred_rf)

rf_summary_lines = (
    "Random Forest Performance",
    "------------------------------",
    f"Accuracy        : {acc_rf:.4f}",
    f"Macro Precision : {precision_rf:.4f}",
    f"Macro Recall    : {recall_rf:.4f}",
    f"Macro F1-score  : {f1_rf:.4f}",
)
for summary_line in rf_summary_lines:
    print(summary_line)

# Per-class breakdown.
print("\nRandom Forest Classification Report", "------------------------------", sep="\n")
print(classification_report(y_test, y_pred_rf))

In [None]:
# Confusion matrix of the random-forest predictions as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred_rf)

plt.figure(figsize=(10, 8))
rf_heatmap_style = dict(annot=True, fmt="d", cmap="Blues", cbar=False)
sns.heatmap(cm, **rf_heatmap_style)

plt.title("Confusion Matrix — Random Forest")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.tight_layout()
plt.show()

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Per-row weights from the training labels using the "balanced" scheme.
sample_weight = compute_sample_weight(
    class_weight="balanced",
    y=y_train
)

# Derive the class count from the training labels instead of hard-coding 16,
# in line with the later XGBoost cells of this notebook.
num_classes = len(np.unique(y_train))

# Gradient-boosted trees on the TF-IDF features.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=num_classes,
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=42,
)

xgb_model.fit(
    X_train,
    y_train,
    sample_weight=sample_weight
)

# Predicted class ids for the held-out rows.
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# Accuracy plus macro-averaged precision / recall / F1 for XGBoost.
f1_xgb = f1_score(y_test, y_pred_xgb, average="macro")
recall_xgb = recall_score(y_test, y_pred_xgb, average="macro")
precision_xgb = precision_score(y_test, y_pred_xgb, average="macro")
acc_xgb = accuracy_score(y_test, y_pred_xgb)

xgb_summary_lines = (
    "XGBoost Performance",
    "------------------------------",
    f"Accuracy        : {acc_xgb:.4f}",
    f"Macro Precision : {precision_xgb:.4f}",
    f"Macro Recall    : {recall_xgb:.4f}",
    f"Macro F1-score  : {f1_xgb:.4f}",
)
for summary_line in xgb_summary_lines:
    print(summary_line)

# Per-class breakdown.
print("\nXGBoost Classification Report", "------------------------------", sep="\n")
print(classification_report(y_test, y_pred_xgb))

In [None]:
# Confusion matrix of the XGBoost predictions as an annotated heatmap.
cm_xgb = confusion_matrix(y_test, y_pred_xgb)

plt.figure(figsize=(10, 8))
xgb_heatmap_style = dict(annot=True, fmt="d", cmap="Blues", cbar=False)
sns.heatmap(cm_xgb, **xgb_heatmap_style)

plt.title("Confusion Matrix — XGBoost")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.tight_layout()
plt.show()

In [None]:
import lightgbm as lgb

# LightGBM on the TF-IDF features with class-balanced weights.
# The class count comes from the training labels rather than a literal 16, and
# the seed is fixed, matching the later LightGBM cells in this notebook.
lgb_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(np.unique(y_train)),
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)

lgb_model.fit(X_train, y_train)
# Predicted class ids for the held-out rows.
y_pred_lgb = lgb_model.predict(X_test)

In [None]:
# Accuracy plus macro-averaged precision / recall / F1 for LightGBM.
f1_lgb = f1_score(y_test, y_pred_lgb, average="macro")
recall_lgb = recall_score(y_test, y_pred_lgb, average="macro")
precision_lgb = precision_score(y_test, y_pred_lgb, average="macro")
acc_lgb = accuracy_score(y_test, y_pred_lgb)

lgb_summary_lines = (
    "LightGBM Performance",
    "------------------------------",
    f"Accuracy        : {acc_lgb:.4f}",
    f"Macro Precision : {precision_lgb:.4f}",
    f"Macro Recall    : {recall_lgb:.4f}",
    f"Macro F1-score  : {f1_lgb:.4f}",
)
for summary_line in lgb_summary_lines:
    print(summary_line)

# Per-class breakdown.
print("\nLightGBM Classification Report", "------------------------------", sep="\n")
print(classification_report(y_test, y_pred_lgb))

In [None]:
# Confusion matrix of the LightGBM predictions as an annotated heatmap.
cm_lgb = confusion_matrix(y_test, y_pred_lgb)

plt.figure(figsize=(10, 8))
lgb_heatmap_style = dict(annot=True, fmt="d", cmap="Blues", cbar=False)
sns.heatmap(cm_lgb, **lgb_heatmap_style)

plt.title("Confusion Matrix — LightGBM")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.tight_layout()
plt.show()

In [None]:
!pip install sentence-transformers scikit-learn xgboost lightgbm
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

In [None]:
!pip install transformers torch scikit-learn pandas
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Load the tokenizer and encoder weights from the same pretrained checkpoint.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Put the model in evaluation (inference) mode; being the cell's last
# expression, this call's return value is also what the cell displays.
model.eval()

In [None]:
def bert_base_encode(texts, tokenizer, model, max_length=512):
    """Encode each text as the first-token ([CLS]) vector of the last hidden layer.

    Args:
        texts: Iterable of strings. Each text is tokenized and run through the
            model on its own (batch of one).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            truncation=True, max_length=max_length)`` that returns a mapping of
            tensors accepted by ``model(**inputs)``.
        model: Callable whose result exposes ``last_hidden_state`` with the
            batch on axis 0 and token positions on axis 1.
        max_length: Maximum number of tokens kept per text; longer inputs are
            truncated by the tokenizer.

    Returns:
        A 2-D NumPy array with one row per input text. For empty ``texts`` an
        array with zero rows is returned; its width is
        ``model.config.hidden_size`` when the model exposes it, otherwise 0.
    """
    # If the model reports a device, inputs are moved there so the forward
    # pass works for models that are not on the CPU.
    device = getattr(model, "device", None)
    embeddings = []

    with torch.no_grad():
        for text in texts:
            # One text per forward pass needs no padding. Padding each input
            # to max_length only adds masked positions for the model to process.
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length
            )
            if device is not None:
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

            outputs = model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :]  # (1, hidden)
            # .cpu() is a no-op on CPU tensors and required before .numpy()
            # for tensors living on another device.
            embeddings.append(cls_embedding.squeeze(0).cpu().numpy())

    if not embeddings:
        # np.vstack raises on an empty list, so return an empty matrix instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    return np.vstack(embeddings)
# Integer class ids as a NumPy array.
y = df['label'].values

# One BERT embedding row per user's post text.
X = bert_base_encode(df['posts'].tolist(), tokenizer, model)

print(X.shape, y.shape)

In [None]:
# Tabular view of the embeddings: one "bert_<i>" column per embedding
# dimension, followed by the label column.
embedding_cols = [f"bert_{i}" for i in range(X.shape[1])]
df_bert = pd.DataFrame(X, columns=embedding_cols).assign(label=y)

df_bert.head()

In [None]:
from sklearn.model_selection import train_test_split

# 80% train / 20% test with a fixed seed; stratifying on y preserves the
# class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Report the shape of each part of the split.
for split_name, split_array in (
    ("X_train:", X_train),
    ("X_test :", X_test),
    ("y_train:", y_train),
    ("y_test :", y_test),
):
    print(split_name, split_array.shape)


In [None]:
# Class-balanced random forest on the current embedding features.
rf_settings = dict(
    class_weight="balanced",
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_settings).fit(X_train, y_train)
y_pred = rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)
print("RF (balanced) Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred, digits=4))

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

# Number of distinct classes present in the training labels.
num_classes = len(np.unique(y_train))

# Class-balanced multiclass LightGBM on the current embedding features.
lgb_settings = dict(
    objective="multiclass",
    num_class=num_classes,
    class_weight="balanced",
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=63,
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)
lgb_model = lgb.LGBMClassifier(**lgb_settings).fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)

lgb_accuracy = accuracy_score(y_test, y_pred_lgb)
print("LightGBM Accuracy:", lgb_accuracy)
print(classification_report(y_test, y_pred_lgb, digits=4))

In [None]:
# =========================
# 5. XGBoost (with sample weights)
# =========================
# Balanced per-row weights and the class count, both from the training labels.
sample_weight = compute_sample_weight(class_weight="balanced", y=y_train)
num_classes = len(np.unique(y_train))

xgb_settings = dict(
    objective="multi:softprob",
    num_class=num_classes,
    eval_metric="mlogloss",
    n_estimators=600,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
xgb_model = xgb.XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train, sample_weight=sample_weight)

# Predicted class = column with the highest predicted probability.
xgb_probabilities = xgb_model.predict_proba(X_test)
y_pred_xgb = np.argmax(xgb_probabilities, axis=1)

print("\n===== XGBoost =====")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb, digits=4))

In [None]:
# Class-balanced linear SVM on the current embedding features, with a fixed
# seed and a raised iteration cap.
svm_settings = dict(class_weight="balanced", max_iter=10000, random_state=42)
svm_model = LinearSVC(**svm_settings).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", svm_accuracy)
print(classification_report(y_test, y_pred_svm, digits=4))

In [None]:
# Sentence-BERT encoder.
bert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every user's post text into a vector, 32 texts per batch, showing
# a progress bar while encoding.
post_texts = df['posts'].tolist()
X = bert_model.encode(post_texts, batch_size=32, show_progress_bar=True)

# Integer class ids as a NumPy array.
y = df['label'].values

print(X.shape, y.shape)


In [None]:
# Stratified 80/20 train/test split of the current features, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [None]:
# Class-balanced random forest on the current embedding features.
rf = RandomForestClassifier(
    class_weight="balanced", n_estimators=500, random_state=42, n_jobs=-1
).fit(X_train, y_train)
y_pred = rf.predict(X_test)

forest_accuracy = accuracy_score(y_test, y_pred)
print("RF (balanced) Accuracy:", forest_accuracy)
print(classification_report(y_test, y_pred, digits=4))

In [None]:
import xgboost as xgb

# Number of distinct classes present in the training labels.
num_classes = len(np.unique(y_train))

# Compute the balanced per-row weights from this split's training labels.
# The cell previously used a `sample_weight` variable left over from an
# earlier XGBoost cell, so it only ran if that cell had been executed first
# and was only correct if that cell's training labels lined up row-for-row.
sample_weight = compute_sample_weight(
    class_weight="balanced",
    y=y_train
)

xgb_model = xgb.XGBClassifier(
    n_estimators=600,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    num_class=num_classes,
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(
    X_train,
    y_train,
    sample_weight=sample_weight
)

# Predicted class = column with the highest predicted probability.
y_pred_xgb = np.argmax(
    xgb_model.predict_proba(X_test),
    axis=1
)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb, digits=4))

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

# Number of distinct classes present in the training labels.
num_classes = len(np.unique(y_train))

# Class-balanced multiclass LightGBM on the current embedding features.
booster_settings = dict(
    objective="multiclass",
    num_class=num_classes,
    class_weight="balanced",
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=63,
    random_state=42,
    n_jobs=-1,
)
lgb_model = lgb.LGBMClassifier(**booster_settings).fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)

booster_accuracy = accuracy_score(y_test, y_pred_lgb)
print("LightGBM Accuracy:", booster_accuracy)
print(classification_report(y_test, y_pred_lgb, digits=4))

In [None]:
# Class-balanced linear SVM on the current embedding features, with a fixed
# seed and a raised iteration cap.
svm_model = LinearSVC(
    max_iter=10000, random_state=42, class_weight="balanced"
).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

linear_svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", linear_svm_accuracy)
print(classification_report(y_test, y_pred_svm, digits=4))