In [1]:
!pip install pandas scikit-learn nltk



In [2]:
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [3]:
from google.colab import files

uploaded = files.upload()

Saving mbti_1.csv to mbti_1.csv


In [4]:
df = pd.read_csv("mbti_1.csv")
df.head()


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [5]:
print("Shape:", df.shape)
print("\nColumns:", df.columns)
print("\nTop 10 MBTI type counts:")
print(df["type"].value_counts().head(10))


Shape: (8675, 2)

Columns: Index(['type', 'posts'], dtype='object')

Top 10 MBTI type counts:
type
INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
Name: count, dtype: int64


In [6]:
def clean_text(text):
    text = text.lower()
    # remove mbti type strings so model doesn't cheat
    text = re.sub(r"infj|intj|intp|entp|entj|estj|esfj|enfj|enfp|istp|isfp|istj|isfj|estp|esfp", "", text)
    # remove URLs
    text = re.sub(r"http\S+|www\.\S+", "", text)
    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)
    # collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

# apply cleaning
df["clean_posts"] = df["posts"].apply(clean_text)

# create label for Extraversion vs Introversion
df["EI"] = df["type"].str[0]  # 'E' or 'I'

df[["type", "EI", "clean_posts"]].head()


Unnamed: 0,type,EI,clean_posts
0,INFJ,I,and moments sportscenter not top ten plays pra...
1,ENTP,E,i m finding the lack of me in these posts very...
2,INTP,I,good one course to which i say i know that s m...
3,INTJ,I,dear i enjoyed our conversation the other day ...
4,ENTJ,E,you re fired that s another silly misconceptio...


In [7]:
X_text = df["clean_posts"]
y = df["EI"]

# baseline TF-IDF: 5000 features
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english"
)

X = vectorizer.fit_transform(X_text)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape


((6940, 5000), (1735, 5000))

In [8]:
model = LogisticRegression(max_iter=2000)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Baseline Accuracy (5000 features):", accuracy_score(y_test, y_pred))
print("\nClassification Report (5000 features):")
print(classification_report(y_test, y_pred))


Baseline Accuracy (5000 features): 0.7896253602305475

Classification Report (5000 features):
              precision    recall  f1-score   support

           E       0.76      0.13      0.22       400
           I       0.79      0.99      0.88      1335

    accuracy                           0.79      1735
   macro avg       0.78      0.56      0.55      1735
weighted avg       0.78      0.79      0.73      1735



In [9]:
# TF-IDF with more features (10,000)
vectorizer2 = TfidfVectorizer(
    max_features=10000,
    stop_words="english"
)

X2 = vectorizer2.fit_transform(X_text)

X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

model2 = LogisticRegression(max_iter=2000)
model2.fit(X2_train, y2_train)

y2_pred = model2.predict(X2_test)

print("Improved Accuracy (10,000 features):", accuracy_score(y2_test, y2_pred))
print("\nClassification Report (10,000 features):")
print(classification_report(y2_test, y2_pred))


Improved Accuracy (10,000 features): 0.7878962536023055

Classification Report (10,000 features):
              precision    recall  f1-score   support

           E       0.81      0.10      0.19       400
           I       0.79      0.99      0.88      1335

    accuracy                           0.79      1735
   macro avg       0.80      0.55      0.53      1735
weighted avg       0.79      0.79      0.72      1735



In [10]:
model = LogisticRegression(max_iter=2000)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Baseline Accuracy (5000 features):", accuracy_score(y_test, y_pred))
print("\nClassification Report (5000 features):")
print(classification_report(y_test, y_pred))


Baseline Accuracy (5000 features): 0.7896253602305475

Classification Report (5000 features):
              precision    recall  f1-score   support

           E       0.76      0.13      0.22       400
           I       0.79      0.99      0.88      1335

    accuracy                           0.79      1735
   macro avg       0.78      0.56      0.55      1735
weighted avg       0.78      0.79      0.73      1735



In [11]:
# TF-IDF with more features (10,000)
vectorizer2 = TfidfVectorizer(
    max_features=10000,
    stop_words="english"
)

X2 = vectorizer2.fit_transform(X_text)

X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

model2 = LogisticRegression(max_iter=2000)
model2.fit(X2_train, y2_train)

y2_pred = model2.predict(X2_test)

print("Improved Accuracy (10,000 features):", accuracy_score(y2_test, y2_pred))
print("\nClassification Report (10,000 features):")
print(classification_report(y2_test, y2_pred))


Improved Accuracy (10,000 features): 0.7878962536023055

Classification Report (10,000 features):
              precision    recall  f1-score   support

           E       0.81      0.10      0.19       400
           I       0.79      0.99      0.88      1335

    accuracy                           0.79      1735
   macro avg       0.80      0.55      0.53      1735
weighted avg       0.79      0.79      0.72      1735



In [12]:
def predict_EI(text):
    cleaned = clean_text(text)
    vec = vectorizer2.transform([cleaned])  # use improved vectorizer + model
    pred = model2.predict(vec)[0]
    return pred

sample_text = "I love hanging out with friends and meeting new people."
print("Predicted E/I:", predict_EI(sample_text))


Predicted E/I: E


In [13]:
def predict_EI(text):
    cleaned = clean_text(text)
    vec = vectorizer2.transform([cleaned])  # use improved vectorizer + model
    pred = model2.predict(vec)[0]
    return pred

sample_text = "I love hanging out with friends and meeting new people."
print("Predicted E/I:", predict_EI(sample_text))


Predicted E/I: E


In [14]:
# Create 4 label columns for each MBTI dimension
df["EI"] = df["type"].str[0]  # E or I
df["SN"] = df["type"].str[1]  # S or N
df["TF"] = df["type"].str[2]  # T or F
df["JP"] = df["type"].str[3]  # J or P

# Check a few rows
df[["type", "EI", "SN", "TF", "JP", "clean_posts"]].head()


Unnamed: 0,type,EI,SN,TF,JP,clean_posts
0,INFJ,I,N,F,J,and moments sportscenter not top ten plays pra...
1,ENTP,E,N,T,P,i m finding the lack of me in these posts very...
2,INTP,I,N,T,P,good one course to which i say i know that s m...
3,INTJ,I,N,T,J,dear i enjoyed our conversation the other day ...
4,ENTJ,E,N,T,J,you re fired that s another silly misconceptio...


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X_text = df["clean_posts"]

# Use 10,000 features since that worked better before
vectorizer_all = TfidfVectorizer(
    max_features=10000,
    stop_words="english"
)

X_all = vectorizer_all.fit_transform(X_text)
X_all.shape


(8675, 10000)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

dimensions = ["EI", "SN", "TF", "JP"]
models = {}   # to store trained models
scores = {}   # to store accuracy numbers

for dim in dimensions:
    print(f"\n===== Training for {dim} dimension =====")

    y = df[dim]

    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    clf = LogisticRegression(max_iter=2000)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"Accuracy for {dim}: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    models[dim] = clf
    scores[dim] = acc



===== Training for EI dimension =====
Accuracy for EI: 0.7879
              precision    recall  f1-score   support

           E       0.81      0.10      0.19       400
           I       0.79      0.99      0.88      1335

    accuracy                           0.79      1735
   macro avg       0.80      0.55      0.53      1735
weighted avg       0.79      0.79      0.72      1735


===== Training for SN dimension =====
Accuracy for SN: 0.8640
              precision    recall  f1-score   support

           N       0.86      1.00      0.93      1496
           S       1.00      0.01      0.02       239

    accuracy                           0.86      1735
   macro avg       0.93      0.51      0.48      1735
weighted avg       0.88      0.86      0.80      1735


===== Training for TF dimension =====
Accuracy for TF: 0.8161
              precision    recall  f1-score   support

           F       0.83      0.83      0.83       939
           T       0.80      0.81      0.80     

In [17]:
def predict_mbti_from_text(text):
    # reuse your clean_text function from before
    cleaned = clean_text(text)
    vec = vectorizer_all.transform([cleaned])

    letters = []
    for dim in dimensions:
        clf = models[dim]
        pred = clf.predict(vec)[0]
        letters.append(pred)

    return "".join(letters)

# Test it with some example text
sample_text = """
I enjoy spending time thinking about ideas and possibilities.
I like deep conversations and often reflect before I speak.
I prefer a small group of close friends over big parties.
"""

print("Predicted MBTI:", predict_mbti_from_text(sample_text))


Predicted MBTI: INTP


In [18]:
print("Final model accuracies by dimension:")
for dim in dimensions:
    print(f"{dim}: {scores[dim]:.4f}")


Final model accuracies by dimension:
EI: 0.7879
SN: 0.8640
TF: 0.8161
JP: 0.7043
