In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [None]:
# Remote location of the MBTI CSV that the rest of the notebook works on.
url = "https://raw.githubusercontent.com/benedicta-kelechi/datasets/main/mbti_1.csv"

# Download and parse the CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

# Show the first five rows as this cell's output.
df.head(n=5)


In [None]:
def clean_text(text):
    """Normalise raw post text into lowercase, letters-only text.

    Processing order:
      1. Replace every ``|||`` separator with a single space.
      2. Delete URLs (``http`` followed by any run of non-whitespace).
      3. Delete every character that is not an ASCII letter or whitespace.
      4. Lowercase what remains.

    The separator is replaced *before* URLs are stripped because the URL
    pattern consumes everything up to the next whitespace: in input such as
    ``"http://a.com|||Hello"`` it would otherwise swallow the separator and
    the first word that follows it.

    Args:
        text: The raw text. Any non-string value (e.g. ``None`` or ``NaN``)
            is treated as an empty document.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    # re.sub raises TypeError on non-strings; map missing values to "".
    if not isinstance(text, str):
        return ""
    text = re.sub(r"\|\|\|", " ", text)  # separator first, so URLs end at it
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # remove numbers and punctuation
    return text.lower()

# Clean every entry of the 'posts' column element-wise and store the result
# alongside the original text in a new 'cleaned_posts' column.
df['cleaned_posts'] = df['posts'].map(clean_text)


In [None]:
# Features are the cleaned post text; the target is the 'type' label column.
X = df['cleaned_posts']
y = df['type']

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
# Stratify on the label so each class keeps the same proportion in both splits;
# without it a rare class can end up under-represented in, or absent from, the
# test set. NOTE(review): stratification requires every class to have at least
# 2 rows — confirm against the dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# TF-IDF features limited to 5000 terms, with English stop words dropped.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# Fit on the training text only, then reuse the fitted vectorizer for the
# test text so both matrices share the same columns.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# Train a logistic-regression classifier (at most 300 solver iterations) on
# the TF-IDF training matrix. fit() returns the estimator itself, so the
# construction and training can be chained.
model = LogisticRegression(max_iter=300).fit(X_train_tfidf, y_train)

# Keep the fitted estimator as the cell's displayed output.
model


In [None]:
# Predict a label for every held-out document.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision / recall / F1 breakdown.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)


In [None]:
# Free-text example to run through the trained pipeline.
sample_text = """
I love trying new ideas, making new friends, and discussing big philosophical questions.
"""

# Apply the same cleaning function and the already-fitted vectorizer that the
# training data went through.
sample_clean = clean_text(sample_text)
sample_vec = vectorizer.transform([sample_clean])

# Exactly one document was passed in, so exactly one label comes back.
(predicted_type,) = model.predict(sample_vec)

print("Predicted MBTI Type:", predicted_type)
