In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('Done_mbti.csv')

In [3]:
X = df['posts']
y = df['type']

In [4]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


In [5]:
vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')
X = vectorizer.fit_transform(X).toarray()

In [6]:
n_splits = 5
kf = KFold(n_splits=n_splits)
accuracy_list = []
f1_list = []


In [7]:
for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    
    y_pred = gbc.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    accuracy_list.append(accuracy)
    f1_list.append(f1)

In [10]:
accuracy = np.mean(accuracy_list)
f1 = np.mean(f1_list)

In [11]:
print("Gradient Boosting with Cross Validation")
print("Mean Accuracy: {:.2f}".format(accuracy))
print("Mean F1 Score: {:.2f}".format(f1))

Gradient Boosting with Cross Validation
Mean Accuracy: 0.52
Mean F1 Score: 0.51
