In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import nltk

In [2]:
data = pd.read_csv("mbti_cleaned.csv")

## Feature Extraction using TF-IDF and Count Vectorizer
Here we create the TF-IDF Model in order to extract relevant features from the data and send it to the model for classification

In [3]:


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [4]:
data.dropna(inplace= True)
#x_train.isna().sum()

In [5]:
x_train, x_test, y_train, y_test = train_test_split(data['Posts'],data['type'], random_state=0)
c_v = CountVectorizer(stop_words="english")

In [6]:
X_train_counts = c_v.fit_transform(x_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

## Support Vector Machines Model

In [7]:
from sklearn.svm import LinearSVC
clf = LinearSVC().fit(X_train_tfidf,y_train)
y_pred = clf.predict(c_v.transform(x_test))

In [8]:
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred,target_names=data['type'].unique()))


              precision    recall  f1-score   support

        INFJ       0.60      0.25      0.35        48
        ENTP       0.52      0.38      0.44       143
        INTP       0.59      0.38      0.46        53
        INTJ       0.56      0.36      0.44       151
        ENTJ       0.29      0.22      0.25         9
        INFP       0.00      0.00      0.00         8
        ENFP       0.00      0.00      0.00         9
        ISFP       0.50      0.18      0.27        22
        ENFJ       0.56      0.63      0.59       337
        ISTP       0.58      0.70      0.64       398
        ISFJ       0.58      0.53      0.55       230
        ISTJ       0.61      0.65      0.63       285
        ESTP       0.58      0.41      0.48        34
        ESFP       0.28      0.42      0.33        45
        ESTJ       0.38      0.55      0.45        40
        ESFJ       0.47      0.61      0.53        75

    accuracy                           0.55      1887
   macro avg       0.44   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
print(metrics.accuracy_score(y_test,y_pred)*100)

55.21992580816111


# Hyper Parameter Tuning

In [None]:
C = np.arange(0.1,1,0.01)
accuracy=[]
for i in C:
    clf = LinearSVC(C=i).fit(X_train_tfidf,y_train)
    y_pred = clf.predict(c_v.transform(x_test))
    print("C is: ",i,"Accuracy is :",metrics.accuracy_score(y_test,y_pred))
    accuracy.append(metrics.accuracy_score(y_test,y_pred))

C is:  0.1 Accuracy is : 0.566507684154743
C is:  0.11 Accuracy is : 0.5659777424483307
C is:  0.12 Accuracy is : 0.5675675675675675
C is:  0.13 Accuracy is : 0.566507684154743
C is:  0.13999999999999999 Accuracy is : 0.5659777424483307
C is:  0.14999999999999997 Accuracy is : 0.5670376258611552
C is:  0.15999999999999998 Accuracy is : 0.5670376258611552
C is:  0.16999999999999998 Accuracy is : 0.5686274509803921
C is:  0.17999999999999997 Accuracy is : 0.5680975092739798
C is:  0.18999999999999995 Accuracy is : 0.5680975092739798
C is:  0.19999999999999996 Accuracy is : 0.5670376258611552
C is:  0.20999999999999996 Accuracy is : 0.566507684154743
C is:  0.21999999999999995 Accuracy is : 0.5649178590355061
C is:  0.22999999999999995 Accuracy is : 0.5654478007419184
C is:  0.23999999999999994 Accuracy is : 0.5643879173290938
C is:  0.24999999999999992 Accuracy is : 0.5633280339162692
C is:  0.2599999999999999 Accuracy is : 0.5643879173290938
C is:  0.2699999999999999 Accuracy is : 0.564

In [None]:
print(accuracy.index(max(accuracy)))
C[accuracy.index(max(accuracy))]

In [None]:
clf = LinearSVC(C=0.1699).fit(X_train_tfidf,y_train)
y_pred = clf.predict(c_v.transform(x_test))

In [None]:
metrics.accuracy_score(y_test,y_pred)*100   # best Accuracy 