## Import the required packages

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

import pickle

## Mount Google Drive and import dataset

In [2]:
# Colab-only step: attach Google Drive to the runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Location of the dataset CSV on the mounted Drive.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/deploy-ml-web-workshop/mbpt_dataset_2.csv"
data = pd.read_csv(dataset_path)

# Preview the first rows.
data.head()

Unnamed: 0,type,posts
0,INTJ,know intj tool use interaction people excuse a...
1,INTJ,rap music ehh opp yeah know valid well know fa...
2,INTJ,preferably p hd low except wew lad video p min...
3,INTJ,drink like wish could drink red wine give head...
4,INTJ,space program ah bad deal meing freelance max ...


In [4]:
# Count the non-null values in each column.
data.count()

type     106067
posts    106067
dtype: int64

In [5]:
# Number of rows per `type` label, most frequent first, to see how the classes are distributed.
data['type'].value_counts()

INTP    24961
INTJ    22427
INFJ    14963
INFP    12134
ENTP    11725
ENFP     6167
ISTP     3424
ENTJ     2955
ESTP     1986
ENFJ     1534
ISTJ     1243
ISFP      875
ISFJ      650
ESTJ      482
ESFP      360
ESFJ      181
Name: type, dtype: int64

## Split into train and test sets (stratified, so both keep the same class distribution)

In [6]:
def _highlight(text):
    """Wrap `text` in ANSI escape codes so the terminal prints it in color.

    `\033[93m` switches the foreground color on and `\033[00m` resets it.
    """
    return "\033[93m {}\033[00m".format(text)


# Hold out 20% of the rows for testing. Stratifying on `type` keeps the label
# proportions the same in both subsets; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["type"],
)

# For each subset: a colored banner followed by the frame, then its label counts.
for banner, subset in (("TRAIN DATA \n", train_data), ("TEST DATA \n", test_data)):
    print(_highlight(banner), subset)
    print(subset["type"].value_counts())

[93m TRAIN DATA 
[00m         type                                              posts
64196   ENTJ  user put sign username instance meoleme jumal ...
31881   INTP  year issue correct incorrect leave toss labor ...
68961   ENTP  steven spielberg nice rest trash curious thing...
102426  INFP  bad situation actually lot respect lazy people...
19036   INTJ  male friend quite literally drive crazy year a...
...      ...                                                ...
16713   INTJ  ni sound also like stuff say think brilliant i...
28060   INTP  useful time fare much good even feel someone p...
29514   INTP  look super attractive pic yes sometimes see wo...
92206   INFJ  way home school bus really incentive learn dri...
76239   ESFP  style seem familiar yes much friend family kno...

[84853 rows x 2 columns]
INTP    19969
INTJ    17941
INFJ    11970
INFP     9707
ENTP     9380
ENFP     4934
ISTP     2739
ENTJ     2364
ESTP     1589
ENFJ     1227
ISTJ      994
ISFP      700
ISFJ      520


## Tokenize and transform the data

In [7]:
# TF-IDF features: vocabulary capped at 5000 terms, English stop words removed.
tfidf_settings = {"max_features": 5000, "stop_words": "english"}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and IDF weights from the training posts only.
vectorizer.fit(train_data["posts"])

In [8]:
# Keep the TF-IDF matrices in the sparse format returned by the vectorizer.
# Calling .toarray() would allocate a dense (rows x 5000) float array -- several GB
# for the training set -- while LogisticRegression and LinearSVC both accept
# sparse input directly.
train_post = vectorizer.transform(train_data.posts)

test_post = vectorizer.transform(test_data.posts)

In [9]:
# Sanity check: (number of training rows, number of TF-IDF features).
train_post.shape

(84853, 5000)

In [10]:
# Map the string labels in `type` to integer class ids.
target_encoder = LabelEncoder()

# Fit the mapping on the training labels only, then reuse that same mapping for
# the test labels. Refitting on the test set (fit_transform) would overwrite the
# encoder's state and could assign different ids if a class were missing there.
train_target = target_encoder.fit_transform(train_data.type)
test_target = target_encoder.transform(test_data.type)

## Model testing and selection

In [11]:
# Maps each model's display name to its accuracy on the test set;
# filled in after each model is evaluated.
models_accuracy = dict()

#### Logistic Regression

In [12]:
# C=0.5 regularizes more strongly than the default (C=1.0); max_iter raises the
# solver's iteration cap; n_jobs=-1 allows use of all CPU cores.
log_reg_settings = {"C": 0.5, "max_iter": 3000, "n_jobs": -1}
model_log = LogisticRegression(**log_reg_settings)

model_log.fit(X=train_post, y=train_target)

In [13]:
# Class names in encoded order (position i is the label for class id i).
# Reading them from the encoder avoids hard-coding the number of classes.
class_names = target_encoder.classes_

# Per-class precision/recall/F1 on the training data, then on the held-out test data.
# zero_division=0 reports 0 (instead of warning) for classes with no predictions.
print('Train Classification Report \n ', classification_report(train_target, model_log.predict(train_post), zero_division=0, target_names=class_names))

print('Test Classification Report \n', classification_report(test_target, model_log.predict(test_post), zero_division=0, target_names=class_names))

Train Classification Report 
                precision    recall  f1-score   support

        ENFJ       0.89      0.54      0.67      1227
        ENFP       0.87      0.78      0.82      4934
        ENTJ       0.91      0.70      0.79      2364
        ENTP       0.86      0.83      0.84      9380
        ESFJ       0.85      0.12      0.21       145
        ESFP       0.90      0.29      0.44       288
        ESTJ       0.97      0.67      0.79       386
        ESTP       0.96      0.86      0.91      1589
        INFJ       0.84      0.87      0.85     11970
        INFP       0.83      0.85      0.84      9707
        INTJ       0.83      0.90      0.86     17941
        INTP       0.84      0.91      0.87     19969
        ISFJ       0.86      0.39      0.54       520
        ISFP       0.83      0.49      0.61       700
        ISTJ       0.89      0.52      0.65       994
        ISTP       0.90      0.74      0.81      2739

    accuracy                           0.85     8

In [14]:
# Record logistic regression's test-set accuracy for the final comparison.
log_test_predictions = model_log.predict(test_post)
models_accuracy['Logistic Regression'] = accuracy_score(test_target, log_test_predictions)

#### Linear Support Vector Classifier

In [15]:
# Linear SVM with C=0.1 (stronger regularization than the default C=1.0).
# random_state pins the data shuffling used by the dual coordinate-descent solver
# so repeated runs give the same model; 42 matches the seed used for the split.
model_linear_svc = LinearSVC(C=0.1, random_state=42)

model_linear_svc.fit(train_post, train_target)

In [16]:
# Class names in encoded order (position i is the label for class id i).
# Reading them from the encoder avoids hard-coding the number of classes.
class_names = target_encoder.classes_

# Per-class precision/recall/F1 on the training data, then on the held-out test data.
# zero_division=0 reports 0 (instead of warning) for classes with no predictions.
print('Train Classification Report \n ', classification_report(train_target, model_linear_svc.predict(train_post), zero_division=0, target_names=class_names))

print('Test Classification Report \n', classification_report(test_target, model_linear_svc.predict(test_post), zero_division=0, target_names=class_names))

Train Classification Report 
                precision    recall  f1-score   support

        ENFJ       0.90      0.64      0.75      1227
        ENFP       0.88      0.80      0.84      4934
        ENTJ       0.92      0.77      0.84      2364
        ENTP       0.88      0.84      0.86      9380
        ESFJ       0.95      0.37      0.53       145
        ESFP       0.93      0.50      0.65       288
        ESTJ       0.95      0.80      0.87       386
        ESTP       0.95      0.92      0.93      1589
        INFJ       0.86      0.87      0.87     11970
        INFP       0.85      0.86      0.85      9707
        INTJ       0.85      0.90      0.88     17941
        INTP       0.85      0.92      0.88     19969
        ISFJ       0.91      0.56      0.69       520
        ISFP       0.88      0.62      0.73       700
        ISTJ       0.91      0.66      0.77       994
        ISTP       0.91      0.81      0.86      2739

    accuracy                           0.86     8

In [17]:
# Record the linear SVC's test-set accuracy for the final comparison.
svc_test_predictions = model_linear_svc.predict(test_post)
models_accuracy['Linear Support Vector Classifier'] = accuracy_score(test_target, svc_test_predictions)

## Model accuracy summary

In [18]:
# Build a two-column table (model name, test accuracy) from the recorded scores.
accuracy = pd.DataFrame(list(models_accuracy.items()), columns = ['Models', 'Test accuracy'])

# Show the best model first; the gradient shades higher accuracies darker.
accuracy.sort_values(by = 'Test accuracy', ascending = False, ignore_index = True).style.background_gradient(cmap = 'Blues')

Unnamed: 0,Models,Test accuracy
0,Linear Support Vector Classifier,0.820354
1,Logistic Regression,0.811775


## Save (pickle) the final model

In [19]:
# Persist the fitted artifacts. The `with` blocks guarantee each file is flushed
# and closed, even if pickling raises.
# NOTE: target_encoder is not saved here; anything that loads these files needs
# its own way to map predicted class ids back to label names.

# Vectorizer
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Model
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model_linear_svc, model_file)

### Load and test the saved model

In [20]:
# Reload the saved artifacts; the `with` blocks close the files once loaded.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files you created or trust.
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

message = ["This is pretty much the worse movie I have ever watched. It's completely thrash!"]
# Keep the raw text and its TF-IDF features under distinct names.
message_features = loaded_vectorizer.transform(message)

result = loaded_model.predict(message_features)

# Decode the predicted class id with the in-memory encoder (it is not one of the pickled files).
print(result, target_encoder.inverse_transform(result))

[11] ['INTP']
