In [1]:
import numpy as np 
import pandas as pd

In [2]:
dataset = pd.read_csv('data/personality_data.csv')

In [3]:
dataset.head()

Unnamed: 0,Age,Gender,Educational Background,Extraversion,Agreeableness,Conscientiousness,Openness to Experience,Emotional Stability,Values/Beliefs,Personality Class
0,17,Male,12th Grade,75,70,80,85,75,70,dependable
1,18,Female,12th Grade,70,80,85,75,70,80,serious
2,17,Male,11th Grade,55,60,75,80,60,75,responsible
3,16,Female,11th Grade,80,70,85,90,80,70,dependable
4,18,Male,12th Grade,65,75,90,70,80,70,serious


In [4]:
dataset.shape

(80, 10)

In [5]:
categorical_features = [feature for feature in dataset.columns if dataset[feature].dtype == 'O']
len(categorical_features)

3

In [6]:
categorical_features

['Gender', 'Educational Background', 'Personality Class']

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
LabelEncoder = LabelEncoder()

In [27]:
y_labels = dataset['Personality Class'].unique()

In [42]:
y_labels.reshape(5,-1)

array([['dependable'],
       ['serious'],
       ['responsible'],
       ['extraverted'],
       ['lively']], dtype=object)

In [43]:
encoded = LabelEncoder.fit_transform(y_labels)

In [44]:
encoded.reshape(5,-1)

array([[0],
       [4],
       [3],
       [1],
       [2]])

In [45]:
categorical_data = np.stack([y_labels,encoded],axis=1)

In [46]:
categorical_data = pd.DataFrame(categorical_data)
categorical_data

Unnamed: 0,0,1
0,dependable,0
1,serious,4
2,responsible,3
3,extraverted,1
4,lively,2


In [47]:
for feature in categorical_features:
    dataset[feature] = LabelEncoder.fit_transform(dataset[feature])

In [48]:
dataset

Unnamed: 0,Age,Gender,Educational Background,Extraversion,Agreeableness,Conscientiousness,Openness to Experience,Emotional Stability,Values/Beliefs,Personality Class
0,17,1,2,75,70,80,85,75,70,0
1,18,0,2,70,80,85,75,70,80,4
2,17,1,1,55,60,75,80,60,75,3
3,16,0,1,80,70,85,90,80,70,0
4,18,1,2,65,75,90,70,80,70,4
...,...,...,...,...,...,...,...,...,...,...
75,15,0,0,50,60,70,85,60,75,3
76,17,1,2,75,70,80,85,75,80,1
77,16,0,1,70,70,85,75,70,75,2
78,18,1,2,60,70,80,90,70,75,0


In [51]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [50]:
scaler = StandardScaler()

In [52]:
X_data = dataset.iloc[:,:-1]
y_data = dataset.iloc[:,-1:]

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_data,y_data, test_size=0.2, random_state=42, shuffle=True)

Model building 

In [60]:
from sklearn.ensemble import RandomForestClassifier

In [61]:
model = RandomForestClassifier()
model.fit(X_train,y_train)

  return fit_method(estimator, *args, **kwargs)


In [62]:
y_pred_test = model.predict(X_test)

In [63]:
y_pred_test

array([3, 0, 2, 1, 0, 0, 3, 3, 4, 2, 4, 0, 2, 3, 0, 3])

In [64]:
y_test

Unnamed: 0,Personality Class
30,3
0,0
22,2
31,1
18,0
28,0
10,3
70,3
4,4
12,2


In [65]:
from sklearn.metrics import accuracy_score

In [66]:
accuracy_score(y_test,y_pred_test)

1.0

In [67]:
model.predict(X_train)

array([0, 1, 3, 0, 4, 4, 3, 2, 4, 2, 3, 4, 1, 4, 1, 4, 2, 3, 0, 4, 1, 3,
       4, 0, 1, 0, 2, 0, 0, 3, 1, 1, 2, 0, 1, 0, 3, 2, 4, 1, 0, 4, 0, 2,
       2, 1, 2, 3, 4, 0, 4, 2, 4, 4, 2, 1, 3, 0, 4, 3, 3, 1, 4, 1])

In [68]:
y_train

Unnamed: 0,Personality Class
73,0
61,1
55,3
40,0
9,4
...,...
20,3
60,3
71,1
14,4


In [75]:
from joblib import dump,load

In [76]:
dump(model, 'personality_model.joblib')

['personality_model.joblib']

In [87]:
X_train.loc[2].values

array([17,  1,  1, 55, 60, 75, 80, 60, 75])

In [88]:
input = np.array([17,  1,  1, 55, 60, 75, 80, 60, 75])

In [89]:
input = input.reshape(1,-1)

In [90]:
model.predict(input)



array([3])

In [101]:
y_train.loc[2]

Personality Class    3
Name: 2, dtype: int64

In [102]:
import pickle

In [103]:
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))