# Categorical Naive Bayes

In [1]:
import pandas as pd
from utils import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split


In [2]:
# Load data
questions = pd.read_csv('data/question_data.csv', sep=';')
questions = questions.set_index('Unnamed: 0')  # set first column (question number) as index
data = pd.read_parquet('data/train.parquet')

In [28]:
# questions
selected_questions = ['q212813'] + preprocessing.top_k_questions(keyword='descriptive', k=10, questions_df=questions, data=data)
unordered_categories, ordered_categories = preprocessing.get_categories(selected_questions, questions, {'q20930', 'q77', 'q80', 'q79'})
feature_target_df = preprocessing.preprocess(data[selected_questions], unordered_categories, ordered_categories)
display(feature_target_df)

selected questions:  ['Do you like watching foreign movies with subtitles?'
 'Which type of wine would you prefer to drink outside of a meal, such as for leisure?'
 'Do you enjoy intense intellectual conversations?'
 'Have you smoked a cigarette in the last 6 months?'
 'How frequently do you drink alcohol?' 'Rate your self-confidence:'
 'How often do you keep your promises?'
 'How often are you open with your feelings?'
 "What's your deal with harder drugs (stuff beyond pot)?"
 'Are you happy with your life?']


Unnamed: 0_level_0,q212813,q416235,q85419,q358084,q501,q77,q20930,q29829,q35660,q80,q4018
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
28586,Centrist,Can't answer without a subtitle,"White (such as Chardonnay, Riesling).",Yes,No,Sometimes,Higher than average,Whenever possible,Usually,I never do drugs.,Yes
8701,Liberal / Left-wing,Yes,Rosé (such as White Zinfindel).,Yes,No,Very often,Higher than average,Always,Usually,I never do drugs.,Yes
31844,Centrist,Yes,"Red (such as Merlot, Cabernet, Shiraz).",Yes,No,Sometimes,Average,Whenever possible,Usually,I never do drugs.,Yes
39046,Centrist,Can't answer without a subtitle,Rosé (such as White Zinfindel).,Yes,No,Sometimes,Average,Whenever possible,Usually,I never do drugs.,Yes
21628,Other,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,Yes,Rarely,Higher than average,Whenever possible,Usually,I never do drugs.,Yes
...,...,...,...,...,...,...,...,...,...,...,...
21184,Other,Can't answer without a subtitle,Rosé (such as White Zinfindel).,Yes,No,Rarely,Average,Always,Always,I never do drugs.,Yes
67652,Other,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,No,Sometimes,Below average,Always,Rarely,I never do drugs.,Yes
60546,Liberal / Left-wing,Yes,"Red (such as Merlot, Cabernet, Shiraz).",Yes,No,Sometimes,"Very, very high",Whenever possible,Always,I never do drugs.,Yes
1726,Other,Can't answer without a subtitle,Rosé (such as White Zinfindel).,No,Yes,Sometimes,Average,Whenever possible,Usually,I never do drugs.,Yes


In [29]:
le_target = LabelEncoder()
y = le_target.fit_transform(feature_target_df.loc[:,'q212813'])
le_target.classes_
print(le_target.inverse_transform([0, 1, 2, 3]))
X = feature_target_df.apply(LabelEncoder().fit_transform, axis=0)
display(y)
display(X)


['Centrist' 'Conservative / Right-wing' 'Liberal / Left-wing' 'Other']


array([0, 2, 0, ..., 2, 3, 3])

Unnamed: 0_level_0,q212813,q416235,q85419,q358084,q501,q77,q20930,q29829,q35660,q80,q4018
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
28586,0,0,3,1,0,2,2,3,3,2,1
8701,2,2,2,1,0,3,2,0,3,2,1
31844,0,2,1,1,0,2,0,3,3,2,1
39046,0,0,2,1,0,2,0,3,3,2,1
21628,3,0,1,1,1,1,2,3,3,2,1
...,...,...,...,...,...,...,...,...,...,...,...
21184,3,0,2,1,0,1,0,0,0,2,1
67652,3,0,1,1,0,2,1,0,2,2,1
60546,2,2,1,1,0,2,3,3,0,2,1
1726,3,0,2,0,1,2,0,3,3,2,1


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X_train.shape, X_test.shape)

(17625, 11) (8682, 11)


In [32]:
clf = CategoricalNB()

clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

train_acc = sum(target == prediction for target, prediction in zip(y_train, pred_train))/len(y_train)
test_acc = sum(target == prediction for target, prediction in zip(y_test, pred_test))/len(y_test)
print('train accuracy: ', train_acc)
print('test accuracy: ', test_acc)

(17625, 11)
[0 1 2 3]
[-1.68177985 -2.63266645 -0.87925462 -1.11786018]
[4 3 4 2 2 4 4 4 4 4 2]
(8682,)


(8682,)

(8682,)


array([2, 0, 1, ..., 3, 2, 1])

train accuracy:  1.0
test accuracy:  1.0
