In [98]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import balanced_accuracy_score, accuracy_score

import pandas as pd
from utils import preprocessing

# Load Data

In [2]:
# Load data
questions = pd.read_csv('data/question_data.csv', sep=';')
questions = questions.set_index('Unnamed: 0')  # set first column (question number) as index
df_train = pd.read_parquet('data/train_descriptive.parquet')
df_test = pd.read_parquet('data/test_descriptive.parquet')
political_belief = 'q212813'

In [24]:
K_max = 30
# K_max questions
selected_questions = ['q212813'] + preprocessing.top_k_questions(keyword='descriptive', k=K_max, questions_df=questions, data=df_train)
df_train_top_kmax = preprocessing.preprocess(df_train[selected_questions])
df_test_top_kmax = preprocessing.preprocess(df_test[selected_questions])
display(df_train_top_kmax)

selected questions:  ['Do you like watching foreign movies with subtitles?'
 'Which type of wine would you prefer to drink outside of a meal, such as for leisure?'
 'Do you enjoy intense intellectual conversations?'
 'Have you smoked a cigarette in the last 6 months?'
 'How frequently do you drink alcohol?' 'Rate your self-confidence:'
 'How often do you keep your promises?'
 'How often are you open with your feelings?'
 "What's your deal with harder drugs (stuff beyond pot)?"
 'Are you happy with your life?'
 "What's your relationship with marijuana?"
 'If you had to name your greatest motivation in life thus far, what would it be?'
 'How often do you brush your teeth?'
 'How important are your political beliefs to you?'
 'Do you often make jokes that offend more uptight people?'
 'How much influence or control do your parents have over your life?'
 'Do you keep a budget (of your finances)?'
 'Do you space out or daydream a lot?'
 'How do you feel about documentaries?'
 "If you don't 

Unnamed: 0_level_0,q212813,q416235,q85419,q358084,q501,q77,q20930,q29829,q35660,q80,...,q40441,q53611,q19874,q1062,q158,q80041,q128,q16713,q1707,q8155
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31844,Centrist,Yes,"Red (such as Merlot, Cabernet, Shiraz).",Yes,No,Sometimes,Average,Whenever possible,Usually,I never do drugs.,...,Maybe a little bit.,Never.,6-12 months,Usually daily. I skip some.,Yes,Yes,I have no tattoos,No,Meh. I have my ups and downs.,Seldom or never.
21628,Other,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,Yes,Rarely,Higher than average,Whenever possible,Usually,I never do drugs.,...,No.,Frequently.,12+ months,Usually daily. I skip some.,No,Yes,I have no tattoos,Yes,Cheerful! I have a positive outlook.,Seldom or never.
51271,Other,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,Yes,Sometimes,Higher than average,Always,Usually,I never do drugs.,...,Maybe a little bit.,Rarely.,12+ months,At least once a day.,Yes,Yes,I have 1 or more BIG tattoos,Yes,Cheerful! I have a positive outlook.,Once every month or three.
12228,Liberal / Left-wing,Can't answer without a subtitle,Rosé (such as White Zinfindel).,Yes,No,Never,Below average,Whenever possible,Usually,I never do drugs.,...,Maybe a little bit.,Rarely.,12+ months,At least once a day.,No,No,I have no tattoos,Yes,Meh. I have my ups and downs.,Seldom or never.
57902,Conservative / Right-wing,No,I don't drink wine.,Yes,Yes,Never,Higher than average,Always,Always,"I've done drugs in the past, but no longer.",...,No.,Never.,6-12 months,At least once a day.,No,Yes,I have no tattoos,Yes,Cheerful! I have a positive outlook.,Seldom or never.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26463,Centrist,Yes,I don't drink wine.,Yes,No,Rarely,Average,Always,Usually,I never do drugs.,...,Yes.,Never.,12+ months,At least once a day.,No,Yes,I have no tattoos,Yes,Cheerful! I have a positive outlook.,Seldom or never.
33094,Liberal / Left-wing,Yes,"Red (such as Merlot, Cabernet, Shiraz).",Yes,Yes,Very often,"Very, very high",Always,Never,"I've done drugs in the past, but no longer.",...,Yes.,Never.,12+ months,Usually daily. I skip some.,Yes,No,I have no tattoos,No,Cheerful! I have a positive outlook.,Weekly / Bi-Weekly
59462,Liberal / Left-wing,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,No,Sometimes,Average,Whenever possible,Usually,I do drugs occasionally.,...,Yes.,Frequently.,6-12 months,At least once a day.,Yes,No,I have no tattoos,No,Cheerful! I have a positive outlook.,Once every month or three.
27366,Other,No,Rosé (such as White Zinfindel).,Yes,No,Sometimes,Higher than average,Always,Usually,"I've done drugs in the past, but no longer.",...,No.,Frequently.,6-12 months,At least once a day.,Yes,Yes,I have no tattoos,Yes,Cheerful! I have a positive outlook.,Seldom or never.


In [25]:
df_train_top_kmax.shape, df_test_top_kmax.shape

((13570, 31), (3334, 31))

## 1. Try top 10

In [12]:
k = 10

In [26]:
# questions
top_10 = selected_questions[:k]
df_train_top_10 = df_train_top_kmax[top_10]
df_test_top_10 = df_test_top_kmax[top_10]
display(df_train_top_10)

Unnamed: 0_level_0,q212813,q416235,q85419,q358084,q501,q77,q20930,q29829,q35660,q80
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
31844,Centrist,Yes,"Red (such as Merlot, Cabernet, Shiraz).",Yes,No,Sometimes,Average,Whenever possible,Usually,I never do drugs.
21628,Other,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,Yes,Rarely,Higher than average,Whenever possible,Usually,I never do drugs.
51271,Other,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,Yes,Sometimes,Higher than average,Always,Usually,I never do drugs.
12228,Liberal / Left-wing,Can't answer without a subtitle,Rosé (such as White Zinfindel).,Yes,No,Never,Below average,Whenever possible,Usually,I never do drugs.
57902,Conservative / Right-wing,No,I don't drink wine.,Yes,Yes,Never,Higher than average,Always,Always,"I've done drugs in the past, but no longer."
...,...,...,...,...,...,...,...,...,...,...
26463,Centrist,Yes,I don't drink wine.,Yes,No,Rarely,Average,Always,Usually,I never do drugs.
33094,Liberal / Left-wing,Yes,"Red (such as Merlot, Cabernet, Shiraz).",Yes,Yes,Very often,"Very, very high",Always,Never,"I've done drugs in the past, but no longer."
59462,Liberal / Left-wing,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,No,Sometimes,Average,Whenever possible,Usually,I do drugs occasionally.
27366,Other,No,Rosé (such as White Zinfindel).,Yes,No,Sometimes,Higher than average,Always,Usually,"I've done drugs in the past, but no longer."


In [27]:
df_train_top_10.shape

(13570, 10)

In [83]:
one_hot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()
X_top10_train_onehot, y_top10_train_encoded, X_top10_test_onehot, y_top10_test_encoded = preprocessing.encode_train_test_df(one_hot_encoder, label_encoder, df_train_top_10, df_test_top_10)

In [84]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42, class_weight='balanced')
classifier.fit(X_top10_train_onehot, y_top10_train_encoded)
y_top10_pred = classifier.predict(X_top10_test_onehot)

In [68]:
y_top10_pred_decoded = label_encoder.inverse_transform(y_top10_pred)
pd.DataFrame((pd.crosstab(df_test_top_10[political_belief], y_top10_pred_decoded, rownames=['Actual Belief'], colnames=['Predicted Belief'], normalize=0)))

Predicted Belief,Centrist,Conservative / Right-wing,Liberal / Left-wing,Other
Actual Belief,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Centrist,0.29734,0.200313,0.267606,0.234742
Conservative / Right-wing,0.235294,0.315126,0.151261,0.298319
Liberal / Left-wing,0.267477,0.160334,0.362462,0.209726
Other,0.239264,0.186678,0.292726,0.281332


## 1. Try top 30

In [50]:
k = 30

In [51]:
# questions
top_30 = selected_questions[:k]
df_train_top_30 = df_train_top_kmax[top_30]
df_test_top_30 = df_test_top_kmax[top_30]
display(df_train_top_30)

Unnamed: 0_level_0,q212813,q416235,q85419,q358084,q501,q77,q20930,q29829,q35660,q80,...,q442,q40441,q53611,q19874,q1062,q158,q80041,q128,q16713,q1707
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31844,Centrist,Yes,"Red (such as Merlot, Cabernet, Shiraz).",Yes,No,Sometimes,Average,Whenever possible,Usually,I never do drugs.,...,Bad,Maybe a little bit.,Never.,6-12 months,Usually daily. I skip some.,Yes,Yes,I have no tattoos,No,Meh. I have my ups and downs.
21628,Other,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,Yes,Rarely,Higher than average,Whenever possible,Usually,I never do drugs.,...,Bad,No.,Frequently.,12+ months,Usually daily. I skip some.,No,Yes,I have no tattoos,Yes,Cheerful! I have a positive outlook.
51271,Other,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,Yes,Sometimes,Higher than average,Always,Usually,I never do drugs.,...,Good,Maybe a little bit.,Rarely.,12+ months,At least once a day.,Yes,Yes,I have 1 or more BIG tattoos,Yes,Cheerful! I have a positive outlook.
12228,Liberal / Left-wing,Can't answer without a subtitle,Rosé (such as White Zinfindel).,Yes,No,Never,Below average,Whenever possible,Usually,I never do drugs.,...,Bad,Maybe a little bit.,Rarely.,12+ months,At least once a day.,No,No,I have no tattoos,Yes,Meh. I have my ups and downs.
57902,Conservative / Right-wing,No,I don't drink wine.,Yes,Yes,Never,Higher than average,Always,Always,"I've done drugs in the past, but no longer.",...,Bad,No.,Never.,6-12 months,At least once a day.,No,Yes,I have no tattoos,Yes,Cheerful! I have a positive outlook.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26463,Centrist,Yes,I don't drink wine.,Yes,No,Rarely,Average,Always,Usually,I never do drugs.,...,Bad,Yes.,Never.,12+ months,At least once a day.,No,Yes,I have no tattoos,Yes,Cheerful! I have a positive outlook.
33094,Liberal / Left-wing,Yes,"Red (such as Merlot, Cabernet, Shiraz).",Yes,Yes,Very often,"Very, very high",Always,Never,"I've done drugs in the past, but no longer.",...,Good,Yes.,Never.,12+ months,Usually daily. I skip some.,Yes,No,I have no tattoos,No,Cheerful! I have a positive outlook.
59462,Liberal / Left-wing,Can't answer without a subtitle,"Red (such as Merlot, Cabernet, Shiraz).",Yes,No,Sometimes,Average,Whenever possible,Usually,I do drugs occasionally.,...,Bad,Yes.,Frequently.,6-12 months,At least once a day.,Yes,No,I have no tattoos,No,Cheerful! I have a positive outlook.
27366,Other,No,Rosé (such as White Zinfindel).,Yes,No,Sometimes,Higher than average,Always,Usually,"I've done drugs in the past, but no longer.",...,Bad,No.,Frequently.,6-12 months,At least once a day.,Yes,Yes,I have no tattoos,Yes,Cheerful! I have a positive outlook.


In [52]:
df_train_top_30.shape

(13570, 30)

In [78]:
one_hot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()
X_top30_train_onehot, y_top30_train_encoded, X_top30_test_onehot, y_top30_test_encoded = preprocessing.encode_train_test_df(one_hot_encoder, label_encoder, df_train_top_30, df_test_top_30)

In [81]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42, class_weight='balanced')
classifier.fit(X_top30_train_onehot, y_top30_train_encoded)
y_top30_pred = classifier.predict(X_top30_test_onehot)
y_top30_pred_decoded = label_encoder.inverse_transform(y_top30_pred)


# Evaluation

1. Accuracy, Balanced Accuracy

In [99]:
print('Top 10 accuary: ',accuracy_score(y_top10_test_encoded, y_top10_pred))
print('Top 30 accuary: ',accuracy_score(y_top30_test_encoded, y_top30_pred))

print('Top 10 balanced accuary: ', balanced_accuracy_score(y_top10_test_encoded, y_top10_pred))
print('Top 30 balanced accuary: ', balanced_accuracy_score(y_top30_test_encoded, y_top30_pred))


Top 10 accuary:  0.3188362327534493
Top 30 accuary:  0.4631073785242951
Top 10 balanced accuary:  0.314064953595296
Top 30 balanced accuary:  0.3661152788771038


2. Confusion Matrix

In [95]:
display(pd.DataFrame(pd.crosstab(df_test_top_10[political_belief], 
                        y_top10_pred_decoded,
                        rownames=['Actual Belief'], colnames=['Predicted Belief'], normalize=0)).style.set_caption('Top 10 questions'))
display(pd.DataFrame(pd.crosstab(df_test_top_30[political_belief], 
                        y_top30_pred_decoded, 
                        rownames=['Actual Belief'], colnames=['Predicted Belief'], normalize=0)).style.set_caption('Top 30 questions'))

Predicted Belief,Centrist,Conservative / Right-wing,Liberal / Left-wing,Other
Actual Belief,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Centrist,0.29734,0.200313,0.267606,0.234742
Conservative / Right-wing,0.235294,0.315126,0.151261,0.298319
Liberal / Left-wing,0.267477,0.160334,0.362462,0.209726
Other,0.239264,0.186678,0.292726,0.281332


Predicted Belief,Centrist,Conservative / Right-wing,Liberal / Left-wing,Other
Actual Belief,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Centrist,0.15493,0.014085,0.483568,0.347418
Conservative / Right-wing,0.151261,0.079832,0.436975,0.331933
Liberal / Left-wing,0.098024,0.012918,0.727964,0.161094
Other,0.079755,0.011394,0.369851,0.539001
