### Data pre-processing of the Big Five personality test

In [1]:
# Imports
import pandas as pd
import numpy as np

In [2]:
# Load the raw survey responses; the fields in this file are separated by tabs
data = pd.read_csv("../data/big5/data-final.csv", sep='\t')

#### Removing unwanted entries

In [3]:
# Restrict to samples whose IP address submitted exactly one sample (IPC == 1)
is_single_submission = data['IPC'] == 1
data = data.loc[is_single_submission]
# The first 50 columns are the question columns; everything else is discarded
questions = data.columns[:50]
data = data[list(questions)]

In [4]:
# Check for invalid samples.
# The three criteria are evaluated for all rows at once on a float matrix instead of
# looking up every row with .loc inside a Python loop, which is very slow on a large frame.
# (Assumes the question columns are numeric, which the np.isnan check already required.)
answers = data.to_numpy(dtype=float)

# Samples with 0 as entry
has_zero = (answers == 0).any(axis=1)
# Samples that contain nan values
has_nan = np.isnan(answers).any(axis=1)
# Samples that have the same answer for every question (every column equals the first one)
is_constant = (answers == answers[:, [0]]).all(axis=1)

invalid_mask = has_zero | has_nan | is_constant

# Index labels of the rejected samples, kept under the original name for later inspection
invalid_entries = data.index[invalid_mask].tolist()

# Boolean filtering removes exactly the flagged rows, even if index labels were repeated
data = data.loc[~invalid_mask]

In [5]:
# Cleaned answers as an integer NumPy array (one row per sample, one column per question)
data_5states = data.to_numpy(dtype=np.int_)

In [8]:
# Keep all five states (q=5)
data_5states = data.to_numpy(dtype=int)

# 11 components: column indices of the questions that are exported
columns_n11 = [0, 1, 10, 11, 20, 21, 22, 30, 31, 40, 41]

# Output file -> number of leading samples to write (None = complete dataset)
outputs_n11 = {
    '../data/big5/Big5_q5_n11_N1000.dat': 1000,
    '../data/big5/Big5_q5_n11_N2000.dat': 2000,
    '../data/big5/Big5_q5_n11_N10000.dat': 10_000,
    '../data/big5/Big5_q5_n11_N100000.dat': 100_000,
    '../data/big5/Big5_q5_n11_N200000.dat': 200_000,
    '../data/big5/Big5_q5_n11.dat': None,
}

# Each row is written as a string of digits with no separator between the answers
for path, n_samples in outputs_n11.items():
    np.savetxt(path, data_5states[:n_samples, columns_n11], fmt='%i', delimiter='')


In [6]:
# 12 components: column indices of the questions that are exported
columns_n12 = [0, 1, 2, 10, 11, 20, 21, 22, 30, 31, 40, 41]

# (output file, number of leading samples) pairs: N = 10_000 and N = 100_000
for path, n_samples in (
    ('../data/big5/Big5_q5_n12_N10000.dat', 10_000),
    ('../data/big5/Big5_q5_n12_N100000.dat', 100_000),
):
    np.savetxt(path, data_5states[:n_samples, columns_n12], fmt='%i', delimiter='')

In [7]:
# 13 components: column indices of the questions that are exported
columns_n13 = [0, 1, 2, 10, 11, 12, 20, 21, 22, 30, 31, 40, 41]

# (output file, number of leading samples) pairs: N = 10_000 and N = 100_000
for path, n_samples in (
    ('../data/big5/Big5_q5_n13_N10000.dat', 10_000),
    ('../data/big5/Big5_q5_n13_N100000.dat', 100_000),
):
    np.savetxt(path, data_5states[:n_samples, columns_n13], fmt='%i', delimiter='')

In [8]:
# 14 components: column indices of the questions that are exported
columns_n14 = [0, 1, 2, 10, 11, 12, 20, 21, 22, 30, 31, 32, 40, 41]

# (output file, number of leading samples) pairs: N = 10_000 and N = 100_000
for path, n_samples in (
    ('../data/big5/Big5_q5_n14_N10000.dat', 10_000),
    ('../data/big5/Big5_q5_n14_N100000.dat', 100_000),
):
    np.savetxt(path, data_5states[:n_samples, columns_n14], fmt='%i', delimiter='')

In [9]:
# 15 components: column indices of the questions that are exported
columns_n15 = [0, 1, 2, 10, 11, 12, 20, 21, 22, 30, 31, 32, 40, 41, 42]

# (output file, number of leading samples) pairs: N = 10_000 and N = 100_000
for path, n_samples in (
    ('../data/big5/Big5_q5_n15_N10000.dat', 10_000),
    ('../data/big5/Big5_q5_n15_N100000.dat', 100_000),
):
    np.savetxt(path, data_5states[:n_samples, columns_n15], fmt='%i', delimiter='')

#### Discretizing data to have 3 different states

In [136]:
# Integer array of the cleaned answers; both 3-state schemes below start from it
data_3states = data.to_numpy(dtype=np.int_)

##### Scheme 1

- Map score of 4 (slightly agree) and 5 (agree) to state $\alpha = 2$
- Map score of 3 (neutral) to state $\alpha = 1$
- Map score of 2 (slightly disagree) and 1 (disagree) to state $\alpha = 0$ 

In [150]:
# Scheme 1: scores below 3 -> state 0, score 3 -> state 1, scores above 3 -> state 2.
# np.select evaluates the conditions on the untouched scores, so the result does not
# depend on the order in which the states are assigned.
data_3states_v1 = np.select(
    [data_3states < 3, data_3states == 3],
    [0, 1],
    default=2,
).astype(data_3states.dtype)

In [13]:
# Save as strings: one row per sample, answers written as digits without a separator.
# The file goes to the lowercase 'big5' directory, the same one the raw data is read from
# and the q=5 files are written to, so the path also resolves on case-sensitive file systems.
np.savetxt('../data/big5/Big5_q3_v1.dat', data_3states_v1, fmt='%i', delimiter='')

In [151]:
# Reduced data (columns: 0,1;10,11;20,21,22;30,31;40,41)
# Only first 1000 samples
small_data_set = data_3states_v1[:1000, [0, 1, 10, 11, 20, 21, 22, 30, 31, 40, 41]]
# Written to the lowercase 'big5' directory used for the raw data and the q=5 files,
# so the path also resolves on case-sensitive file systems.
np.savetxt('../data/big5/Big5_q3_v1_n11_N1000.dat', small_data_set, fmt='%i', delimiter='')

##### Scheme 2

- Calculate the average, $\bar{x}$, across the samples for every question.
- Choose value for $\epsilon$

* Map value above $\bar{x} + \epsilon$ to state $\alpha = 2$
* Map value between $\bar{x} + \epsilon$ and $\bar{x} - \epsilon$ to state $\alpha = 1$
* Map value below $\bar{x} - \epsilon$ to state $\alpha = 0$


In [137]:
# Independent working copy so that scheme 2 never modifies the shared integer array
data_3states_v2 = data_3states.copy(order='K')

In [138]:
# Per-question mean score, taken over all samples (axis 0)
avg = data_3states_v2.mean(axis=0)
# Epsilon: half-width of the middle band around each question's mean
eps = 0.75

In [139]:
# Scheme 2: compare every answer with its question's mean +/- eps.
#   below avg - eps -> state 0
#   above avg + eps -> state 2
#   everything else -> state 1
# The middle band is the default, so an answer lying exactly on a boundary is mapped to
# state 1. With strict comparisons on both sides it would keep its raw 1-5 score and
# leave values outside {0, 1, 2} in the output.
lower_bound = avg - eps
upper_bound = avg + eps
data_3states_v2 = np.select(
    [data_3states < lower_bound, data_3states > upper_bound],
    [0, 2],
    default=1,
).astype(data_3states.dtype)

In [116]:
# Save as strings: one row per sample, answers written as digits without a separator.
# The scheme 2 array is written here; saving data_3states_v1 under the v2 file name
# would silently duplicate the scheme 1 data.
# The file goes to the lowercase 'big5' directory, the same one the raw data is read from
# and the q=5 files are written to, so the path also resolves on case-sensitive file systems.
np.savetxt('../data/big5/Big5_q3_v2.dat', data_3states_v2, fmt='%i', delimiter='')

In [149]:
# Reduced data (columns: 0,1;10,11;20,21,22;30,31;40,41)
# Only first 1000 samples
small_data_set = data_3states_v2[:1000, [0, 1, 10, 11, 20, 21, 22, 30, 31, 40, 41]]
# Written to the lowercase 'big5' directory used for the raw data and the q=5 files,
# so the path also resolves on case-sensitive file systems.
np.savetxt('../data/big5/Big5_q3_v2_n11_N1000.dat', small_data_set, fmt='%i', delimiter='')