# Data Cleaning and Preparation

We first start by importing useful libraries:

In [5]:
import pandas as pd
import warnings
import numpy as np
import math

Then we can read our csv dataset:

In [6]:
# Load the raw survey answers into the working DataFrame.
data = pd.read_csv(filepath_or_buffer="Financial_literacy.csv")

Let's start checking what we are dealing with:

In [7]:
# Keep the column labels around and report how many columns were loaded.
columnlist = data.columns
print(columnlist.size)

106


In [8]:
# Bare expression: as the last statement of the cell it renders the frame preview.
data

Unnamed: 0,id,pesofitc,qf1,qf2,qf3_1,qf3_3,qf3_4,qf3_6,qf3_7,qf3_8,...,qk7_2,qk7_3,qd1,qd5b,qd7,qd9,qd10,qd12,SM,AREA5
0,1,0.706596,2,0,0,0,0,0,0,1,...,1,0,0,3,31,3,5,1,0,1
1,2,1.220904,2,1,0,0,0,1,0,0,...,1,1,0,3,53,4,1,1,0,1
2,3,1.797774,1,0,0,0,0,0,0,1,...,1,1,0,1,70,4,6,1,0,4
3,4,1.515358,2,0,0,0,0,0,0,1,...,1,-97,0,2,64,4,4,1,0,4
4,5,0.244932,2,0,0,0,0,0,0,1,...,0,0,1,2,50,3,1,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2371,692344,3.113144,3,1,1,0,0,0,0,0,...,-97,-97,1,4,18,4,9,1,1,3
2372,692345,2.112678,1,0,1,0,0,0,0,0,...,1,-97,0,1,68,4,6,1,1,3
2373,692346,2.090212,3,1,1,0,0,0,0,0,...,-97,0,0,2,31,1,2,1,1,3
2374,692347,2.040491,3,0,0,0,0,0,0,1,...,-97,-97,1,4,18,3,9,1,1,3


We see we have many columns mainly because for multiple choice questions replies are subdivided in 0 or 1s, meaning each multiple choice question is usually binarized. 

We also have columns that are not useful to us, so we can start by deleting those and then doing some light data aggregation on the others, in a way that preserves all information in the dataset.

In [9]:
# Remove, in place, the columns the analysis does not use
# (the per-column rationale is given in the project report).
data.drop(
    columns=[
        "id",
        "pesofitc",
        "SM",
        "qd5b",
        "qd12",
        "qf3_99",
        "qf9_99",
        "qprod1c_99",
        "qprod1_d",
        "qprod3_99",
        "qf12_97",
        "qf12_99",
    ],
    inplace=True,
)

The rationale behind each column elimination differs from column to column, but usually it is because the column is not useful (as with the "id" column), because it is unclear (as with "pesofitc"), or because we decided that the absence of a reply is in itself already a reply (as with "qprod1c_99"). For more information about column deletion, check the report associated with this project.

Now we have to aggregate three other features to create the target variable, as described in the report:

In [10]:
# Binary target variable, created with every respondent starting at 0;
# it is kept binary for simplicity.
data.loc[:, 'defrauded'] = 0

Our first line of reasoning is that in some cases the reply already tells us the interviewed person was the victim of a fraud, so for those cases we set the variable to 1:

In [11]:
# NOTE(review): blanket warning suppression is kept unchanged because it affects
# the whole session; the assignment below does not rely on it.
warnings.filterwarnings("ignore")
# A positive answer to qprod4_2 or qprod4_3 marks the respondent as defrauded.
# A single .loc assignment replaces the row-by-row chained assignment
# (data[col][i] = ...), which pandas does not guarantee to write through.
reported_fraud = (data['qprod4_2'] == 1) | (data['qprod4_3'] == 1)
data.loc[reported_fraud, 'defrauded'] = 1

In [12]:
# Number of respondents currently flagged as defrauded.
int((data['defrauded'] == 1).sum())

147

We see we already have 147 data points, but it is good to extract as much information as possible from the data itself, because that way we reduce the imbalance between victims and non-victims of fraud. To do so, we have to distinguish between three groups: those who are with certainty NOT victims of a fraud, those who have been victims of a fraud (already selected above), and those for whom it is unclear. The unclear group must in turn be partitioned into cases where information is still retrievable and cases where it is not. We can start by highlighting those who, with certainty, are not victims of a fraud, marked with -1:

In [13]:
# NOTE(review): blanket warning suppression is kept unchanged because it affects
# the whole session; the assignment below does not rely on it.
warnings.filterwarnings("ignore")
# All three qprod4 answers equal to zero -> certainly no fraud, marked with the
# temporary code -1. Written with .loc instead of chained assignment.
certainly_not_defrauded = (
    (data['qprod4_1'] == 0)
    & (data['qprod4_2'] == 0)
    & (data['qprod4_3'] == 0)
)
data.loc[certainly_not_defrauded, 'defrauded'] = -1

In [15]:
# Number of respondents marked as certainly not defrauded (-1).
int((data['defrauded'] == -1).sum())

1919

Now we can retrieve some data points utilising information from questions that came before:

In [16]:
# NOTE(review): blanket warning suppression is kept unchanged because it affects
# the whole session; the assignment below does not rely on it.
warnings.filterwarnings("ignore")
# Only respondents with qprod4_1 == 1 whose status is still undecided (0) are considered.
unclear_case = (data['qprod4_1'] == 1) & (data['defrauded'] == 0)
# If the choice was conditioned by friends, relatives or a financial adviser
# (qprod3_8 / qprod3_9 / qprod3_10) it is not counted as fraud, since they should
# have the respondent's best interest in mind.
trusted_advice = (
    (data['qprod3_8'] == 1)
    | (data['qprod3_9'] == 1)
    | (data['qprod3_10'] == 1)
)
# The qprod3_x questions are only asked to people who bought financial products,
# so a NaN here signals inconsistent data and the row is not promoted to fraud.
inconsistent_reply = data['qprod3_8'].isna()
# Written with .loc instead of row-by-row chained assignment.
data.loc[unclear_case & ~trusted_advice & ~inconsistent_reply, 'defrauded'] = 1

In [17]:
# Number of respondents flagged as defrauded after recovering the unclear cases.
int((data['defrauded'] == 1).sum())

186

For all those data points where it is still unclear whether the respondent was the victim of a fraud, we can consider that, with good probability, when the replies are mostly negative (two thirds negative and the remainder not positive) the person was indeed not the victim of a fraud:

In [10]:
# NOTE(review): blanket warning suppression is kept unchanged because it affects
# the whole session; the assignment below does not rely on it.
warnings.filterwarnings("ignore")
# Among the still-undecided rows (0), those where at least two of the three
# qprod4 answers are 0 receive the temporary code -2 (probably not defrauded).
qprod4_1_negative = data['qprod4_1'] == 0
qprod4_2_negative = data['qprod4_2'] == 0
qprod4_3_negative = data['qprod4_3'] == 0
two_of_three_negative = (
    (qprod4_3_negative & qprod4_1_negative)
    | (qprod4_2_negative & qprod4_1_negative)
    | (qprod4_3_negative & qprod4_2_negative)
)
still_undecided = data['defrauded'] == 0
# Written with .loc instead of row-by-row chained assignment.
data.loc[still_undecided & two_of_three_negative, 'defrauded'] = -2

In [11]:
# How many respondents are still undecided (target equal to 0).
print(int((data['defrauded'] == 0).sum()))

110


These 110 replies are either inconsistent or no data can be retrieved about them and as such we'll have to delete them:

In [12]:
# Discard the rows whose target is still 0, then renumber the rows from zero.
# reset_index keeps the old labels in an 'index' column.
data.drop(index=data.loc[data['defrauded'] == 0].index, inplace=True)
data.reset_index(drop=False, inplace=True)

In [13]:
# The old row labels saved by reset_index are not needed.
data.drop(columns='index', inplace=True)

Now we set the target variable back to zero whenever the respondent was not the victim of a fraud (that is, wherever it holds the temporary codes -1 or -2):

In [14]:
# Collapse the temporary "not defrauded" codes (-1 and -2) back to 0 so the
# target is binary. Written with .loc instead of row-by-row chained assignment.
data.loc[data['defrauded'].isin([-1, -2]), 'defrauded'] = 0

We can also now remove the columns that we "aggregated" in the target variable:

In [15]:
# The three qprod4 answers are now summarised by 'defrauded'; remove them.
data.drop(columns=['qprod4_1', 'qprod4_2', 'qprod4_3'], inplace=True)

In [16]:
# Number of columns left after building the target.
print(data.shape[1])

92


We see we are left with 92 features. Now we can try and discretise features that can be unified without loss of information or generalisation; we can also discretise a little bit some replies to limit the image space of each explanatory variable.

We can start discretising AREA5, referring to where each respondent lives:

In [17]:
# Merge area codes: 2 -> 1 (north) and 4 -> 5 (south + islands).
# Every other code, including the center, is left untouched.
# A whole-column replace avoids row-by-row chained assignment.
data['AREA5'] = data['AREA5'].replace({2: 1, 4: 5})

Now we also should discretise age ranges:

In [18]:
# Bin the age column (qd7) into seven ordinal bands:
#   1: 10-24 (the 18-24 range), 2: 25-34, 3: 35-44, 4: 45-54,
#   5: 55-64, 6: 65-74, 7: 75 and over.
# Fix: ages 80-84 and 90-94 previously fell into band 6 instead of band 7, and
# ages of 100 or more were left un-binned; everything from 75 up is now band 7.
# Values below 10 (e.g. any negative codes, if present) are left untouched, as before.
# Thresholds are compared against a snapshot of the raw ages, so the band codes
# written below are never re-read as ages; each pass overwrites the previous
# band for respondents who also clear the next, higher threshold.
raw_age = data['qd7'].copy()
band_lower_bounds = [10, 25, 35, 45, 55, 65, 75]
for band_code, lower_bound in enumerate(band_lower_bounds, start=1):
    data.loc[raw_age >= lower_bound, 'qd7'] = band_code

We also discretise education: no education, primary education, secondary education, university education

In [19]:
# Merge education codes: 4 -> 5 and 6 -> 7; other codes are unchanged.
# A whole-column replace avoids row-by-row chained assignment.
data['qd9'] = data['qd9'].replace({4: 5, 6: 7})

In Qf4 we can discretize some values that symbolise absence of data:

In [20]:
# Recode qf4: -97 is folded into -99, and -98 is mapped to 2.
# A whole-column replace avoids row-by-row chained assignment.
data['qf4'] = data['qf4'].replace({-97: -99, -98: 2})

Now we can also reduce the scope of values for qf8:

In [21]:
# Fold the -97 code of qf8 into -99.
# A whole-column replace avoids row-by-row chained assignment.
data['qf8'] = data['qf8'].replace({-97: -99})

We can also reduce some features through aggregation, for example by uniting different multiple choices of Qf9: this can be done by summing along rows and then reducing any result of 1 or more to 1.

In [22]:
# Row-wise total of the qf9_2 and qf9_3 indicators.
data['qf9_2_3'] = data.loc[:, ['qf9_2', 'qf9_3']].sum(axis='columns')

In [23]:
# Row-wise total of the qf9_7 and qf9_8 indicators.
data['qf9_7_8'] = data.loc[:, ['qf9_7', 'qf9_8']].sum(axis='columns')

In [24]:
# Row-wise total of the qf9_1 and qf9_9 indicators.
data['qf9_1_9'] = data.loc[:, ['qf9_1', 'qf9_9']].sum(axis='columns')

In [25]:
# The individual qf9 indicators are replaced by their merged columns.
data.drop(
    columns=['qf9_1', 'qf9_2', 'qf9_3', 'qf9_7', 'qf9_8', 'qf9_9'],
    inplace=True,
)

In [26]:
# Turn the merged qf9 totals back into indicators: any total of 1 or more becomes 1.
# Written with .loc instead of row-by-row chained assignment.
for qf9_merged_column in ('qf9_2_3', 'qf9_7_8', 'qf9_1_9'):
    data.loc[data[qf9_merged_column] >= 1, qf9_merged_column] = 1

Now we take care of some missing data in qprod2 and qprod3_x, where the absence of some data is symbolised already by a specific value. Then we can fill that value when the question was not answered directly. Basically we are substituting a NaN value, present but symbolising absence of information, with a value that already symbolised absence of information.

In [27]:
# Where qprod2 is NaN, write -99 into qprod2 and every qprod3_1 .. qprod3_18 column.
# Only qprod2 is tested: per the original analysis note, when it is NaN the whole
# qprod3_x group is NaN as well.
# One .loc assignment replaces nineteen row-by-row chained assignments.
qprod2_unanswered = data['qprod2'].isna()
qprod_skip_columns = ['qprod2'] + [f'qprod3_{k}' for k in range(1, 19)]
data.loc[qprod2_unanswered, qprod_skip_columns] = -99

In a similar fashion to qf9 now we can group some features of the qprod3_x question:

In [28]:
# Row-wise total of qprod3_1 and qprod3_16.
data['qprod3_1_16'] = data.loc[:, ['qprod3_1', 'qprod3_16']].sum(axis='columns')

In [29]:
# Row-wise total of qprod3_3 and qprod3_6.
data['qprod3_3_6'] = data.loc[:, ['qprod3_3', 'qprod3_6']].sum(axis='columns')

In [30]:
# Row-wise total of qprod3_5, qprod3_7, qprod3_12 and qprod3_14.
data['qprod3_5_7_12_14'] = data.loc[
    :, ['qprod3_5', 'qprod3_7', 'qprod3_12', 'qprod3_14']
].sum(axis='columns')

In [31]:
# Row-wise total of qprod3_13 and qprod3_15.
data['qprod3_13_15'] = data.loc[:, ['qprod3_13', 'qprod3_15']].sum(axis='columns')

In [32]:
# Normalise the merged qprod3 totals: positive totals become 1, negative totals
# (sums of -99 codes) become -99, and zero stays zero.
# Written with .loc instead of row-by-row chained assignment.
for qprod3_merged_column in (
    'qprod3_1_16',
    'qprod3_3_6',
    'qprod3_5_7_12_14',
    'qprod3_13_15',
):
    data.loc[data[qprod3_merged_column] > 0, qprod3_merged_column] = 1
    data.loc[data[qprod3_merged_column] < 0, qprod3_merged_column] = -99

In [33]:
# The individual qprod3 columns are replaced by their merged columns.
data.drop(
    columns=[
        'qprod3_1', 'qprod3_16',
        'qprod3_3', 'qprod3_6',
        'qprod3_5', 'qprod3_7', 'qprod3_12', 'qprod3_14',
        'qprod3_13', 'qprod3_15',
    ],
    inplace=True,
)

Now we reduce the scope of values for qf10:

In [34]:
# Fold the -97 code into -99 for every qf10_1 .. qf10_12 column.
# A per-column replace avoids twelve row-by-row chained assignments.
for qf10_column in [f'qf10_{k}' for k in range(1, 13)]:
    data[qf10_column] = data[qf10_column].replace({-97: -99})

The same is going to be done for qk1:

In [35]:
# Fold the -97 code of qk1 into -99.
# A whole-column replace avoids row-by-row chained assignment.
data['qk1'] = data['qk1'].replace({-97: -99})

Here we have to take into account that in qf12 the validity of any answer depends on qf11's result. So we have to account for inconsistent data by giving some form of precedence to qf11.

In [36]:
# qf11 takes precedence over the qf12 answers:
#   qf11 == -99 -> every qf12 column listed below is set to -99
#   qf11 == 0   -> every qf12 column listed below is set to 0
# Two .loc assignments replace the row-by-row chained assignments.
qf12_dependent_columns = [
    'qf12_1_a', 'qf12_1_b', 'qf12_1_c',
    'qf12_2_d',
    'qf12_3_e', 'qf12_3_f', 'qf12_3_g',
    'qf12_4_k', 'qf12_4_l',
    'qf12_5_m', 'qf12_5_o',
    'qf12_6_p', 'qf12_6_q',
    'qf12_7_r',
]
data.loc[data['qf11'] == -99, qf12_dependent_columns] = -99
data.loc[data['qf11'] == 0, qf12_dependent_columns] = 0

Now we can also unite some qf12 multiple choice questions, in similar fashion as to what has been done before:

In [37]:
# Row-wise total of qf12_1_c and qf12_3_g.
data['qf12_1c_3g'] = data.loc[:, ['qf12_1_c', 'qf12_3_g']].sum(axis='columns')

In [38]:
# Row-wise total of qf12_5_m, qf12_4_k, qf12_5_o and qf12_6_p.
data['qf12_4k_5m_5o_6p'] = data.loc[
    :, ['qf12_5_m', 'qf12_4_k', 'qf12_5_o', 'qf12_6_p']
].sum(axis='columns')

In [39]:
# Normalise the merged qf12 totals: positive totals become 1, negative totals
# become -99, and zero stays zero.
# Written with .loc instead of row-by-row chained assignment.
for qf12_merged_column in ('qf12_1c_3g', 'qf12_4k_5m_5o_6p'):
    data.loc[data[qf12_merged_column] > 0, qf12_merged_column] = 1
    data.loc[data[qf12_merged_column] < 0, qf12_merged_column] = -99

In [40]:
# The individual qf12 columns are replaced by their merged columns.
data.drop(
    columns=['qf12_5_m', 'qf12_4_k', 'qf12_5_o', 'qf12_6_p', 'qf12_1_c', 'qf12_3_g'],
    inplace=True,
)

Now we also aggregate some values in qf13:

In [41]:
# Recode qf13: -97 is folded into -99, and answer 2 is merged into answer 1.
# A whole-column replace avoids row-by-row chained assignment.
data['qf13'] = data['qf13'].replace({-97: -99, 2: 1})

In [42]:
# Bare expression: as the last statement of the cell it renders the frame preview.
data

Unnamed: 0,qf1,qf2,qf3_1,qf3_3,qf3_4,qf3_6,qf3_7,qf3_8,qf4,qf8,...,defrauded,qf9_2_3,qf9_7_8,qf9_1_9,qprod3_1_16,qprod3_3_6,qprod3_5_7_12_14,qprod3_13_15,qf12_1c_3g,qf12_4k_5m_5o_6p
0,2,0,0,0,0,0,0,1,2,6,...,0,0,0,0,-99.0,-99.0,-99.0,-99.0,0.0,0.0
1,2,1,0,0,0,1,0,0,1,6,...,1,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,0,0,0,1,1,-99,...,0,0,0,0,-99.0,-99.0,-99.0,-99.0,0.0,0.0
3,2,0,0,0,0,0,0,1,2,6,...,0,0,1,0,-99.0,-99.0,-99.0,-99.0,0.0,0.0
4,2,0,0,0,0,0,0,1,0,6,...,0,0,0,1,-99.0,-99.0,-99.0,-99.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2261,3,1,1,0,0,0,0,0,0,6,...,0,0,0,0,-99.0,-99.0,-99.0,-99.0,0.0,0.0
2262,1,0,1,0,0,0,0,0,1,6,...,0,0,0,0,-99.0,-99.0,-99.0,-99.0,0.0,0.0
2263,3,1,1,0,0,0,0,0,1,4,...,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0
2264,3,0,0,0,0,0,0,1,-99,6,...,0,0,0,1,-99.0,-99.0,-99.0,-99.0,0.0,0.0


Now we aggregate some values for all questions related to financial knowledge:

In [43]:
# Recode qk3: -97 is folded into -99, and answers 1 and 2 both become -1.
# Any other value is unchanged.
# A whole-column replace avoids row-by-row chained assignment.
data['qk3'] = data['qk3'].replace({-97: -99, 1: -1, 2: -1})

In [44]:
# Recode qk4: 0 is kept, -97 and -99 both become -99, every other value becomes -1.
# The non-response mask is computed before any write so the two assignments
# cannot interfere. Written with .loc instead of row-by-row chained assignment.
qk4_no_answer = data['qk4'].isin([-97, -99])
data.loc[~qk4_no_answer & (data['qk4'] != 0), 'qk4'] = -1
data.loc[qk4_no_answer, 'qk4'] = -99

In [45]:
# Recode qk5: 102 is kept, -97 and -99 both become -99, every other value becomes -1.
# The non-response mask is computed before any write so the two assignments
# cannot interfere. Written with .loc instead of row-by-row chained assignment.
qk5_no_answer = data['qk5'].isin([-97, -99])
data.loc[~qk5_no_answer & (data['qk5'] != 102), 'qk5'] = -1
data.loc[qk5_no_answer, 'qk5'] = -99

In [46]:
# Recode qk6: answers 2, 3 and 4 become -1, and -97 is folded into -99.
# Any other value is unchanged.
# A whole-column replace avoids row-by-row chained assignment.
data['qk6'] = data['qk6'].replace({2: -1, 3: -1, 4: -1, -97: -99})

The same is done for QK7_1 through QK7_3:

In [47]:
# Recode qk7_1 .. qk7_3: answer 0 becomes -1, and -97 is folded into -99.
# Any other value is unchanged.
# A per-column replace avoids row-by-row chained assignment.
for qk7_column in ('qk7_1', 'qk7_2', 'qk7_3'):
    data[qk7_column] = data[qk7_column].replace({0: -1, -97: -99})

Now that the light aggregation and preparation is complete we can export the file for further analysis.

In [48]:
# Write the prepared frame to disk; the row index is written as the first column.
data.to_csv(path_or_buf='clean_data.csv')