### Detailed Guide for Data Preprocessing
#### This notebook covers the four major steps of data preprocessing:
 - Handling Null values
 - Detecting Outliers
 - Feature Scaling
 - Encoding Categorical Features

In [1]:
#Importing Basic Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#Creating DataFrame
df=pd.read_csv('preprocessing data.csv')
df

Unnamed: 0,A,B,C,D,E,F,G,H
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male
1,32.0,5000.0,2.1,26.0,,Blue,No,Male
2,,6000.0,,125.0,,Black,Yes,
3,54.0,70000.0,,140.0,,Red,,
4,64.0,6500.0,3.6,24.0,574.0,,No,Male
5,46.0,,2.5,,556.0,Red,,Female
6,,,,,545.0,Blue,,Male
7,53.0,4500.0,,,586.0,Red,Yes,
8,,,4.2,26.0,,Black,Yes,Male
9,24.0,3200.0,2.3,25.0,546.0,,,


In [3]:
#Null Value Analysis
df.isnull().sum()

A    3
B    3
C    4
D    3
E    4
F    2
G    4
H    4
dtype: int64

In [4]:
df['A'].mean()

42.285714285714285

## Handling Missing Values

In [5]:
# Imputing mean for missing values

# df['A'].fillna(df['A'].mean())   # returns a new Series with the NaNs replaced by the mean; df['A'] itself stays unchanged unless the result is assigned back

In [6]:
#Imputing mean for missing values in column A and creating a new column for it
df['A'+'_mean'] = df['A'].fillna(df['A'].mean())
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_mean
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0
1,32.0,5000.0,2.1,26.0,,Blue,No,Male,32.0
2,,6000.0,,125.0,,Black,Yes,,42.285714
3,54.0,70000.0,,140.0,,Red,,,54.0
4,64.0,6500.0,3.6,24.0,574.0,,No,Male,64.0
5,46.0,,2.5,,556.0,Red,,Female,46.0
6,,,,,545.0,Blue,,Male,42.285714
7,53.0,4500.0,,,586.0,Red,Yes,,53.0
8,,,4.2,26.0,,Black,Yes,Male,42.285714
9,24.0,3200.0,2.3,25.0,546.0,,,,24.0


In [7]:
df['A'].median()

46.0

In [8]:
df['A'+'_median'] = df['A'].fillna(df['A'].median())
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_mean,A_median
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,23.0
1,32.0,5000.0,2.1,26.0,,Blue,No,Male,32.0,32.0
2,,6000.0,,125.0,,Black,Yes,,42.285714,46.0
3,54.0,70000.0,,140.0,,Red,,,54.0,54.0
4,64.0,6500.0,3.6,24.0,574.0,,No,Male,64.0,64.0
5,46.0,,2.5,,556.0,Red,,Female,46.0,46.0
6,,,,,545.0,Blue,,Male,42.285714,46.0
7,53.0,4500.0,,,586.0,Red,Yes,,53.0,53.0
8,,,4.2,26.0,,Black,Yes,Male,42.285714,46.0
9,24.0,3200.0,2.3,25.0,546.0,,,,24.0,24.0


#### Random Sample Imputation

In [9]:
#Imputing column B using random sample values:

b = df['B'].isnull().sum()
b

3

In [20]:
# Series.sample() randomly selects b values (here 3, the number of missing
# entries counted in the previous cell) from column B after its null values
# have been removed.
#
# random_state fixes the seed of the random draw; you can assign any
# integer to it.
#
# When random_state is not given, sample() picks any three values at random,
# so every run of the program may choose different values and therefore
# generate a different result.
#
# To avoid this we set random_state to a fixed number, so that every time the
# program is executed we get the same result.
# This is called 'reproducibility of the result'.


# In the code below the null values are dropped, and from the remaining values b (= 3) are chosen at random

random_sample=df['B'].dropna().sample(b, random_state=0) 
random_sample

# Try a different random_state (5, 50, 500), re-run the kernel, and observe the sample chosen.
# Then try using no random_state, re-run the code 2 to 3 times, and observe how the sample chosen changes.


9    3200.0
2    6000.0
1    5000.0
Name: B, dtype: float64

In [21]:
df['B']

0     8000.0
1     5000.0
2     6000.0
3    70000.0
4     6500.0
5        NaN
6        NaN
7     4500.0
8        NaN
9     3200.0
Name: B, dtype: float64

In [22]:
# The index of those 3 values is matched with null values in original dataframe

random_sample.index = df[df['B'].isnull()].index
random_sample

5    3200.0
6    6000.0
8    5000.0
Name: B, dtype: float64

In [23]:
# Now finally in those index values , the sample values so chosen are filled
df['B']=df['B'].fillna(random_sample)
df['B']

0     8000.0
1     5000.0
2     6000.0
3    70000.0
4     6500.0
5     3200.0
6     6000.0
7     4500.0
8     5000.0
9     3200.0
Name: B, dtype: float64

In [24]:
df['C']

0    1.1
1    2.1
2    NaN
3    NaN
4    3.6
5    2.5
6    NaN
7    NaN
8    4.2
9    2.3
Name: C, dtype: float64

#### Capturing the missingness

In [25]:
# Capturing the (missingness) NaN values with a new feature:
# get_dummies (1 for missing value and 0 for filled value) (Assignment)

df['C_NAN'] = np.where(df['C'].isnull(),1,0)
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_mean,A_median,C_NAN
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,23.0,0
1,32.0,5000.0,2.1,26.0,,Blue,No,Male,32.0,32.0,0
2,,6000.0,,125.0,,Black,Yes,,42.285714,46.0,1
3,54.0,70000.0,,140.0,,Red,,,54.0,54.0,1
4,64.0,6500.0,3.6,24.0,574.0,,No,Male,64.0,64.0,0
5,46.0,3200.0,2.5,,556.0,Red,,Female,46.0,46.0,0
6,,6000.0,,,545.0,Blue,,Male,42.285714,46.0,1
7,53.0,4500.0,,,586.0,Red,Yes,,53.0,53.0,1
8,,5000.0,4.2,26.0,,Black,Yes,Male,42.285714,46.0,0
9,24.0,3200.0,2.3,25.0,546.0,,,,24.0,24.0,0


In [26]:
df['D']

0     23.0
1     26.0
2    125.0
3    140.0
4     24.0
5      NaN
6      NaN
7      NaN
8     26.0
9     25.0
Name: D, dtype: float64

#### End of the distribution Imputation Method

In [30]:
df.D

0     23.000000
1     26.000000
2    125.000000
3    140.000000
4     24.000000
5    213.794598
6    213.794598
7    213.794598
8     26.000000
9     25.000000
Name: D, dtype: float64

In [27]:
# End of the distribution Imputation Method
#df['D']  or df.D   (mean + 3*SD)

eod = df.D.mean() + 3*df.D.std()    
eod
#Mean + 3*SD ... extreme value used... df['D']...df.D

213.79459826718278

In [28]:
# Filling with end of distribution (mean +- 3*SD)

df['D']=df['D'].fillna(eod)  #df['D']  ~ df.D
df['D']

0     23.000000
1     26.000000
2    125.000000
3    140.000000
4     24.000000
5    213.794598
6    213.794598
7    213.794598
8     26.000000
9     25.000000
Name: D, dtype: float64

In [31]:
df['E']

0    555.0
1      NaN
2      NaN
3      NaN
4    574.0
5    556.0
6    545.0
7    586.0
8      NaN
9    546.0
Name: E, dtype: float64

In [32]:
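#Arbitrary value imputation: every NaN in E is replaced by one fixed constant (586 here).
#Note: 586 already occurs in E (row 7), so imputed and genuine values cannot be told apart afterwards;
#a constant outside the observed range is the usual choice when the imputation should remain visible.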
df['E']=df['E'].fillna(586)
df['E']

0    555.0
1    586.0
2    586.0
3    586.0
4    574.0
5    556.0
6    545.0
7    586.0
8    586.0
9    546.0
Name: E, dtype: float64

In [33]:
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_mean,A_median,C_NAN
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,23.0,0
1,32.0,5000.0,2.1,26.0,586.0,Blue,No,Male,32.0,32.0,0
2,,6000.0,,125.0,586.0,Black,Yes,,42.285714,46.0,1
3,54.0,70000.0,,140.0,586.0,Red,,,54.0,54.0,1
4,64.0,6500.0,3.6,24.0,574.0,,No,Male,64.0,64.0,0
5,46.0,3200.0,2.5,213.794598,556.0,Red,,Female,46.0,46.0,0
6,,6000.0,,213.794598,545.0,Blue,,Male,42.285714,46.0,1
7,53.0,4500.0,,213.794598,586.0,Red,Yes,,53.0,53.0,1
8,,5000.0,4.2,26.0,586.0,Black,Yes,Male,42.285714,46.0,0
9,24.0,3200.0,2.3,25.0,546.0,,,,24.0,24.0,0


In [37]:
mod = df['F'].mode()[0]
mod

'Red'

In [36]:
df['F'].value_counts()     #only for categorical features

Red      4
Blue     2
Black    2
Name: F, dtype: int64

In [None]:
# df['F'].value_counts().sort_values(ascending=False)

In [None]:
# df['F'].value_counts().sort_values(ascending=False).index[0]

In [None]:
#Frequent category imputation for categorical column (mode)

# mode_color=df['F'].value_counts().sort_values(ascending=False).index[0]
# mode_color
# df['F'].value_counts().sort_values(ascending=False).index[0]

In [38]:
df['F']=df['F'].fillna(mod)
df['F']

#df['F'].mode()

0      Red
1     Blue
2    Black
3      Red
4      Red
5      Red
6     Blue
7      Red
8    Black
9      Red
Name: F, dtype: object

In [39]:
df['F'].value_counts()

Red      6
Blue     2
Black    2
Name: F, dtype: int64

In [40]:
df['G']

0    Yes 
1      No
2    Yes 
3     NaN
4      No
5     NaN
6     NaN
7    Yes 
8    Yes 
9     NaN
Name: G, dtype: object

In [41]:
df['G'].value_counts()

Yes     4
No      2
Name: G, dtype: int64

In [42]:
# Treat NaN as a new category
df['G']=df['G'].fillna('missing')
df['G']

0       Yes 
1         No
2       Yes 
3    missing
4         No
5    missing
6    missing
7       Yes 
8       Yes 
9    missing
Name: G, dtype: object

In [43]:
df['G'].value_counts()

Yes        4
missing    4
No         2
Name: G, dtype: int64

#### bfill & ffill

In [81]:
# Demonstrate backward fill (bfill) on a small, self-contained frame.
# The frame gets its own name (demo_df) so that it does not overwrite the
# main `df`, which the Feature Scaling and Encoding cells further down
# still index by columns such as 'A_median', 'F', 'G' and 'H'.
import pandas as pd

# Create a sample DataFrame with gaps in both columns
demo_df = pd.DataFrame({'A': [1, None, 3, None, 5],
                        'B': [None, 2, 3, None, 5]})

# bfill replaces each NaN with the next valid value below it in the same column
df_filled = demo_df.bfill()

# Print the filled DataFrame
print(df_filled)

     A    B
0  1.0  2.0
1  3.0  2.0
2  3.0  3.0
3  5.0  5.0
4  5.0  5.0


## Detecting Outliers

In [44]:
tips_df=sns.load_dataset('tips')
tips_df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


In [46]:
##IQR method of detecting outliers

def iqr_func(data):
    """Print the IQR fences of ``data`` and the values lying outside them.

    Q1 and Q3 are the 25th and 75th percentiles computed with the
    'midpoint' rule. A value is reported as an outlier when it is smaller
    than Q1 - 1.5*IQR or larger than Q3 + 1.5*IQR.

    Parameters
    ----------
    data : iterable of numbers (for example a pandas Series or a list)

    Returns
    -------
    None. The lower and upper fence are printed on one line, the list of
    outliers on the next.
    """
    def midpoint_percentile(q):
        # NumPy 1.22 renamed the `interpolation` keyword of np.percentile to
        # `method` and deprecated the old spelling. Prefer the new keyword and
        # fall back only on older NumPy versions that do not accept `method`.
        try:
            return np.percentile(data, q, method='midpoint')
        except TypeError:
            return np.percentile(data, q, interpolation='midpoint')

    Q1 = midpoint_percentile(25)
    Q3 = midpoint_percentile(75)
    IQR = Q3 - Q1
    lo = Q1 - 1.5 * IQR   # lower fence
    uo = Q3 + 1.5 * IQR   # upper fence

    # Keep the values in their original order, exactly as they appear in data
    outlier = [x for x in data if x > uo or x < lo]

    print(lo, uo)
    print('outlier in the dataset are:', outlier)
    
iqr_func(tips_df['total_bill'])

-2.9499999999999993 40.449999999999996
outlier in the dataset are: [48.27, 44.3, 41.19, 48.17, 50.81, 45.35, 40.55, 43.11, 48.33]


In [47]:
## Z score method of detecting outliers

def norm_func(data):
    """Print the values of ``data`` whose Z score lies outside [-3, 3].

    The Z score of a value is (value - mean) / standard deviation, where the
    mean and the standard deviation come from np.mean and np.std of ``data``.
    Two lines are printed: the Z scores of the flagged values, followed by
    the flagged values themselves. Nothing is returned.
    """
    centre = np.mean(data)
    spread = np.std(data)

    # Pair every value with its Z score, then keep only the pairs beyond +/-3
    scored = [(value, (value - centre) / spread) for value in data]
    flagged = [(value, score) for value, score in scored if score > 3 or score < -3]

    Z_scores = [score for _, score in flagged]
    outlier = [value for value, _ in flagged]

    print('Z scores of outlier in the dataset are: ', Z_scores)
    print('outlier in the dataset are: ', outlier)
        
norm_func(tips_df['total_bill'])

Z scores of outlier in the dataset are:  [3.2061655335197283, 3.194909533396294, 3.492067936654957, 3.2129191335937883]
outlier in the dataset are:  [48.27, 48.17, 50.81, 48.33]


## Feature Scaling

In [48]:
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_mean,A_median,C_NAN
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,23.0,0
1,32.0,5000.0,2.1,26.0,586.0,Blue,No,Male,32.0,32.0,0
2,,6000.0,,125.0,586.0,Black,Yes,,42.285714,46.0,1
3,54.0,70000.0,,140.0,586.0,Red,missing,,54.0,54.0,1
4,64.0,6500.0,3.6,24.0,574.0,Red,No,Male,64.0,64.0,0
5,46.0,3200.0,2.5,213.794598,556.0,Red,missing,Female,46.0,46.0,0
6,,6000.0,,213.794598,545.0,Blue,missing,Male,42.285714,46.0,1
7,53.0,4500.0,,213.794598,586.0,Red,Yes,,53.0,53.0,1
8,,5000.0,4.2,26.0,586.0,Black,Yes,Male,42.285714,46.0,0
9,24.0,3200.0,2.3,25.0,546.0,Red,missing,,24.0,24.0,0


In [50]:
# Maximum Absolute Scaling: x / max(|x|), computed separately for each column

from sklearn.preprocessing import MaxAbsScaler
mas=MaxAbsScaler()    # instantiating the class (creating the object mas to be used for this dataframe)
column = ['A_median','B','D','E']

mas.fit_transform(df[column])   # fit and transform in a single call

# mas.fit(df[column])       -> finds the maximum absolute value of each column and keeps it aside
# mas.transform(df[column]) -> takes that stored maximum and divides each value of the column by it

# fit_transform = fit followed by transform
#   .fit       : calculation of the maximum absolute value
#   .transform : applies the calculated value to all listed columns

array([[0.359375  , 0.11428571, 0.10757989, 0.94709898],
       [0.5       , 0.07142857, 0.12161205, 1.        ],
       [0.71875   , 0.08571429, 0.58467333, 1.        ],
       [0.84375   , 1.        , 0.65483413, 1.        ],
       [1.        , 0.09285714, 0.11225728, 0.97952218],
       [0.71875   , 0.04571429, 1.        , 0.94880546],
       [0.71875   , 0.08571429, 1.        , 0.93003413],
       [0.828125  , 0.06428571, 1.        , 1.        ],
       [0.71875   , 0.07142857, 0.12161205, 1.        ],
       [0.375     , 0.04571429, 0.11693467, 0.93174061]])

In [None]:
df['B']

In [51]:
# Min Max Scaler (X - Xmin)/ (Xmax - Xmin)...Xmin, Xmax 

from sklearn.preprocessing import MinMaxScaler
mms=MinMaxScaler()

data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

mms.fit_transform(data)

#fit stage: trace the xmax value and xmin value
#transform stage: apply the formula and get the scaled value

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [52]:
## Standardization (most commonly used): applies the Z score formula; for roughly normal data most scaled values fall between -3 and 3, but this range is not guaranteed
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
ss.fit_transform(data)

#Z = (x - mean)/sd 
#10 numerical features, 2 have outlier issues, 8 are ~ ND, then standardization & normalization

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [53]:
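## Normalization with sklearn's normalize(): each ROW (sample) is divided by its own
## L2 (Euclidean) norm, so every row ends up with length 1,
## e.g. [-1, 2] -> [-1, 2] / sqrt(5) = [-0.447, 0.894].
## Note: this is not mean normalization, (x - xmean) / (xmax - xmin), which works
## per column and is prone to outlier effects.
# normalize is a plain function, so we did not instantiate it as an object
# and we did not use fit_transform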

from sklearn.preprocessing import normalize
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
normalize(data)

array([[-0.4472136 ,  0.89442719],
       [-0.08304548,  0.99654576],
       [ 0.        ,  1.        ],
       [ 0.05547002,  0.99846035]])

In [59]:
## Robust scaling (most of the features are highly skewed use robust Scaler) (likely: -3 and +3)
# (x - Xmedian) / IQR   its just like replace mean with median and replace SD with IQR

from sklearn.preprocessing import RobustScaler

data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

rs = RobustScaler()

rs.fit_transform(data)

array([[-0.85714286, -0.85714286],
       [-0.28571429, -0.28571429],
       [ 0.28571429,  0.28571429],
       [ 1.42857143,  1.42857143]])

## Encoding the categorical variables

In [60]:
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_mean,A_median,C_NAN
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,23.0,0
1,32.0,5000.0,2.1,26.0,586.0,Blue,No,Male,32.0,32.0,0
2,,6000.0,,125.0,586.0,Black,Yes,,42.285714,46.0,1
3,54.0,70000.0,,140.0,586.0,Red,missing,,54.0,54.0,1
4,64.0,6500.0,3.6,24.0,574.0,Red,No,Male,64.0,64.0,0
5,46.0,3200.0,2.5,213.794598,556.0,Red,missing,Female,46.0,46.0,0
6,,6000.0,,213.794598,545.0,Blue,missing,Male,42.285714,46.0,1
7,53.0,4500.0,,213.794598,586.0,Red,Yes,,53.0,53.0,1
8,,5000.0,4.2,26.0,586.0,Black,Yes,Male,42.285714,46.0,0
9,24.0,3200.0,2.3,25.0,546.0,Red,missing,,24.0,24.0,0


In [61]:
# It creates a column for each class of that feature. These columns are filled with 0 and 1.
# This is exactly what OneHotEncoding also does

pd.get_dummies(df['F'])

# pd.get_dummies(df,columns=['F','G','H'])

Unnamed: 0,Black,Blue,Red
0,0,0,1
1,0,1,0
2,1,0,0
3,0,0,1
4,0,0,1
5,0,0,1
6,0,1,0
7,0,0,1
8,1,0,0
9,0,0,1


In [62]:
df.G

0       Yes 
1         No
2       Yes 
3    missing
4         No
5    missing
6    missing
7       Yes 
8       Yes 
9    missing
Name: G, dtype: object

In [63]:
from sklearn.preprocessing import OneHotEncoder   #importing
ohe = OneHotEncoder()  

In [65]:
df.G

0       Yes 
1         No
2       Yes 
3    missing
4         No
5    missing
6    missing
7       Yes 
8       Yes 
9    missing
Name: G, dtype: object

In [66]:
df.G.values

array(['Yes ', 'No', 'Yes ', 'missing', 'No', 'missing', 'missing',
       'Yes ', 'Yes ', 'missing'], dtype=object)

In [67]:
df.G.values.reshape(-1,1)

array([['Yes '],
       ['No'],
       ['Yes '],
       ['missing'],
       ['No'],
       ['missing'],
       ['missing'],
       ['Yes '],
       ['Yes '],
       ['missing']], dtype=object)

In [68]:
ohe.fit_transform(df.G.values.reshape(-1,1))

<10x3 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [69]:
ohe.fit_transform(df.G.values.reshape(-1, 1)).toarray()

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [70]:
# Using OneHotEncoder

from sklearn.preprocessing import OneHotEncoder   # importing
# Instantiating with default settings: no `drop` argument is passed, so no
# column is dropped and the result keeps one column per category.
ohe = OneHotEncoder()

# reshape(-1, 1) turns the 1-D values into a single-column 2-D array before
# encoding; .toarray() converts the sparse result into a dense NumPy array.
final=ohe.fit_transform(df.G.values.reshape(-1, 1)).toarray()  # fitting and transforming categories using OneHotEncoder
final

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [71]:
# Adding the one-hot columns to the original dataframe.
# The column order of `final` follows ohe.categories_[0] (the encoder's own
# ordering of the categories), so each column is named from that list
# instead of assuming that positions 0/1/2 mean No/Yes/missing.
for position, category in enumerate(ohe.categories_[0]):
    # strip() drops the trailing blank in 'Yes ' and capitalize() turns
    # 'missing' into 'Missing', which yields G_No, G_Yes and G_Missing.
    df['G_' + str(category).strip().capitalize()] = final[:, position]
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_mean,A_median,C_NAN,G_No,G_Yes,G_Missing
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,23.0,0,0.0,1.0,0.0
1,32.0,5000.0,2.1,26.0,586.0,Blue,No,Male,32.0,32.0,0,1.0,0.0,0.0
2,,6000.0,,125.0,586.0,Black,Yes,,42.285714,46.0,1,0.0,1.0,0.0
3,54.0,70000.0,,140.0,586.0,Red,missing,,54.0,54.0,1,0.0,0.0,1.0
4,64.0,6500.0,3.6,24.0,574.0,Red,No,Male,64.0,64.0,0,1.0,0.0,0.0
5,46.0,3200.0,2.5,213.794598,556.0,Red,missing,Female,46.0,46.0,0,0.0,0.0,1.0
6,,6000.0,,213.794598,545.0,Blue,missing,Male,42.285714,46.0,1,0.0,0.0,1.0
7,53.0,4500.0,,213.794598,586.0,Red,Yes,,53.0,53.0,1,0.0,1.0,0.0
8,,5000.0,4.2,26.0,586.0,Black,Yes,Male,42.285714,46.0,0,0.0,1.0,0.0
9,24.0,3200.0,2.3,25.0,546.0,Red,missing,,24.0,24.0,0,0.0,0.0,1.0


In [75]:
df1 = pd.get_dummies(df['G'])
# Cast to float so G_No keeps the same dtype as the G_Yes and G_Missing
# columns, which hold 0.0 / 1.0 values taken from the OneHotEncoder array.
df['G_No'] = df1['No'].astype(float)
# Displayed last so the cell actually shows it; a bare expression in the
# middle of a cell produces no output.
df1['No']

0    0
1    1
2    0
3    0
4    1
5    0
6    0
7    0
8    0
9    0
Name: No, dtype: uint8

In [77]:
df['H']

0      Male
1      Male
2       NaN
3       NaN
4      Male
5    Female
6      Male
7       NaN
8      Male
9       NaN
Name: H, dtype: object

In [78]:
# Using LabelEncoder
# Column H still contains NaN at this point. Encoding a mix of strings and
# float NaN leaves the handling of the missing entries to the installed
# scikit-learn version, so NaN is first turned into an explicit 'missing'
# category. The classes then sort as Female=0, Male=1, missing=2.
# NOTE(review): scikit-learn documents LabelEncoder for target labels;
# OrdinalEncoder is the feature-side counterpart - consider switching.

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
le_H=le.fit_transform(df['H'].fillna('missing'))
le_H

array([1, 1, 2, 2, 1, 0, 1, 2, 1, 2])

In [79]:
df['le_H']=le_H
df[['H','le_H']]

Unnamed: 0,H,le_H
0,Male,1
1,Male,1
2,,2
3,,2
4,Male,1
5,Female,0
6,Male,1
7,,2
8,Male,1
9,,2


In [80]:
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_mean,A_median,C_NAN,G_No,G_Yes,G_Missing,le_H
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,23.0,0,0.0,1.0,0.0,1
1,32.0,5000.0,2.1,26.0,586.0,Blue,No,Male,32.0,32.0,0,1.0,0.0,0.0,1
2,,6000.0,,125.0,586.0,Black,Yes,,42.285714,46.0,1,0.0,1.0,0.0,2
3,54.0,70000.0,,140.0,586.0,Red,missing,,54.0,54.0,1,0.0,0.0,1.0,2
4,64.0,6500.0,3.6,24.0,574.0,Red,No,Male,64.0,64.0,0,1.0,0.0,0.0,1
5,46.0,3200.0,2.5,213.794598,556.0,Red,missing,Female,46.0,46.0,0,0.0,0.0,1.0,0
6,,6000.0,,213.794598,545.0,Blue,missing,Male,42.285714,46.0,1,0.0,0.0,1.0,1
7,53.0,4500.0,,213.794598,586.0,Red,Yes,,53.0,53.0,1,0.0,1.0,0.0,2
8,,5000.0,4.2,26.0,586.0,Black,Yes,Male,42.285714,46.0,0,0.0,1.0,0.0,1
9,24.0,3200.0,2.3,25.0,546.0,Red,missing,,24.0,24.0,0,0.0,0.0,1.0,2
