In [58]:
import pandas as pd
import numpy as np

# Feature Engineering and Data Cleaning
* Data description taken from https://github.com/rudeboybert/JSE_OkCupid/blob/master/okcupid_codebook.txt
* Note: Missing Data is blank

In [59]:
# Load the OkCupid profiles once, then take an independent deep copy for the
# "bias" cleaning track, instead of parsing the same CSV from disk twice.
# NOTE(review): index_col=0 turns the first CSV column into the index; the
# df.head() output in this notebook shows that index is named "age" - confirm
# that using age as the (non-unique) index is intended.
df = pd.read_csv("okcupidprofiles.csv", index_col = 0)
df_bias = df.copy()

  interactivity=interactivity, compiler=compiler, result=result)


## Drop last_online since it is effectively numeric, even though there are only 24 different dates.

In [60]:
# Remove the last_online column from both working frames.
df = df.drop(columns=["last_online"])
df_bias = df_bias.drop(columns=["last_online"])

In [61]:
# Fraction of profiles whose income is recorded as -1.
# This used to be a bare expression in the middle of the cell, where a notebook
# never displays it; compute it explicitly and print it so the figure backing
# the decision below is visible on every run.
income_unreported_share = (df_bias["income"] == -1).mean()
print("Share of profiles with income == -1: ", income_unreported_share)
# The author judged this share (noted as about 37%, i.e. "rather not say") too
# large for the column to be useful, so income is removed from both frames.
df = df.drop(["income"], axis = 1)
df_bias = df_bias.drop(["income"], axis = 1)

In [62]:
# Number of rows in the working frame - we have 60743 profiles.
df.shape[0]

60743

In [63]:
# Preview the first five rows of the working frame.
df.head(n=5)

Unnamed: 0_level_0,body_type,diet,drinks,drugs,education,ethnicity,height,job,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
22,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75,transportation,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single
35,average,mostly other,often,sometimes,working on space camp,white,70,hospitality / travel,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
38,thin,anything,socially,,graduated from masters program,,68,,"san francisco, california",,straight,has cats,,m,pisces but it doesn&rsquo;t matter,no,"english, french, c++",available
23,thin,vegetarian,socially,,working on college/university,white,71,student,"berkeley, california",doesn&rsquo;t want kids,straight,likes cats,,m,pisces,no,"english, german (poorly)",single
29,athletic,,socially,never,graduated from college/university,"asian, black, other",66,artistic / musical / writer,"san francisco, california",,straight,likes dogs and likes cats,,m,aquarius,no,english,single


# Percentage of Na's in Each Predictor

In [64]:
# Percentage of missing values per column of df, keeping only the columns
# that have at least one missing value, ordered from most to least missing.
cleaned = df.isnull().sum() / len(df) * 100
cleaned = cleaned[cleaned != 0].sort_values(ascending=False)
print(cleaned)
print(len(cleaned))

offspring      59.857103
diet           41.374315
religion       34.614688
pets           34.109280
drugs          24.473602
sign           19.518298
job            14.808291
education      12.212107
ethnicity      10.662957
smokes         10.391321
body_type       9.500683
drinks          6.180136
speaks          1.399338
status          1.317024
height          1.317024
sex             1.317024
orientation     1.315378
location        1.315378
dtype: float64
18


# Let's fix the Na's in each of the predictors
# Since missing data means the field was left blank, we will first try dropping every row that has any missing value

In [65]:
# Keep only the profiles that have a value in every column, then compare
# the row counts before and after.
df_no_na = df.dropna()
print("Number of rows in data without any Na's: ", df_no_na.shape[0], " and with Na's: ", df.shape[0])

Number of rows in data without any Na's:  7125  and with Na's:  60743


### By removing all the Na's we are now missing 88% of the original data set. This may be a problem.
#### Solution 1: Another way to fix the Na's is to set up an extra category, such as "None", in place of the missing (blank) values.
#### Solution 2: A combination of solution 1 and removing all na's based on what I perceive as "Other: blank"

### Solution 1

In [66]:
# Solution 1: replace every missing value, in every column, with the
# literal string "None".
df_filled = df.fillna(value="None")

### Solution 2

In [67]:
# Solution 2, step 1: give the three columns with the most missing values an
# explicit placeholder category. Columns not named here are left untouched.
df_bias = df_bias.fillna(
    value={
        "offspring": "None",
        "diet": "Other: blank",
        "religion": "None",
    }
)

In [68]:
# The pets answers only cover dogs and cats (there is no choice for other
# pets), so missing answers get their own "None" category.
# NOTE(review): the counts displayed below include one "single" entry, which
# looks like a status value - possibly a misaligned row in the CSV; confirm.
df_bias["pets"] = df_bias["pets"].where(df_bias["pets"].notnull(), "None")
df_bias["pets"].value_counts()

None                               20719
likes dogs and likes cats          14813
likes dogs                          7224
likes dogs and has cats             4313
has dogs                            4133
has dogs and likes cats             2333
likes dogs and dislikes cats        2029
has dogs and has cats               1474
has cats                            1406
likes cats                          1063
has dogs and dislikes cats           552
dislikes dogs and likes cats         240
dislikes dogs and dislikes cats      196
dislikes cats                        122
dislikes dogs and has cats            81
dislikes dogs                         44
single                                 1
Name: pets, dtype: int64

In [69]:
# Drop the profiles that have no answer for drugs.
# This is a rather interesting column: the only choices are never, sometimes
# and often, so people may leave it blank rather than pick an answer, since
# drug use is generally perceived as bad?
df_bias = df_bias.dropna(axis="index", subset=["drugs"])

In [70]:
# Missing star signs become their own "None" category.
df_bias["sign"] = df_bias["sign"].fillna(value="None")

In [71]:
# job already offers most of the possible choices, so rows where it is
# missing are simply removed.
df_bias = df_bias.dropna(axis="index", subset=["job"])

In [72]:
# Re-check which columns of df_bias still contain missing values.
# Note: the denominator is len(df) (the full frame), not len(df_bias), so the
# figures are percentages of the original row count.
cleaned = df_bias.isnull().sum() / len(df) * 100
cleaned = cleaned[cleaned != 0].sort_values(ascending=False)
print(cleaned)
print(len(cleaned))

ethnicity      4.695191
body_type      4.495991
education      4.377459
smokes         2.643926
drinks         1.149104
speaks         0.041157
status         0.004939
sex            0.004939
orientation    0.003293
location       0.003293
dtype: float64
10


In [73]:
# ethnicity already offers an "other" choice, so rows where it is missing
# are simply removed.
df_bias = df_bias.dropna(axis="index", subset=["ethnicity"])

In [74]:
# body_type has a "rather not say" choice, which probably plays the role of a
# blank answer, so rows where it is missing are removed as well.
df_bias = df_bias.dropna(axis="index", subset=["body_type"])

In [75]:
# Missing education answers become their own "None" category.
df_bias["education"] = df_bias["education"].fillna(value="None")

In [76]:
# Same reasoning as for the drugs column: drop the rows with no answer.
df_bias = df_bias.dropna(axis="index", subset=["smokes"])

In [77]:
# The remaining columns have only a small percentage of missing values, so
# drop every row that still has a missing value in any column.
df_bias = df_bias.dropna()

## Check to see if we have cleaned every column

In [82]:
# Final check: list any columns of df_bias that still contain missing values.
# Note: the denominator is len(df) (the full frame), not len(df_bias).
cleaned = df_bias.isnull().sum() / len(df) * 100
cleaned = cleaned[cleaned != 0].sort_values(ascending=False)
print(cleaned)
print(len(cleaned))
# An empty result means every column is clean - yes, we have finished our
# feature engineering.

Series([], dtype: float64)
0


In [84]:
#age, height (inches) and income are numeric, so we will alter them before we play around with association rules in R (note: income was already dropped above)

In [85]:
# Write the three cleaned variants out as UTF-8 encoded CSV files.
for frame, path in (
    (df_no_na, "no_na.csv"),
    (df_filled, "filled.csv"),
    (df_bias, "bias.csv"),
):
    frame.to_csv(path, encoding='utf-8')