In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import DBSCAN, KMeans

In [2]:
location = "okcupid_profiles.csv"
data = pd.read_csv(location)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   sex          59946 non-null  object 
 3   orientation  59946 non-null  object 
 4   body_type    54650 non-null  object 
 5   diet         35551 non-null  object 
 6   drinks       56961 non-null  object 
 7   drugs        45866 non-null  object 
 8   education    53318 non-null  object 
 9   ethnicity    54266 non-null  object 
 10  height       59943 non-null  float64
 11  income       59946 non-null  int64  
 12  job          51748 non-null  object 
 13  last_online  59946 non-null  object 
 14  location     59946 non-null  object 
 15  offspring    24385 non-null  object 
 16  pets         40025 non-null  object 
 17  religion     39720 non-null  object 
 18  sign         48890 non-null  object 
 19  smok

Variables numericas: age, height, income
Variables que sacamos por ahora: last_online, todos los essays
Variables con nulls=

In [4]:
#Para las variables categóricas
def completar_nodijo(columna):
    mask_null = columna.isnull()
    columna[mask_null] = "rather not say"
    

In [5]:
columnas_cat_a_modificar = ['status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity','job', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks']

In [6]:
for i in columnas_cat_a_modificar:
    completar_nodijo(data[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  columna[mask_null] = "rather not say"


In [7]:
data.drugs.value_counts()

never             37724
rather not say    14080
sometimes          7732
often               410
Name: drugs, dtype: int64

In [8]:
#Columnas que no vamos a utilizar ahora
columnas_no_utilizadas = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9']

In [9]:
data_essays = data[columnas_no_utilizadas]

In [10]:
data = data.drop(columnas_no_utilizadas, axis=1)

In [11]:
data.height[data.height.isnull()] = -1
#Seria más prolijo hacerlo con un fill na

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.height[data.height.isnull()] = -1


In [12]:
data.income.isnull().sum()

0

In [13]:
data.age.isnull().sum()

0

In [14]:
data_last_online = data.pop("last_online")

## **Vamos a separar el dataset en variables categóricas y nominales**
Revisar después cuando hagan el laburo en fino de las columnas

In [15]:
data_cat = data[columnas_cat_a_modificar]
data_cat_speaks = data_cat.pop("speaks")
data_cat_ethnicity = data_cat.pop("ethnicity")
data_cat_sign = data_cat.pop("sign")
data_cat_religion = data_cat.pop("religion")
data_nom = data[["age", "height", "income"]]

In [16]:
onehot = OneHotEncoder(sparse=False)

In [17]:
onehotfit = onehot.fit(data_cat)
onehotfit

OneHotEncoder(sparse=False)

In [18]:
data_cat_dummies = onehot.transform(data_cat)

In [19]:
data_cat_dummies = pd.DataFrame(data_cat_dummies, columns=onehot.get_feature_names(data_cat.columns))

In [20]:
onehot.get_feature_names(data_cat.columns)

array(['status_available', 'status_married', 'status_seeing someone',
       'status_single', 'status_unknown', 'sex_f', 'sex_m',
       'orientation_bisexual', 'orientation_gay', 'orientation_straight',
       'body_type_a little extra', 'body_type_athletic',
       'body_type_average', 'body_type_curvy', 'body_type_fit',
       'body_type_full figured', 'body_type_jacked',
       'body_type_overweight', 'body_type_rather not say',
       'body_type_skinny', 'body_type_thin', 'body_type_used up',
       'diet_anything', 'diet_halal', 'diet_kosher',
       'diet_mostly anything', 'diet_mostly halal', 'diet_mostly kosher',
       'diet_mostly other', 'diet_mostly vegan', 'diet_mostly vegetarian',
       'diet_other', 'diet_rather not say', 'diet_strictly anything',
       'diet_strictly halal', 'diet_strictly kosher',
       'diet_strictly other', 'diet_strictly vegan',
       'diet_strictly vegetarian', 'diet_vegan', 'diet_vegetarian',
       'drinks_desperately', 'drinks_not at all', 

### Ahora vamos a standarizar las variables nominales

In [21]:
scaler = StandardScaler()
data_nom_scaled = scaler.fit_transform(data_nom)
data_nom_scaled = pd.DataFrame(data_nom_scaled, columns=data_nom.columns)
data_nom_scaled

Unnamed: 0,age,height,income
0,-1.093898,1.666782,-0.205806
1,0.281370,0.424433,0.616021
2,0.598740,-0.072507,-0.205806
3,-0.988108,0.672903,-0.000341
4,-0.353369,-0.569446,-0.205806
...,...,...,...
59941,2.820327,-1.563325,-0.205806
59942,-0.882318,0.921372,-0.205806
59943,1.021900,0.672903,0.821475
59944,-0.564949,1.169842,-0.205806


In [22]:
data_nom_cat = pd.merge(data_cat_dummies,data_nom_scaled,how="inner", left_index=True, right_index=True)
data_nom_cat

Unnamed: 0,status_available,status_married,status_seeing someone,status_single,status_unknown,sex_f,sex_m,orientation_bisexual,orientation_gay,orientation_straight,...,pets_rather not say,smokes_no,smokes_rather not say,smokes_sometimes,smokes_trying to quit,smokes_when drinking,smokes_yes,age,height,income
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.093898,1.666782,-0.205806
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.281370,0.424433,0.616021
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.598740,-0.072507,-0.205806
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.988108,0.672903,-0.000341
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.353369,-0.569446,-0.205806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.820327,-1.563325,-0.205806
59942,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.882318,0.921372,-0.205806
59943,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.021900,0.672903,0.821475
59944,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.564949,1.169842,-0.205806


In [23]:
kmeans_model = KMeans()
model_fit = kmeans_model.fit(data_nom_cat)

In [24]:
model_fit.n_clusters

8

In [25]:
dbsscan_model = DBSCAN()
model2_fit = dbsscan_model.fit(data_nom_cat)

In [26]:
model2_fit.core_sample_indices_

array([ 1762,  1764,  1786,  2170,  2438,  2655,  2844,  3030,  3171,
        3372,  4225,  4375,  4839,  5855,  5973,  6022,  6203,  8700,
        8813,  9249,  9642, 11406, 11690, 12239, 13030, 15197, 15270,
       15564, 15855, 16011, 16494, 17731, 18455, 19102, 20219, 20287,
       20616, 20898, 20964, 22593, 23235, 23367, 23794, 24962, 25368,
       26675, 28880, 29080, 29307, 29671, 31711, 31873, 31937, 32400,
       32709, 34963, 35347, 35456, 36680, 36846, 37236, 38804, 39136,
       39324, 40830, 42588, 42774, 44602, 46117, 46297, 46490, 46513,
       46546, 46660, 46967, 47003, 48609, 50076, 50240, 50534, 51840,
       52183, 52399, 53566, 54384, 55165, 55195, 55276, 55450, 55768,
       55780, 56808, 58297, 59248, 59436, 59675])