In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import DBSCAN, KMeans

In [36]:
location = "okcupid_profiles.csv"
data = pd.read_csv(location)

In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   sex          59946 non-null  object 
 3   orientation  59946 non-null  object 
 4   body_type    54650 non-null  object 
 5   diet         35551 non-null  object 
 6   drinks       56961 non-null  object 
 7   drugs        45866 non-null  object 
 8   education    53318 non-null  object 
 9   ethnicity    54266 non-null  object 
 10  height       59943 non-null  float64
 11  income       59946 non-null  int64  
 12  job          51748 non-null  object 
 13  last_online  59946 non-null  object 
 14  location     59946 non-null  object 
 15  offspring    24385 non-null  object 
 16  pets         40025 non-null  object 
 17  religion     39720 non-null  object 
 18  sign         48890 non-null  object 
 19  smok

Variables numericas: age, height, income
Variables que sacamos por ahora: last_online, todos los essays
Variables con nulls=

In [38]:
#Para las variables categóricas
def completar_nodijo(columna):
    mask_null = columna.isnull()
    columna[mask_null] = "rather not say"
    

In [39]:
columnas_cat_a_modificar = ['status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity','job', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks']

In [40]:
for i in columnas_cat_a_modificar:
    completar_nodijo(data[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  columna[mask_null] = "rather not say"


In [41]:
data.drugs.value_counts()

never             37724
rather not say    14080
sometimes          7732
often               410
Name: drugs, dtype: int64

In [42]:
#Columnas que no vamos a utilizar ahora
columnas_no_utilizadas = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9']

In [43]:
data_essays = data[columnas_no_utilizadas]

In [44]:
data = data.drop(columnas_no_utilizadas, axis=1)

In [45]:
data.height[data.height.isnull()] = -1
#Seria más prolijo hacerlo con un fill na

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.height[data.height.isnull()] = -1


In [46]:
data.income.isnull().sum()

0

In [47]:
data.age.isnull().sum()

0

In [48]:
data_last_online = data.pop("last_online")

## **Vamos a separar el dataset en variables categóricas y nominales**
Revisar después cuando hagan el laburo en fino de las columnas

In [49]:
data_cat = data[columnas_cat_a_modificar]
data_cat_speaks = data_cat.pop("speaks")
data_cat_ethnicity = data_cat.pop("ethnicity")
data_cat_sign = data_cat.pop("sign")
data_cat_religion = data_cat.pop("religion")
data_nom = data[["age", "height", "income"]]

In [50]:
onehot = OneHotEncoder(sparse=False)

In [51]:
onehotfit = onehot.fit(data_cat)

In [52]:
data_cat.columns

Index(['status', 'sex', 'orientation', 'body_type', 'diet', 'drinks', 'drugs',
       'education', 'job', 'location', 'offspring', 'pets', 'smokes'],
      dtype='object')

In [53]:
onehotfit.categories_

[array(['available', 'married', 'seeing someone', 'single', 'unknown'],
       dtype=object),
 array(['f', 'm'], dtype=object),
 array(['bisexual', 'gay', 'straight'], dtype=object),
 array(['a little extra', 'athletic', 'average', 'curvy', 'fit',
        'full figured', 'jacked', 'overweight', 'rather not say', 'skinny',
        'thin', 'used up'], dtype=object),
 array(['anything', 'halal', 'kosher', 'mostly anything', 'mostly halal',
        'mostly kosher', 'mostly other', 'mostly vegan',
        'mostly vegetarian', 'other', 'rather not say',
        'strictly anything', 'strictly halal', 'strictly kosher',
        'strictly other', 'strictly vegan', 'strictly vegetarian', 'vegan',
        'vegetarian'], dtype=object),
 array(['desperately', 'not at all', 'often', 'rarely', 'rather not say',
        'socially', 'very often'], dtype=object),
 array(['never', 'often', 'rather not say', 'sometimes'], dtype=object),
 array(['college/university', 'dropped out of college/university',
  

In [54]:
data_cat_dummies = onehot.transform(data_cat)

In [55]:
data_cat_dummies

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [56]:
data_cat_dummies = pd.DataFrame(data_cat_dummies, columns=onehot.get_feature_names(data_cat.columns))

In [57]:
onehot.get_feature_names(data_cat.columns)

array(['status_available', 'status_married', 'status_seeing someone',
       'status_single', 'status_unknown', 'sex_f', 'sex_m',
       'orientation_bisexual', 'orientation_gay', 'orientation_straight',
       'body_type_a little extra', 'body_type_athletic',
       'body_type_average', 'body_type_curvy', 'body_type_fit',
       'body_type_full figured', 'body_type_jacked',
       'body_type_overweight', 'body_type_rather not say',
       'body_type_skinny', 'body_type_thin', 'body_type_used up',
       'diet_anything', 'diet_halal', 'diet_kosher',
       'diet_mostly anything', 'diet_mostly halal', 'diet_mostly kosher',
       'diet_mostly other', 'diet_mostly vegan', 'diet_mostly vegetarian',
       'diet_other', 'diet_rather not say', 'diet_strictly anything',
       'diet_strictly halal', 'diet_strictly kosher',
       'diet_strictly other', 'diet_strictly vegan',
       'diet_strictly vegetarian', 'diet_vegan', 'diet_vegetarian',
       'drinks_desperately', 'drinks_not at all', 

### Ahora vamos a standarizar las variables nominales

In [58]:
scaler = StandardScaler()
data_nom_scaled = scaler.fit_transform(data_nom)
data_nom_scaled = pd.DataFrame(data_nom_scaled, columns=data_nom.columns)
data_nom_scaled

Unnamed: 0,age,height,income
0,-1.093898,1.666782,-0.205806
1,0.281370,0.424433,0.616021
2,0.598740,-0.072507,-0.205806
3,-0.988108,0.672903,-0.000341
4,-0.353369,-0.569446,-0.205806
...,...,...,...
59941,2.820327,-1.563325,-0.205806
59942,-0.882318,0.921372,-0.205806
59943,1.021900,0.672903,0.821475
59944,-0.564949,1.169842,-0.205806


In [59]:
data_nom_cat = pd.merge(data_cat_dummies,data_nom_scaled,how="inner", left_index=True, right_index=True)
data_nom_cat

Unnamed: 0,status_available,status_married,status_seeing someone,status_single,status_unknown,sex_f,sex_m,orientation_bisexual,orientation_gay,orientation_straight,...,pets_rather not say,smokes_no,smokes_rather not say,smokes_sometimes,smokes_trying to quit,smokes_when drinking,smokes_yes,age,height,income
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.093898,1.666782,-0.205806
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.281370,0.424433,0.616021
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.598740,-0.072507,-0.205806
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.988108,0.672903,-0.000341
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.353369,-0.569446,-0.205806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.820327,-1.563325,-0.205806
59942,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.882318,0.921372,-0.205806
59943,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.021900,0.672903,0.821475
59944,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.564949,1.169842,-0.205806


In [60]:
kmeans_model = KMeans()
model_fit = kmeans_model.fit(data_nom_cat)

In [61]:
model_fit.n_clusters

8

In [62]:
#dbsscan_model = DBSCAN()
#model2_fit = dbsscan_model.fit(data_nom_cat)

In [63]:
model2_fit.core_sample_indices_

In [80]:
#obtener un conjunto unico labels por cada etnicity 
#data["ethnicity"].value_counts()
enthinicity_labels = set()

#quiero lista unica de elementos x eso uso set()
for label in data["ethnicity"]:
    etnias = label.split(',')
    # para que no se repita el nombre si o si debo remover espacios
    for i in etnias:
        etnia = i.strip()

        enthinicity_labels.add(etnia)
    
# enthinicity_labels.remove('rather not say') ?    para trabajar c lista
enthinicity_labels = list(enthinicity_labels)
enthinicity_labels 

['native american',
 'black',
 'white',
 'asian',
 'indian',
 'middle eastern',
 'rather not say',
 'pacific islander',
 'hispanic / latin',
 'other']

In [90]:
# Obtener columnas indicadoras por cada valiable categorica 

total_puntos = len(data["ethnicity"])


ethni_cols = {}   

for etnia in enthinicity_labels:
    ethni_cols[etnia] = [0.0 for i in range(total_puntos)]
    
for indice in range(total_puntos):
    etnias = data["ethnicity"][indice]
    for etnia in etnias.split(','):
        ethni_cols[etnia.strip()][indice] = 1.0
    
# add to DF
data_nom_cat = pd.merge(data_nom_cat, pd.DataFrame(ethni_cols),how="inner", left_index=True, right_index=True)



In [91]:
data_nom_cat

Unnamed: 0,status_available,status_married,status_seeing someone,status_single,status_unknown,sex_f,sex_m,orientation_bisexual,orientation_gay,orientation_straight,...,native american,black,white,asian,indian,middle eastern,rather not say,pacific islander,hispanic / latin,other
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
59942,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
59943,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
59944,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
