In [1]:
# Importar las librerías a utilizar
import pandas as pd
import seaborn as sns
# Global plot appearance: white-grid background, "notebook" context,
# line width 2 and the "times" font family.
sns.set_style("whitegrid")
sns.set_context(
    "notebook",
    font_scale=1,
    rc={"lines.linewidth": 2, "font.family": ["times"]},
)

import matplotlib.pylab as plt
import numpy as np
from sklearn import metrics

In [2]:
# Read the data to analyse. A field made of a single blank is treated as
# missing (NaN), in addition to pandas' default NA markers.
smoke = pd.read_csv("Child smokers.csv", na_values=' ')
print('shape: ', smoke.shape)
# A notebook cell renders only its final expression, so the former bare
# `smoke.head()` call placed before this line never produced any output;
# it was dead code and has been removed.
smoke.tail()

shape:  (654, 5)


Unnamed: 0,Age (years),Height (cm),FEV (litres),Sex,Smoker
649,15,152,2.278,female,smoker
650,16,183,4.872,male,smoker
651,16,170,4.27,male,smoker
652,15,173,3.727,male,smoker
653,16,160,2.795,female,smoker


In [3]:
# Replace the categorical text columns with their integer category codes
# (in the displayed data: female -> 0, male -> 1; smoker -> 1).
for column in ('Sex', 'Smoker'):
    smoke[column] = smoke[column].astype('category').cat.codes
smoke

Unnamed: 0,Age (years),Height (cm),FEV (litres),Sex,Smoker
0,9,145,1.708,0,0
1,8,171,1.724,0,0
2,7,138,1.720,0,0
3,9,135,1.558,1,0
4,9,145,1.895,1,0
...,...,...,...,...,...
649,15,152,2.278,0,1
650,16,183,4.872,1,1
651,16,170,4.270,1,1
652,15,173,3.727,1,1


In [4]:
# Build a new frame without the rows that contain at least one missing
# value (NaN); `smoke` itself is left untouched.
# NOTE(review): the cells visible after this one keep working on `smoke`,
# not on `popNaN` - confirm which frame the analysis is meant to use.
popNaN = smoke.dropna(axis=0, how='any')

popNaN

Unnamed: 0,Age (years),Height (cm),FEV (litres),Sex,Smoker
0,9,145,1.708,0,0
1,8,171,1.724,0,0
2,7,138,1.720,0,0
3,9,135,1.558,1,0
4,9,145,1.895,1,0
...,...,...,...,...,...
649,15,152,2.278,0,1
650,16,183,4.872,1,1
651,16,170,4.270,1,1
652,15,173,3.727,1,1


In [5]:
# Print the minimum of every column, one line per column. Each pair holds
# the label to print and the column whose minimum is reported.
for caption, column in [
    ("Min en Age (years):", 'Age (years)'),
    ("Min en Height (cm):", 'Height (cm)'),
    ("Min en FEV (litres):", 'FEV (litres)'),
    ("MIN en Sex:", 'Sex'),
    ("MIN en Smoker:", 'Smoker'),
]:
    print(caption, smoke[column].min())

Min en Age (years): 3
Min en Height (cm): 117
Min en FEV (litres): 0.7909999999999999
MIN en Sex: 0
MIN en Smoker: 0


In [6]:
# Función que recibe un DataFrame, una proporción y el nombre de la clase,
# y genera cuatro conjuntos de datos para entrenamiento
# y pruebas del algoritmo de aprendizaje
from sklearn.model_selection import train_test_split

def split_label(pace, test_size, label, random_state=None):
    """Split a DataFrame into train/test feature and label sets.

    Parameters
    ----------
    pace : pandas.DataFrame
        The data to split. It must contain the column named ``label``.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.
    label : str
        Name of the target column; every other column is used as a feature.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. Pass an integer to obtain the
        same split on every run; the default (None) keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(train_X, train_Y, test_X, test_Y)``: feature frames and label
        series for the training and test partitions.
    """
    # Split the frame that was passed in. The previous version ignored
    # `pace` and always split the notebook-level `smoke` frame.
    train, test = train_test_split(
        pace, test_size=test_size, random_state=random_state
    )
    features = pace.columns.drop(label)
    return train[features], train[label], test[features], test[label]

In [7]:
# Split the data set: 'Smoker' is the target column and a test size of 0.2
# is requested.
train_X, train_Y, test_X, test_Y = split_label(smoke, test_size=0.2, label='Smoker')

In [9]:
# Aplicar one hot encoding a la columna 'Sex'
from sklearn.preprocessing import OneHotEncoder

# Encoder for the 'Sex' column; categories not seen during fit are ignored
# at transform time instead of raising.
one = OneHotEncoder(handle_unknown='ignore')

# fit_transform needs a 2-D input, hence the reshape to a single column.
result = one.fit_transform(train_X['Sex'].values.reshape(-1, 1)).toarray()
# Work on an explicit copy: a plain `train_X_1 = train_X` only creates a
# second name for the same object, so adding the indicator columns below
# would silently modify train_X as well.
train_X_1 = train_X.copy()
# Re-use train_X_1's index so the indicator rows align with the right rows.
train_X_1[['0', '1']] = pd.DataFrame(result, index=train_X_1.index)
train_X_1

Unnamed: 0,Age (years),Height (cm),FEV (litres),Sex,0,1
205,6,140,1.697,0,1.0,0.0
512,13,160,2.449,0,1.0,0.0
44,5,127,1.343,0,1.0,0.0
193,6,133,1.826,1,0.0,1.0
188,9,156,2.717,1,0.0,1.0
...,...,...,...,...,...,...
306,8,152,2.328,0,1.0,0.0
329,13,173,3.549,1,0.0,1.0
66,9,147,2.069,1,0.0,1.0
36,9,150,2.725,1,0.0,1.0


In [10]:
# Aplicar escalamiento [0,1] a todos los valores
from sklearn.preprocessing import MinMaxScaler
# Fit a min-max scaler on the training features and wrap the scaled array
# in a DataFrame that keeps the original column names. No index is passed,
# so the result gets a fresh 0..n-1 index rather than train_X_1's index.
min_max = MinMaxScaler()
train_X_2 = pd.DataFrame(
    min_max.fit_transform(train_X_1.values),
    columns=train_X_1.columns,
)
train_X_2

Unnamed: 0,Age (years),Height (cm),FEV (litres),Sex,0,1
0,0.1875,0.323944,0.186920,0.0,1.0,0.0
1,0.6250,0.605634,0.342067,0.0,1.0,0.0
2,0.1250,0.140845,0.113885,0.0,1.0,0.0
3,0.1875,0.225352,0.213534,1.0,0.0,1.0
4,0.3750,0.549296,0.397359,1.0,0.0,1.0
...,...,...,...,...,...,...
518,0.3125,0.492958,0.317103,0.0,1.0,0.0
519,0.6250,0.788732,0.569012,1.0,0.0,1.0
520,0.3750,0.422535,0.263668,1.0,0.0,1.0
521,0.3750,0.464789,0.399010,1.0,0.0,1.0


In [12]:
# Apply the same transformation (one-hot 'Sex' + min-max scaling) to the
# whole data set.
# NOTE(review): `one` and `min_max` are re-fitted here on the full data,
# which replaces whatever they learned from the training split - confirm
# that nothing later relies on the train-fitted state.

result = one.fit_transform(smoke['Sex'].values.reshape(-1, 1)).toarray()
# Copy before adding columns: `smoke_1 = smoke` would alias the original
# frame, so `smoke` itself would gain the '0'/'1' columns and re-running
# earlier cells (e.g. the train/test split) would pick them up as features.
smoke_1 = smoke.copy()
smoke_1[['0', '1']] = pd.DataFrame(result, index=smoke_1.index)
smoke_2 = min_max.fit_transform(smoke_1.values)
smoke_2 = pd.DataFrame(smoke_2, columns=smoke_1.columns)

In [13]:
# Uso de agrupamiento (clustering) con KMeans
from sklearn.cluster import KMeans
# K-means starts from random centroids, so without a seed the centres (and
# their order) can change on every run. A fixed random_state makes the
# result reproducible; n_init is pinned to 10 because the library default
# differs between scikit-learn versions.
clu = KMeans(n_clusters=3, n_init=10, random_state=0)
clu.fit(smoke_2)
# One row per cluster, one value per column of smoke_2.
clu.cluster_centers_

array([[ 4.38430060e-01,  5.71177062e-01,  4.04127635e-01,
         1.00000000e+00,  7.73809524e-02, -1.05471187e-15,
         1.00000000e+00],
       [ 3.97849462e-01,  4.84072896e-01,  3.17515288e-01,
         1.11022302e-15,  2.63677968e-16,  1.00000000e+00,
         1.11022302e-15],
       [ 6.41025641e-01,  6.61610690e-01,  4.34815817e-01,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00]])