In [83]:
from sklearn.datasets import make_regression
import numpy as np
import pandas as pd
%matplotlib notebook
from matplotlib import pyplot as plt
import scipy.stats

In [84]:
# Define load_dataset, the helper used to read the Abalone data from abalone.csv.
def load_dataset(path):
    """Read a comma-separated file into a pandas DataFrame.

    The first line of the file is used as the column header.

    Args:
        path: Location of the CSV file; handed to ``pd.read_csv`` unchanged.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv(path, delimiter=',', header=0)

# Load the Abalone CSV (resolved relative to the working directory) into `dataset`.
dataset = load_dataset(path='abalone.csv')


In [85]:
# Preview the first five rows of the freshly loaded data.
print("Primeros 5 valores de la base de datos:")
dataset.head(n=5)

Primeros 5 valores de la base de datos:


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [86]:
# Report the (rows, columns) shape of the DataFrame under its heading.
print("Dimensiones de la base de datos:", dataset.shape, sep="\n")

Dimensiones de la base de datos:
(4177, 9)


In [87]:
# Count the missing values in each column.
print("Numero de valores null:", dataset.isnull().sum(), sep="\n")

Numero de valores null:
Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64


In [88]:
# Use describe() to look for inconsistencies in the data.
# The quartiles are spelled out explicitly; they are the same ones describe() uses by default.
print("Estadisticas sobre los atributos de la base de datos:")
dataset.describe(percentiles=[0.25, 0.5, 0.75])

Estadisticas sobre los atributos de la base de datos:


Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [89]:
# The summary statistics show a minimum height of 0, which is impossible for a real specimen.
# List the samples with that value so their index labels are known.
print("Muestras con altura en 0:")
dataset.loc[dataset['Height'] == 0]


Muestras con altura en 0:


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
1257,I,0.43,0.34,0.0,0.428,0.2065,0.086,0.115,8
3996,I,0.315,0.23,0.0,0.134,0.0575,0.0285,0.3505,6


In [90]:
# Remove every sample whose recorded height is 0 (an impossible measurement).
# The rows are selected by condition instead of by hard-coded index labels
# (previously 1257 and 3996), so the cell does not depend on one specific version
# of the CSV. It is also safe to re-run: once no zero-height rows remain, the
# selection is empty and drop() does nothing instead of raising KeyError for
# labels that are no longer present.
zero_height_index = dataset.index[dataset['Height'] == 0]
dataset.drop(index=zero_height_index, inplace=True)

In [91]:
# Confirm the removal: print the new shape, then repeat the zero-height query,
# which should now return an empty frame.
print("Dimensiones base de datos", dataset.shape, "Muestras con altura 0", sep="\n")
dataset.loc[dataset['Height'] == 0]


Dimensiones base de datos
(4175, 9)
Muestras con altura 0


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings


In [92]:
# According to the assignment statement, age equals the number of rings plus 1.5.
dataset['age'] = dataset['Rings'] + 1.5

In [93]:
# Show the first five rows again to check that the new `age` column is present.
print("Mostramos otra vez los 5 primeros valores para ver que el nuevo atributo llamado age se ha añadido correctamente")
dataset.head(n=5)

Mostramos otra vez los 5 primeros valores para ver que el nuevo atributo llamado age se ha añadido correctamente


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,age
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,16.5
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,8.5
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,10.5
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,11.5
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,8.5


In [94]:
# Rename the columns to lower-case snake_case to avoid problems with spaces and capital letters.
# The rename is applied in place on `dataset`.
dataset.rename(
    mapper={
        "Sex": "sex",
        "Length": "length",
        "Diameter": "diameter",
        "Height": "height",
        "Whole weight": "whole_weight",
        "Shucked weight": "shucked_weight",
        "Viscera weight": "viscera_weight",
        "Shell weight": "shell_weight",
        "Rings": "rings",
    },
    axis="columns",
    inplace=True,
)

In [95]:
# Show the first five rows once more to check that the columns carry their new names.
print("Mostramos otra vez los 5 primeros valores para ver que se han renombrado correctamente")
dataset.head(n=5)

Mostramos otra vez los 5 primeros valores para ver que se han renombrado correctamente


Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,age
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,16.5
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,8.5
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,10.5
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,11.5
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,8.5
