# Tratamiento de datos faltantes

In [13]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [14]:
# Load the Titanic data from the CSV file into a DataFrame (this reads the file; nothing is exported).

df = pd.read_csv("titanic.csv")

In [15]:
# Count the missing values in every column.
df.isna().sum()

name          0
gender        0
age           2
class         0
embarked      0
country      81
ticketno    891
fare        916
sibsp       900
parch       900
survived      0
dtype: int64

In [16]:
# Keep only the rows whose country is missing.
datos_nulos_pais = df.loc[df['country'].isna()]

In [17]:
# Preview the first five rows that have no country.
datos_nulos_pais.head(n=5)

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
89,"Barber, Miss. Ellen Mary",female,27.0,1st,S,,19877.0,78.17,0.0,0.0,yes
117,"Bidois, Miss. Rosalie",female,46.0,1st,C,,17757.0,247.1006,0.0,0.0,yes
118,"Bing, Mr. Lee",male,32.0,3rd,S,,1601.0,56.0911,0.0,0.0,yes
121,"Birnbaum, Mr. Jakob",male,24.0,1st,C,,13905.0,26.0,0.0,0.0,no
131,"Bostandyeff, Mr. Guentcho",male,26.0,3rd,S,,349224.0,7.1711,0.0,0.0,no


In [18]:
# The simplest fix is to drop every row that has a missing value
# in any of the selected columns (here: country and age).
dataset_sin_nulos = df.dropna(how='any', subset=['country', 'age'])

In [19]:
# Re-count the missing values after dropping rows.
# Drawback of this approach: on small datasets it can leave us without data.
dataset_sin_nulos.isna().sum()

name          0
gender        0
age           0
class         0
embarked      0
country       0
ticketno    887
fare        911
sibsp       895
parch       895
survived      0
dtype: int64

### Rellenar los valores faltantes con información
Esta es una mejor alternativa: en lugar de borrar filas, se rellenan los datos faltantes con un valor por defecto. En este caso, a `country` se le pondrá por defecto el valor `desconocido`.

In [28]:
# Show the country column with its missing values replaced by 'desconocido'.
# The filled Series is only printed; df itself is not modified here.
print(df['country'].fillna(value='desconocido'))

0       United States
1       United States
2       United States
3             England
4              Norway
            ...      
2202          England
2203          England
2204          England
2205          England
2206          England
Name: country, Length: 2207, dtype: object


Se puede emplear un diccionario para indicar el valor por defecto de cada columna y pasarlo a `fillna` mediante el parámetro `value`.

In [29]:
# Default value per column, used to fill missing entries.
# ticketno holds numeric values, so its placeholder is the number 0 rather than
# the string '0'; a string would turn the column into mixed-type object data.
valores_por_defecto = {'country': 'desconocido', 'ticketno': 0}

In [31]:
# Fill the missing values of each column listed in the dict with its default,
# modifying df in place.
df.fillna(valores_por_defecto, inplace=True)

# Manejar datos faltantes con la mediana, media y moda

In [34]:
# Mean and median of the age column.
promedio, mediana = df['age'].mean(), df['age'].median()
# mode() returns a Series of values; take the entry labelled 0.
moda = df['age'].mode().loc[0]
print(promedio, mediana, moda)

30.444444444444397 29.0 22.0


In [35]:
# In this case the missing ages are replaced with the median.
# fillna returns a new Series, so it must be assigned back to the column;
# otherwise df keeps its missing ages and the replacement is lost.
df['age'] = df['age'].fillna(mediana)
df['age']

0       42.0
1       13.0
2       16.0
3       39.0
4       16.0
        ... 
2202    41.0
2203    40.0
2204    32.0
2205    20.0
2206    26.0
Name: age, Length: 2207, dtype: float64