# Tratar datos duplicados y perdidos

In [4]:
import pandas as pd
# Load only the first one million rows of the CSV (row limit written as an integer literal).
df = pd.read_csv('base_datos_2008.csv', nrows=1_000_000)

Vamos a empezar hablando de los casos duplicados. A no ser que se trate de casos muy específicos, en los que nos interese contabilizar cuántas veces se repite cada caso concreto, el procedimiento habitual es eliminarlos.

In [5]:
# Build a DataFrame with duplicated rows by stacking df on top of itself.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement and, like append, keeps the original index labels.
df_duplicate = pd.concat([df, df])

In [7]:
# Shuffle all rows: frac=1 draws 100% of the rows in random order.
# A fixed random_state makes the shuffled order reproducible across kernel
# restarts, which matters because any later "keep the first occurrence"
# de-duplication depends on row order.
df_duplicate = df_duplicate.sample(frac=1, random_state=42)

In [8]:
# Remove rows that are exact duplicates of an earlier row, keeping the
# first occurrence of each.
df_clean = df_duplicate.drop_duplicates(keep="first")

In [9]:
# Check whether the de-duplicated frame has as many rows as the original df.
# False means the original df already contained repeated rows of its own.
df_clean.shape[0] == df.shape[0]

False

In [10]:
# Number of rows left after de-duplication. In the recorded run this is
# 999996, i.e. four rows of the original df were repeats
# (assuming all one million requested rows were loaded - TODO confirm).
df_clean.shape[0]

999996

### Eliminación de duplicados en una columna específica

In [12]:
# subset restricts the duplicate check to the listed column(s): only one row
# (the first one encountered) is kept for each distinct DayofMonth value.
df_clean.drop_duplicates(subset=["DayofMonth"])

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
830468,2008,2,24,7,1857.0,1856,2044.0,2049,UA,664,...,4.0,16.0,0,,0,,,,,
635397,2008,2,11,1,957.0,950,1243.0,1245,WN,1480,...,4.0,12.0,0,,0,,,,,
925691,2008,2,7,4,845.0,845,845.0,854,EV,4715,...,5.0,18.0,0,,0,,,,,
700797,2008,2,6,3,650.0,700,724.0,730,XE,7759,...,13.0,8.0,0,,0,,,,,
851501,2008,2,4,1,945.0,950,1152.0,1210,US,1557,...,5.0,11.0,0,,0,,,,,
702002,2008,2,8,5,2011.0,2000,2149.0,2134,XE,7811,...,22.0,15.0,0,,0,0.0,0.0,4.0,0.0,11.0
747200,2008,2,29,5,1745.0,1745,1825.0,1824,YV,1006,...,6.0,9.0,0,,0,,,,,
845606,2008,2,27,3,1043.0,1041,1153.0,1154,UA,1548,...,5.0,19.0,0,,0,,,,,
991029,2008,2,12,2,,1745,,1835,MQ,4246,...,,,1,C,0,,,,,
476433,2008,1,1,2,,1925,,2155,9E,5951,...,,,1,A,0,,,,,


### Gestionar datos faltantes

In [13]:
# Drop every row that has at least one missing value.
# An empty result means there is no row without a missing value. Per the
# original note this is expected here: a cancelled flight has no arrival
# time, and a flight that has an arrival time was not cancelled.
df.dropna(how="any")

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay


In [14]:
# A less strict way to handle missing data: thresh is the minimum number of
# non-missing values a row must have to be kept - here 25 populated entries
# (out of 29 columns according to the original note - TODO confirm with df.shape).
df.dropna(axis=0, thresh=25)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0
6,2008,1,3,4,1937.0,1830,2037.0,1940,WN,509,...,3.0,7.0,0,,0,10.0,0.0,0.0,0.0,47.0
11,2008,1,3,4,1644.0,1510,1845.0,1725,WN,1333,...,6.0,8.0,0,,0,8.0,0.0,0.0,0.0,72.0
16,2008,1,3,4,1452.0,1425,1640.0,1625,WN,675,...,7.0,8.0,0,,0,3.0,0.0,0.0,0.0,12.0
18,2008,1,3,4,1323.0,1255,1526.0,1510,WN,4,...,4.0,9.0,0,,0,0.0,0.0,0.0,0.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999973,2008,2,23,6,1306.0,1235,1514.0,1415,MQ,4694,...,3.0,50.0,0,,0,31.0,0.0,28.0,0.0,0.0
999983,2008,2,28,4,1458.0,1235,1645.0,1415,MQ,4694,...,3.0,37.0,0,,0,0.0,0.0,7.0,0.0,143.0
999988,2008,2,4,1,1754.0,1655,2056.0,2005,MQ,4695,...,2.0,9.0,0,,0,0.0,0.0,0.0,0.0,51.0
999989,2008,2,5,2,1812.0,1655,2129.0,2005,MQ,4695,...,4.0,12.0,0,,0,10.0,0.0,7.0,0.0,67.0


In [15]:
# Recommended form: derive the threshold from the column count, so that
# "at most two missing values per row" still holds if the DataFrame gains columns.
# Original note: the columns with missing data tend to be missing together, as a block.
df.dropna(thresh=df.shape[1] - 2)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0
6,2008,1,3,4,1937.0,1830,2037.0,1940,WN,509,...,3.0,7.0,0,,0,10.0,0.0,0.0,0.0,47.0
11,2008,1,3,4,1644.0,1510,1845.0,1725,WN,1333,...,6.0,8.0,0,,0,8.0,0.0,0.0,0.0,72.0
16,2008,1,3,4,1452.0,1425,1640.0,1625,WN,675,...,7.0,8.0,0,,0,3.0,0.0,0.0,0.0,12.0
18,2008,1,3,4,1323.0,1255,1526.0,1510,WN,4,...,4.0,9.0,0,,0,0.0,0.0,0.0,0.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999973,2008,2,23,6,1306.0,1235,1514.0,1415,MQ,4694,...,3.0,50.0,0,,0,31.0,0.0,28.0,0.0,0.0
999983,2008,2,28,4,1458.0,1235,1645.0,1415,MQ,4694,...,3.0,37.0,0,,0,0.0,0.0,7.0,0.0,143.0
999988,2008,2,4,1,1754.0,1655,2056.0,2005,MQ,4695,...,2.0,9.0,0,,0,0.0,0.0,0.0,0.0,51.0
999989,2008,2,5,2,1812.0,1655,2129.0,2005,MQ,4695,...,4.0,12.0,0,,0,10.0,0.0,7.0,0.0,67.0


### Gestionar datos faltantes en una columna en concreto

In [16]:
# subset takes a list of column names to inspect: rows are dropped only when
# one of those columns is missing. The result has no NA in CancellationCode.
df.dropna(axis=0, how="any", subset=["CancellationCode"])

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
178,2008,1,3,4,,700,,830,WN,126,...,,,1,A,0,,,,,
373,2008,1,3,4,,1100,,1215,WN,1146,...,,,1,A,0,,,,,
399,2008,1,3,4,,905,,1025,WN,469,...,,,1,A,0,,,,,
401,2008,1,3,4,,1620,,1740,WN,618,...,,,1,C,0,,,,,
415,2008,1,3,4,,1930,,2035,WN,2528,...,,,1,A,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999913,2008,2,11,1,,855,,1210,MQ,4693,...,,,1,A,0,,,,,
999919,2008,2,17,7,,855,,1210,MQ,4693,...,,,1,B,0,,,,,
999961,2008,2,17,7,,1445,,1630,MQ,4694,...,,,1,B,0,,,,,
999971,2008,2,22,5,,1235,,1415,MQ,4694,...,,,1,B,0,,,,,
